I am trying to implement the Levenberg-Marquardt algorithm in MATLAB, but my code does not converge: after the first iteration the error keeps getting larger. I have checked the Jacobian, the matrix inverse, and so on, but I cannot figure out where the problem is.
I compute the Jacobian in two ways: numerically, by finite differences, and analytically, from the partial-derivative formulas via matrix operations.
I build the inverse through a Cholesky decomposition. I also recomputed the matrices and the partial derivatives in an Excel sheet using the same formulas and checked them against the values produced by the code.
After computing the Jacobian, finding delta_value, and updating the weights, the code finds a lower residual error at first, but from the second or third iteration on the residual norm grows larger and larger.
Is there something I am not seeing?
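For reference, the update I am trying to implement is the standard Levenberg-Marquardt step (a minimal sketch in my own notation; J is the Jacobian of the residual vector e with respect to the parameter vector p):
JTJ = J' * J; % Gauss-Newton approximation of the Hessian
delta = (JTJ + lambda * eye(size(JTJ))) \ (J' * e); % damped normal equations
p = p - delta; % apply the step; increase lambda and retry if the error grows
The full script is below.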
clear; clc;
% Define hyperparameters
epoch = 10; % Maximum number of epochs
tol = 0.02; % Tolerance for convergence
lambda = 0.01; % Initial regularization parameter
layer_number = 3; % number of layers (input and output layers included)
input_size= 1; % Number of input features
hidden_size = 15; % Number of neurons in the hidden layer
output_size = 1; % Number of outputs
num_runs = 5; % Number of runs to calculate average
number_of_sample=46; % Number of samples
% Create training data for problem one: sine wave y = 1/2 + 1/4*sin(3*pi*x)
input_vals = linspace(-1, 1, number_of_sample)'; % 46 inputs evenly spaced in the interval [-1, 1]
target_vals = 0.5 + 0.25 * sin(3 * pi * input_vals); % Target output according to given formula
% Run the algorithm 5 times for calculating average error to create plot
errors_of_epochs_for_average_5_run = zeros(epoch,1);
for run = 1:num_runs
W1 = rand(hidden_size,input_size);
b1 = rand(hidden_size,1);
W2 = rand(output_size,hidden_size);
b2 = rand(output_size,1);
parameterVector= [W1(:);b1(:);W2(:);b2]; %parameter vector for jacobian matrix;
errors_of_epochs = zeros(epoch,1); % to store error values of each epoch
for epoch_number = 1:epoch
% Initialize matrix to store error vectors for each run
output_vals=zeros(number_of_sample,output_size); %outputs for all samples
errors_for_each=zeros(number_of_sample,output_size); %error values for each input output value
%forwardpass loop
for i=1:number_of_sample
output_vals(i,output_size) = forward_pass(input_vals(i,input_size), W1, b1, W2, b2);
end
errors_for_each = target_vals - output_vals;
error_norm = norm(errors_for_each);
errors_of_epochs(epoch_number,1) = error_norm; % calculate average sum of square for each epoch
parameterVector= [W1(:);b1(:);W2(:);b2]; % recreate paramVector with new weights
Jaco=jacobian(parameterVector, target_vals, input_vals,hidden_size); % calculate jacobian
while true % to calculate delta again when error is bigger than previous one
JTJ=Jaco'*Jaco;
I=eye(size(JTJ));
JTJ_Lambda_Eye=JTJ+lambda*I;
JTe=Jaco'*errors_for_each;
try
[R,Flag]=chol(JTJ_Lambda_Eye);
if Flag
disp('Matrix must be symmetric and positive definite.');
end
% JTJLEyeInverse=inv(JTJ_Lambda_Eye);
L = chol(JTJ_Lambda_Eye, 'lower');
n=size(JTJ_Lambda_Eye,1);
I=eye(n);
Y=zeros(n);
% Solve for Y using forward substitution
for i = 1:n
Y(:, i) = L \ I(:, i);
end
% Now solve L^T * X = Y for X
JTJLEyeInverse = L' \ Y;
catch
disp('Cholesky factorization failed.');
break;
end
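% The same step could also be computed directly, without the explicit
% inverse, as deltaValue = JTJ_Lambda_Eye \ JTe; (usually more accurate)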
deltaValue=JTJLEyeInverse*JTe;
parameterVectorTest=parameterVector;
parameterVectorTest=parameterVectorTest-deltaValue;
% check whether the new parameters give a lower error
W1_test = reshape(parameterVectorTest(1:hidden_size * input_size), hidden_size, input_size);
b1_test = reshape(parameterVectorTest(hidden_size * input_size + 1:hidden_size * input_size + hidden_size),hidden_size, input_size);
W2_test = reshape(parameterVectorTest(hidden_size * input_size + hidden_size + 1:end - 1), output_size, hidden_size);
b2_test = parameterVectorTest(end);
% stop if the parameter step is too small to make a difference
if norm(deltaValue) < tol
disp("Parameter changes are too small to continue");
break;
end
%test feedforward
test_output_vals=zeros(number_of_sample,output_size);
for i=1:number_of_sample
test_output_vals(i,output_size) = forward_pass(input_vals(i,input_size), W1_test, b1_test, W2_test, b2_test);
end
test_errors_for_each = target_vals - test_output_vals;
test_error_norm = norm(test_errors_for_each);
if test_error_norm > errors_of_epochs(epoch_number,1)
lambda=lambda*10;
% if lambda>1000
% lambda=1000;
% end
else
break;
end
end % end of retry loop: increase lambda while the error grows
if test_error_norm <= errors_of_epochs(epoch_number,1)
W1=W1_test;
b1=b1_test;
W2=W2_test;
b2=b2_test;
lambda=max(lambda/10, 1e-7);
end
if test_error_norm <= tol
disp(['Converged at epoch ', num2str(epoch_number), ' in run ', num2str(run)]);
break;
end
end
errors_of_epochs_for_average_5_run = errors_of_epochs_for_average_5_run + errors_of_epochs; %to calculate average of 5 run
end
% Plot the results for inspection
% Average error across all runs
avg_epoch_errors = errors_of_epochs_for_average_5_run / num_runs;
% Network output over all samples with the final weights
plot_target = zeros(number_of_sample, 1);
for i_test = 1:number_of_sample
X_test = input_vals(i_test);
plot_target(i_test) = forward_pass(X_test, W1, b1, W2, b2); % forward pass to get the current output
end
figure;
hold on;
%plot(1:biggest_last_epoch_number, avg_epoch_errors(1:biggest_last_epoch_number), '-o');
plot(input_vals, target_vals, 'r-o');
plot(input_vals, plot_target, 'b-o');
xlabel('Input x');
ylabel('Output y');
title('Target vs. network output');
grid on;
hold off;
% Define forward pass function
function [outputf,A1] = forward_pass(X, W1, b1, W2, b2)
Z1 = (W1 * X) + b1; % Linear combination for hidden layer
A1 = sigmoid(Z1); % Sigmoid activation for hidden layer
Z2 = (W2 * A1) + b2; % Linear combination for output layer
outputf = Z2; % Output layer
end
% Sigmoid activation function
function cikti = sigmoid(x)
cikti = 1 ./ (1 + exp(-x));
end
% Derivative of the sigmoid function
function cikti = sigmoidDerivative(x)
cikti = sigmoid(x) .* (1 - sigmoid(x));
end
% Define function for calculating Jacobian
function jaco_return=jacobian(jparameterVector, jtarget_vals, jinput_vals,jhidden_size)
jaco_return=zeros(length(jtarget_vals), length(jparameterVector));
W1 = reshape(jparameterVector(1:jhidden_size * 1), jhidden_size, 1);
b1 = reshape(jparameterVector(jhidden_size * 1 + 1:jhidden_size * 1 + jhidden_size),jhidden_size, 1);
W2 = reshape(jparameterVector(jhidden_size * 1 + jhidden_size + 1:end - 1), 1, jhidden_size);
b2 = jparameterVector(end);
for i=1:length(jtarget_vals) % create one row of the Jacobian per input/output pair
[~ , jA1] = forward_pass(jinput_vals(i),W1,b1,W2,b2);
derivW1 = -1 * W2'.* sigmoidDerivative(jA1) *jinput_vals(i); %W1 partial derivatives
derivb1 = -1 * W2'.* sigmoidDerivative(jA1); %b1 partial derivatives
derivW2 = -1 * jA1; %W2 partial derivatives
derivb2 = -1; %b2 partial derivatives linear activation function
jaco_return(i,:)= [derivW1(:);derivb1(:);derivW2(:);derivb2]; %each line of jacobian matrix
end
epsilon=1e-10;
jaco_test=zeros(length(jtarget_vals), length(jparameterVector));
jparameterVector_pus=jparameterVector;
for jsatir=1:length(jtarget_vals)
for jsutun=1:length(jparameterVector)
outpus_test_normal=forward_pass(jinput_vals(jsatir),W1,b1,W2,b2);
jparameterVector_pus(jsutun)=jparameterVector(jsutun)+epsilon;
W1_test = reshape(jparameterVector_pus(1:jhidden_size * 1), jhidden_size, 1);
b1_test = reshape(jparameterVector_pus(jhidden_size * 1 + 1:jhidden_size * 1 + jhidden_size),jhidden_size, 1);
W2_test = reshape(jparameterVector_pus(jhidden_size * 1 + jhidden_size + 1:end - 1), 1, jhidden_size);
b2_test = jparameterVector_pus(end);
jparameterVector_pus(jsutun)=jparameterVector(jsutun);
output_vals_test_plus = forward_pass(jinput_vals(jsatir),W1_test,b1_test,W2_test,b2_test);
jaco_test(jsatir, jsutun)=((output_vals_test_plus- outpus_test_normal)/epsilon)*-1;
end
end
jaco_return=jaco_test; % note: this overwrites the analytic Jacobian above with the numerical one
end
I found the problem. This part of the code:
function cikti = sigmoidDerivative(x)
cikti = sigmoid(x) .* (1 - sigmoid(x));
end
must be changed to
function cikti = sigmoidDerivative(x)
cikti = x .* (1 - x);
end
because in the jacobian function, sigmoidDerivative() is called with values that have already been passed through sigmoid():
[~ , jA1] = forward_pass(jinput_vals(i),W1,b1,W2,b2);
derivW1 = -1 * W2'.* sigmoidDerivative(jA1) *jinput_vals(i);
jA1 is already the return value of sigmoid(), so the old code was applying the sigmoid a second time inside the derivative. Removing this unnecessary repetition fixed the sudden increases and decreases of the residual and the runaway growth of lambda.
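To double-check the fix, a quick sanity check like this (a minimal sketch reusing the sigmoid() defined above) compares the corrected derivative against a central finite difference:
x = linspace(-3, 3, 7)'; % a few test points
h = 1e-6; % finite-difference step
a = sigmoid(x); % activation, as returned by forward_pass
analytic = a .* (1 - a); % corrected sigmoidDerivative applied to the activation
numeric = (sigmoid(x + h) - sigmoid(x - h)) / (2*h); % central difference of sigmoid itself
disp(max(abs(analytic - numeric))); % should be around 1e-10 or smaller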