我正在用 C++ 从头开始制作一个用于学习目的的神经网络,并尝试让我的神经网络学习手写数字的 MNIST 数据集。然而,每次我拟合神经网络时,网络 MSE 只输出一个值。我不知道这是否与我编写
NeuralNetwok
类的方式或我拟合数据的方式有关。这些是重现问题的相关代码:
// PredictedData.hpp
#pragma once
#include <stdint.h>
struct PredictedData
{
uint64_t m_NeuronIndex;
double m_NeuronValue;
PredictedData(
uint64_t neuronIndex,
double neuronValue);
~PredictedData();
};
// PredictedData.cpp
#include "PredictData.hpp"
PredictedData::PredictedData(
uint64_t neuronIndex,
double neuronValue) : m_NeuronIndex(neuronIndex),
m_NeuronValue(neuronValue) {}
PredictedData::~PredictedData() {}
// NeuralNetwork.hpp
#pragma once
#include "PredictData.hpp"
#include <iostream>
#include <stdint.h>
#include <vector>
class NeuralNetwork
{
private:
void CalculateOutputs(const std::vector<double> &inputData);
void CalculateDeltas(const std::vector<double> &expectedData);
void ApplyGradientDescent(
const std::vector<double> &inputData,
const double &learningRate);
public:
std::vector<uint64_t> m_Topology;
std::vector<std::vector<double>> m_CachedDeltas;
std::vector<std::vector<double>> m_CachedSums;
std::vector<std::vector<double>> m_CachedOutputs;
std::vector<std::vector<double>> m_Biases;
std::vector<std::vector<std::vector<double>>> m_Weights;
NeuralNetwork(const std::vector<uint64_t> &topology);
~NeuralNetwork();
void Fit(
const std::vector<double> &trainingData,
const std::vector<double> &expectedData,
const double &learningRate = 0.001);
double GetMSE(const std::vector<double> &expectedData);
PredictedData Predict(const std::vector<double> &inputData);
};
// NeuralNetwork.cpp
#include "NeuralNetwork.hpp"
#include <cmath>
#include <random>
double SigmoidActivation(const double &x)
{
return 1 / (1 + std::exp(-x));
}
double SigmoidActivationDerivative(const double &x)
{
return SigmoidActivation(x) * (1 - SigmoidActivation(x));
}
double SigmoidActivationDerivativeForOutput(const double &y)
{
return y * (1 - y);
}
double TanhActivation(const double &x)
{
return std::tanh(x);
// return 2 / (1 + std::exp(-2 * x)) - 1;
}
double TanhActivationDerivative(const double &x)
{
const double factor = TanhActivation(x);
return 1 - factor * factor;
}
double TanhActivationDerivativeForOutput(const double &y)
{
return 1 - y * y;
}
double ReLUActivation(const double &x)
{
return x < 0 ? 0 : x;
}
double ReLUActivationDerivative(const double &x)
{
return x < 0 ? 0 : 1;
}
double ParametricReLUActivation(const double &x, const double &a)
{
return x < 0 ? 0 : a * x;
}
double ParametricReLUActivationDerivative(const double &x, const double &a)
{
return x < 0 ? a : 1;
}
double ELUActivation(const double &x, const double &a)
{
return x < 0 ? 0 : a * (std::exp(x) - 1);
}
double ELUActivationDerivative(const double &x, const double &a)
{
return x < 0 ? ELUActivation(x, a) + a : 1;
}
double ELUActivationDerivativeForOutput(const double &x, const double &y, const double &a)
{
return x < 0 ? y + a : 1;
}
double SoftPlusActivation(const double &x)
{
return std::log(1 + std::exp(x));
}
double SoftPlusActivationDerivative(const double &x)
{
return 1 / (1 + std::exp(-x));
}
double Cost(
const double &predicted,
const double &actual)
{
double error = actual - predicted;
return error * error;
}
double CostDerivative(
const double &predicted,
const double &actual)
{
return 2 * (actual - predicted);
}
NeuralNetwork::NeuralNetwork(const std::vector<uint64_t> &topology) : m_Topology(topology)
{
// reserve for layers
m_Biases.reserve(topology.size() - 1);
m_Weights.reserve(topology.size() - 1);
m_CachedDeltas.reserve(topology.size() - 1);
m_CachedSums.reserve(topology.size() - 1);
m_CachedOutputs.reserve(topology.size() - 1);
for (uint64_t i = 1; i < topology.size(); i++)
{
// IMPORTANT! this opeartion will used throughout
// topology is inputed as following {input layer, hidden layer 1, hidden layer 2, hidden layer n, output layer}
// we don't need to create memory for the input layer as it will be passed by the user
// that's why the real index with relative to topology index is topology index - 1
const uint64_t l = i - 1;
// reserve for neuron in layer i
m_CachedSums.push_back(std::vector<double>());
m_CachedSums[l].reserve(topology[i]);
// reserve for neuron in layer i
m_CachedOutputs.push_back(std::vector<double>());
m_CachedOutputs[l].reserve(topology[i]);
// reserve for neuron in layer i
m_CachedDeltas.push_back(std::vector<double>());
m_CachedDeltas[l].reserve(topology[i]);
// reserve for neuron in layer i
m_Biases.push_back(std::vector<double>());
m_Biases[l].reserve(topology[i]);
// reserve for neuron in layer i
m_Weights.push_back(std::vector<std::vector<double>>());
m_Weights[l].reserve(topology[i]);
for (uint64_t j = 0; j < topology[i]; j++)
{
// initialize for neuron j in layer i
m_CachedSums[l].push_back(0);
m_CachedOutputs[l].push_back(0);
m_CachedDeltas[l].push_back(0);
m_Biases[l].push_back(0);
// reserve for layer l that affects neuron j
m_Weights[l].push_back(std::vector<double>());
m_Weights[l][j].reserve(topology[l]);
for (uint64_t z = 0; z < topology[l]; z++)
{
// initialize weights
m_Weights[l][j].push_back((double)std::rand() / RAND_MAX * sqrt(1.0 / topology[l]));
}
}
}
}
NeuralNetwork::~NeuralNetwork()
{
}
void NeuralNetwork::CalculateOutputs(const std::vector<double> &inputData)
{
// layer after input layer
for (uint64_t i = 0; i < m_Topology[1]; i++)
{
m_CachedSums.front()[i] = m_Biases.front()[i];
// product = output * weight (some way connected to the output)
// calculated the products of all neurons of the input layer that is connected to neuron i and sum them
for (uint64_t j = 0; j < m_Topology.front(); j++)
{
m_CachedSums.front()[i] += inputData[j] * m_Weights.front()[i][j];
}
// attempt to work with different activation function
m_CachedOutputs.front()[i] = ReLUActivation(m_CachedSums.front()[i]);
// m_CachedOutputs.front()[i] = SigmoidActivation(m_CachedSums.front()[i]);
// m_CachedOutputs.front()[i] = TanhActivation(m_CachedSums.front()[i]);
}
// more hidden layer
for (uint64_t i = 2; i < m_Topology.size() - 1; i++)
{
const uint64_t l = i - 1;
for (uint64_t j = 0; j < m_Topology[i]; j++)
{
m_CachedSums[l][j] = m_Biases[l][j];
// product = output * weight (some way connected to the output)
// calculated the products of all neurons of the prev layer that is connected to neuron i and sum them
for (uint64_t z = 0; z < m_Topology[l]; z++)
{
m_CachedSums[l][j] += m_CachedOutputs[l - 1][z] * m_Weights[l][j][z];
}
m_CachedOutputs[l][i] = ReLUActivation(m_CachedSums[l][i]);
// m_CachedOutputs[l][i] = SigmoidActivation(m_CachedSums[l][i]);
// m_CachedOutputs[l][j] = TanhActivation(m_CachedSums[l][j]);
}
}
// output layer
for (uint64_t i = 0; i < m_Topology.back(); i++)
{
const uint64_t l = m_Topology.size() - 2;
m_CachedSums.back()[i] = m_Biases.back()[i];
// product = output * weight (some way connected to the output)
// calculated the products of all neurons of the prev hidden layer that is connected to neuron i and sum them
for (uint64_t j = 0; j < m_Topology[l]; j++)
{
m_CachedSums.back()[i] += m_CachedOutputs[l - 1][j] * m_Weights.back()[i][j];
}
// attempt to work with different activation function for output layer
// m_CachedOutputs.back()[i] = ReLUActivation(m_CachedSums.back()[i]);
// m_CachedOutputs.back()[i] = SigmoidActivation(m_CachedSums.back()[i]);
m_CachedOutputs.back()[i] = SigmoidActivation(m_CachedSums.back()[i]);
}
}
void NeuralNetwork::CalculateDeltas(const std::vector<double> &expectedData)
{
// output layer
for (uint64_t i = 0; i < m_Topology.back(); i++)
{
// attempt to work with different activation function derivatiove for output layer
m_CachedDeltas.back()[i] = SigmoidActivationDerivative(m_CachedSums.back()[i]) * CostDerivative(m_CachedOutputs.back()[i], expectedData[i]) / m_Topology.back();
// m_CachedDeltas.back()[i] = SoftPlusActivationDerivative(m_CachedSums.back()[i]) * CostDerivative(m_CachedOutputs.back()[i], expectedData[i]) / m_Topology.back();
// m_CachedDeltas.back()[i] = SoftPlusActivationDerivativeForOutput(m_CachedOutputs.back()[i]) * CostDerivative(m_CachedOutputs.back()[i], expectedData[i]) / m_Topology.back();
}
// hidden layers
for (uint64_t i = m_Topology.size() - 2; i > 0; i--)
{
const uint64_t l = i - 1;
for (uint64_t j = 0; j < m_Topology[i]; j++)
{
// initialize delta to 0
m_CachedDeltas[l][j] = 0;
// sum all deltas of the next layer of layer i multiplied by the weights connected between neurons in the next layer with the neuron j
for (uint64_t z = 0; z < m_Topology[i + 1]; z++)
{
m_CachedDeltas[l][j] += m_CachedDeltas[i][z] * m_Weights[i][z][j];
}
// attempt to work with different activation function derivatiove for hidden layer
// m_CachedDeltas[l][j] *= TanhActivationDerivativeForOutput(m_CachedOutputs[l][j]);
// m_CachedDeltas[l][j] *= SigmoidActivationDerivativeForOutput(m_CachedOutputs[l][j]);
m_CachedDeltas[l][j] *= ReLUActivationDerivative(m_CachedSums[l][j]);
// m_CachedDeltas[l][j] *= SoftPlusActivationDerivative(m_CachedSums[l][j]);
}
}
}
void NeuralNetwork::ApplyGradientDescent(const std::vector<double> &inputData, const double &learningRate)
{
for (uint64_t i = 0; i < m_Topology[1]; i++)
{
// common operation to minimize redundancy
double net = learningRate * m_CachedDeltas.front()[i];
// change bias by net * 1
m_Biases.front()[i] -= net;
for (uint64_t j = 0; j < m_Topology.front(); j++)
{
// change weight by net * input
m_Weights.front()[i][j] -= net * inputData[j];
}
}
for (uint64_t i = 2; i < m_Topology.size(); i++)
{
const uint64_t l = i - 1;
for (uint64_t j = 0; j < m_Topology[i]; j++)
{
// common operation to minimize redundancy
double net = learningRate * m_CachedDeltas[l][j];
// change bias by net * 1
m_Biases[l][j] -= net;
for (uint64_t z = 0; z < m_Topology[l]; z++)
{
// change weight by net * prev layer output
m_Weights[l][j][z] -= net * m_CachedOutputs[l - 1][z];
}
}
}
}
double NeuralNetwork::GetMSE(const std::vector<double> &expectedData)
{
double SumSE = 0;
for (uint64_t i = 0; i < m_Topology.back(); i++)
{
SumSE += Cost(m_CachedOutputs.back()[i], expectedData[i]);
}
return SumSE / (double)m_Topology.back();
}
void NeuralNetwork::Fit(const std::vector<double> &trainingData, const std::vector<double> &expectedData, const double &learningRate)
{
CalculateOutputs(trainingData);
CalculateDeltas(expectedData);
ApplyGradientDescent(trainingData, learningRate);
}
PredictedData NeuralNetwork::Predict(const std::vector<double> &inputData)
{
CalculateOutputs(inputData);
PredictedData predictedData(-1, -1000000);
for (uint64_t i = 0; i < m_Topology.back(); i++)
{
if (m_CachedOutputs.back()[i] <= predictedData.m_NeuronValue)
{
continue;
}
predictedData.m_NeuronIndex = i;
predictedData.m_NeuronValue = m_CachedOutputs.back()[i];
}
return predictedData;
}
// Main.cpp
#include "NeuralNetwork.hpp"
#include <cmath>
#include <fstream>
#include <string>
std::ifstream &Read(std::ifstream &in, void *data, std::streamsize bytes);
template <typename T>
std::ifstream &Read(std::ifstream &in, T &data)
{
return Read(in, &data, sizeof(T));
}
uint32_t SwapEndian(uint32_t val);
bool ReadMNISTImages(std::ifstream &file, std::vector<std::vector<double>> &data, uint64_t &rows, uint64_t &columns);
bool ReadMNISTLabels(std::ifstream &file, std::vector<uint8_t> &data);
#define LEARNING_RATE 0.1
int main()
{
std::ifstream imageFile("F:\\DATA\\mnist\\train-images.idx3-ubyte", std::ios::binary);
if (!imageFile.is_open())
{
std::cout << "couldn't open image file\n";
return 1;
}
std::ifstream labelFile("F:\\DATA\\mnist\\train-labels.idx1-ubyte", std::ios::binary);
if (!labelFile.is_open())
{
std::cout << "couldn't open label file\n";
return 1;
}
uint64_t rows;
uint64_t columns;
std::vector<std::vector<double>> imageData;
if (!ReadMNISTImages(imageFile, imageData, rows, columns))
{
// failed
return 1;
}
imageFile.close();
std::vector<uint8_t> labelData;
if (!ReadMNISTLabels(labelFile, labelData))
{
// failed
return 1;
}
labelFile.close();
uint64_t correct = 0;
uint64_t wrong = 0;
// init network
NeuralNetwork network({rows * columns, 16, 16, 10});
for (uint64_t i = 0; i < labelData.size() / 6; i++)
{
const uint8_t label = labelData[i];
std::vector<double> expectedData(10, 0);
// set index label to 1 (hot encoding)
expectedData[label] = 1;
// fit
network.Fit(imageData[i], expectedData, LEARNING_RATE);
// log MSE every 1000 fit
if (i % 1000 == 0)
{
std::cout << "MSE: " << network.GetMSE(expectedData) << '\n';
}
// predict
PredictedData predictedData = network.Predict(imageData[i]);
// check if the predicted neuron index is the same with the label
if (predictedData.m_NeuronIndex == label)
{
correct++;
}
else
{
wrong++;
}
}
// log the amount of correct and wrong predictions
std::cout << "correct: " << correct << ", wrong: " << wrong << ", total: " << wrong + correct << std::endl;
return 0;
}
std::ifstream &Read(std::ifstream &in, void *data, std::streamsize bytes)
{
char *buffer = reinterpret_cast<char *>(data);
return static_cast<std::ifstream &>(in.read(buffer, bytes));
}
uint32_t SwapEndian(uint32_t val)
{
val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF);
return (val << 16) | (val >> 16);
}
bool ReadMNISTImages(std::ifstream &file, std::vector<std::vector<double>> &data, uint64_t &rows, uint64_t &columns)
{
uint32_t magicNumber;
uint32_t numberOfImages;
uint32_t numberOfRows;
uint32_t numberOfColumns;
Read(file, &magicNumber, 4);
magicNumber = SwapEndian(magicNumber);
if (magicNumber != 2051)
{
std::cout << "Incorrect image file magicNumber: " << magicNumber << std::endl;
return false;
}
Read(file, &numberOfImages, 4);
numberOfImages = SwapEndian(numberOfImages);
Read(file, &numberOfRows, 4);
numberOfRows = SwapEndian(numberOfRows);
rows = numberOfRows;
Read(file, &numberOfColumns, 4);
numberOfColumns = SwapEndian(numberOfColumns);
columns = numberOfColumns;
data.reserve(numberOfImages);
for (uint64_t i = 0; i < numberOfImages; i++)
{
data.push_back(std::vector<double>());
data[i].reserve(numberOfRows * numberOfColumns);
for (uint64_t j = 0; j < numberOfRows * numberOfColumns; j++)
{
uint8_t pixel;
Read(file, &pixel, 1);
data[i].push_back((double)pixel / 255);
}
}
return true;
}
bool ReadMNISTLabels(std::ifstream &file, std::vector<uint8_t> &data)
{
uint32_t magicNumber;
uint32_t numberOfLabels;
Read(file, &magicNumber, 4);
magicNumber = SwapEndian(magicNumber);
if (magicNumber != 2049)
{
std::cout << "Incorrect label file magicNumber: " << magicNumber << std::endl;
return false;
}
Read(file, &numberOfLabels, 4);
numberOfLabels = SwapEndian(numberOfLabels);
data.reserve(numberOfLabels);
for (uint64_t i = 0; i < numberOfLabels; i++)
{
uint8_t pixel;
Read(file, pixel);
data.push_back(pixel);
}
return true;
}
这些是 MSE 输出:
MSE: 0.898595
MSE: 0.899916
MSE: 0.899992
MSE: 0.899984
MSE: 0.899999
MSE: 0.9
MSE: 0.899999
MSE: 0.899999
MSE: 0.9
MSE: 0.9
correct: 979, wrong: 9021, total: 10000
我一开始以为是激活函数的错误使用导致了这个错误,但即使我将激活函数更改为不同的函数,输出仍然是相同的。更改隐藏层的拓扑时也是同样的情况。此外,我更改了学习率,它对 MSE 有一些影响,但仍与 MSE 大致相同。我不知道为什么仍然一次又一次地给出相同的输出。我预计每次拟合后 MSE 都会下降。此外,当多次运行具有不同初始化的同一神经网络时,正确预测的数量和错误预测的数量以及每次拟合的 MSE 保持相同。
ApplyGradientDescent
方法有问题还是我疯了。请帮忙解决这个问题并解释一下出了什么问题。
问题在于成本导数中缺少一个负号。添加这个简单的标志后,问题实际上就消失了。感谢@Alexey S. Larionov 指出我的错误。