I prototyped this algorithm in Python in an earlier post, which covers the basic theory.
First, a simple mat type: AVX intrinsics for speed, with memory managed through std::allocator.
//mat.h
#pragma once
#include <immintrin.h>
#include<initializer_list>
#include<iostream>
#include<random>
#include<algorithm>
#include<cmath>
#include<memory>
#include<stdexcept>
static std::allocator<float> alloc;
struct mat
{
int size = 0;
int row = 0;
int col = 0;
float* ptr = nullptr;
mat() = default;
mat(int row, int col, double value){
this->row = row;
this->col = col;
size = row * col;
ptr = alloc.allocate(size);
float val = float(value);
std::fill(ptr, ptr + size, val);
}
mat(std::initializer_list<std::initializer_list<float>> lists){
row = int(lists.size());
col = int(lists.begin()->size());
size = row * col;
ptr = alloc.allocate(size);
int idx = 0;
for (auto& list : lists) {
for (auto& e : list) {
*(ptr + idx) = e;
idx++;
}
}
}
mat(const mat& another){
row = another.row;
col = another.col;
size = another.size;
ptr = alloc.allocate(size);// ptr starts as nullptr in a constructor, nothing to free
std::copy(another.ptr, another.ptr + size, ptr);
}
mat(mat&& another)noexcept{
row = another.row;
col = another.col;
size = another.size;
ptr = another.ptr;// steal the buffer and leave the source empty
another.row = another.col = another.size = 0;
another.ptr = nullptr;
}
mat& operator =(const mat& another){
if (this == &another)return *this;
if (ptr != nullptr) {
alloc.deallocate(ptr, size);// deallocate with the OLD size before overwriting it
}
row = another.row;
col = another.col;
size = another.size;
ptr = alloc.allocate(size);
std::copy(another.ptr, another.ptr + size, ptr);
return *this;
}
mat& operator =(mat&& another)noexcept{
if (this == &another)return *this;
if (ptr != nullptr) {
alloc.deallocate(ptr, size);// deallocate with the OLD size before overwriting it
}
row = another.row;
col = another.col;
size = another.size;
ptr = another.ptr;
another.row = another.col = another.size = 0;
another.ptr = nullptr;
return *this;
}
float& operator ()(int row, int col){
return ptr[this->col * row + col];
}
~mat(){
if (ptr != nullptr) {
alloc.deallocate(ptr, size);
}
}
void show() {
printf("\n");
for (int i = 0; i < row; ++i) {
for (int j = 0; j < col; ++j) {
printf("%.4f ", ptr[i * col + j]);
}
printf("\n");
}
}
};
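// Gaussian init with stddev sqrt(1/fan_in), keeping early pre-activations inside sigmoid's sensitive range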
inline mat rand(int row, int col){
mat res(row, col, 0);
std::mt19937 engine(std::random_device{}());
std::normal_distribution<float> distribution(0.0f, float(std::sqrt(1.0 / col)));
float* pCur = res.ptr;
for (int i = 0; i < res.size; ++i) {
*pCur = distribution(engine);
pCur++;
}
return res;
}
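// i-k-j loop order: a(i,k) is broadcast once per k, while the inner j loop walks rows of b and res
// contiguously, so it maps onto 8-wide FMA with a scalar tail for the last col % 8 columns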
inline mat matMul(const mat& a, const mat& b) {
if (b.row != a.col) throw std::runtime_error("matMul: dimension mismatch");
mat res(a.row, b.col, 0);
for (int i = 0; i < a.row; ++i) {
for (int k = 0; k < a.col; ++k) {
__m256 va = _mm256_set1_ps(a.ptr[i * a.col + k]);
int j = 0;
for (; j < b.col - 7; j += 8) {
__m256 vb = _mm256_loadu_ps(b.ptr + k * b.col + j);
__m256 vr = _mm256_loadu_ps(res.ptr + i * res.col + j);
vr = _mm256_fmadd_ps(va, vb, vr);
_mm256_storeu_ps(res.ptr + i * res.col + j, vr);
//res(i, j) += a(i, k) * b(k, j);
}
for (; j < b.col; ++j) {
res.ptr[i * res.col + j] += a.ptr[i * a.col + k] * b.ptr[k * b.col + j];
}
}
}
return res;
}
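// note: _mm256_exp_ps is an SVML intrinsic (shipped with MSVC and the Intel compiler);
// GCC/Clang do not provide it and would need a scalar fallback or a vector exp approximation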
inline mat sigmoid(const mat& t){
mat res(t.row, t.col, 0);
int i;
for (i = 0; i < t.size - 7; i += 8) {
__m256 vecT = _mm256_loadu_ps(t.ptr + i);
__m256 one = _mm256_set1_ps(1.0f);
__m256 negVecT = _mm256_sub_ps(_mm256_setzero_ps(), vecT);
__m256 expNegVecT = _mm256_exp_ps(negVecT);
__m256 sum = _mm256_add_ps(one, expNegVecT);
__m256 invSum = _mm256_div_ps(one, sum);
_mm256_storeu_ps(res.ptr + i, invSum);
}
for (; i < t.size; ++i) {
res.ptr[i] = 1.0f / (1.0f + std::exp(-t.ptr[i]));
}
return res;
}
inline mat d_sigmoid(const mat& t){
mat res(t.row, t.col, 0);
int i;
for (i = 0; i < t.size - 7; i += 8) {
__m256 vecT = _mm256_loadu_ps(t.ptr + i);
__m256 one = _mm256_set1_ps(1.0f);
__m256 negVecT = _mm256_sub_ps(_mm256_setzero_ps(), vecT);
__m256 expNegVecT = _mm256_exp_ps(negVecT);
__m256 sum = _mm256_add_ps(one, expNegVecT);
__m256 invSum = _mm256_div_ps(one, sum);
__m256 one_sub_sum = _mm256_sub_ps(one, invSum);
__m256 targetVal = _mm256_mul_ps(invSum, one_sub_sum);
_mm256_storeu_ps(res.ptr + i, targetVal);
}
for (; i < t.size; ++i) {
float u = 1 / (1 + std::exp(-t.ptr[i]));
res.ptr[i] = u * (1 - u);
}
return res;
}
inline mat transpose(const mat& t){
mat res(t.col, t.row, 0);
const int dt = t.col;
float* pRes = res.ptr;
for (int i = 0; i < res.row; ++i) {
float* pT = t.ptr + i;
for (int j = 0; j < res.col; ++j) {
*pRes = *pT;
pRes++; pT += dt;
}
}
return res;
}
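// the remaining elementwise ops all follow one pattern: an 8-wide AVX body plus a scalar tail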
inline mat add(const mat& a, const mat& b){
if (a.row != b.row || a.col != b.col) throw std::runtime_error("add: dimension mismatch");
mat res(a.row, a.col, 0);
__m256 va, vb, vr;
int i = 0;
for (; i< a.size - 7; i += 8) {
va = _mm256_loadu_ps(a.ptr + i);
vb = _mm256_loadu_ps(b.ptr + i);
vr = _mm256_add_ps(va, vb);
_mm256_storeu_ps(res.ptr + i, vr);
}
for (; i < a.size; i++) {
res.ptr[i] = a.ptr[i] + b.ptr[i];
}
return res;
}
inline mat sub(const mat& a, const mat& b){
if (a.row != b.row || a.col != b.col) throw std::runtime_error("sub: dimension mismatch");
mat res(a.row, a.col, 0);
__m256 va, vb, vr;
int i = 0;
for(; i < a.size - 7; i += 8) {
va = _mm256_loadu_ps(a.ptr + i);
vb = _mm256_loadu_ps(b.ptr + i);
vr = _mm256_sub_ps(va, vb);
_mm256_storeu_ps(res.ptr + i, vr);
}
for (; i < a.size; i++) {
res.ptr[i] = a.ptr[i] - b.ptr[i];
}
return res;
}
inline mat numMul(const mat& t, double data) {
float val = float(data);
mat res(t.row, t.col, 0);
__m256 vecV = _mm256_set1_ps(val);
int i = 0;
for (; i < t.size - 7; i += 8) {
__m256 vecT = _mm256_loadu_ps(t.ptr + i);
__m256 vecR = _mm256_mul_ps(vecT, vecV);
_mm256_storeu_ps(res.ptr + i, vecR);
}
for (; i < t.size; i++) {
res.ptr[i] = t.ptr[i] * val;
}
return res;
}
inline mat dot(const mat& a, const mat& b) {
if (a.row != b.row || a.col != b.col) throw std::runtime_error("dot: dimension mismatch");
mat res(a.row, a.col, 0);
__m256 va, vb, vr;
int i = 0;
for (; i < a.size - 7; i += 8) {
va = _mm256_loadu_ps(a.ptr + i);
vb = _mm256_loadu_ps(b.ptr + i);
vr = _mm256_mul_ps(va, vb);
_mm256_storeu_ps(res.ptr + i, vr);
}
for (; i < a.size; ++i) {
res.ptr[i] = a.ptr[i] * b.ptr[i];
}
return res;
}
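As a quick smoke test of mat (a hypothetical snippet, not part of the project; the FMA intrinsics require building with /arch:AVX2 on MSVC, or -mavx2 -mfma with GCC/Clang):
//test.cpp (illustrative only)
#include"mat.h"
int main() {
mat a = { {1.f, 2.f}, {3.f, 4.f} };
mat b = { {5.f, 6.f}, {7.f, 8.f} };
matMul(a, b).show();// expect: 19 22 / 43 50
transpose(a).show();// expect: 1 3 / 2 4
sigmoid(a).show();// elementwise 1/(1+e^-x)
}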
Next, the CSV file handling.
//file.h
#pragma once
#include<string>
#include<utility>
#include<vector>
#include<fstream>
#include"mat.h"
struct batch
{
mat input;
mat output;
batch(mat&& in, mat&& out) {
input = std::move(in);// a named rvalue reference is itself an lvalue, so move explicitly
output = std::move(out);
}
};
inline std::vector<batch> trainData;// inline (C++17) so the header is safe to include from several TUs
inline std::vector<batch> testData;
inline std::vector<float> split(std::string& line, char delimiter) {
line += '\n';// getline strips the trailing newline, so append one to flush the last token
std::vector<float> tokens;
std::string token;
for (size_t i = 0; i < line.size(); ++i) {
if (line[i] == '\n' || line[i] == delimiter) {
if (!token.empty()) tokens.emplace_back(std::stof(token));
token.clear();
}
else {
token += line[i];
}
}
return tokens;
}
inline std::vector<std::vector<float>> readCSV(const std::string& filename) {
std::ifstream file(filename);
std::vector<std::vector<float>> data;
if (file.is_open()) {
std::string line;
while (std::getline(file, line)) {
if (!line.empty()) data.emplace_back(split(line, ','));// skip blank lines
}
}
return data;
}
inline mat to_mat_train(std::vector<float>& data) {
mat res(1, 784, 0);
for (int i = 0; i < 784; ++i) {
res.ptr[i] = (data[i + 1] + 1.0f) / 257.0f;
//normalize pixels into (0, 1), avoiding exact zeros
}
return res;
}
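// one-hot target: 0.99 at the label index (column 0 of the CSV row), 0 everywhere else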
inline mat to_mat_test(std::vector<float>& data) {
mat res(1, 10, 0);
int idx = int(data[0]);// the label sits in column 0
res.ptr[idx] = 0.99f;
return res;
}
inline void getData(const std::string& filename_train, const std::string& filename_test) {
printf("start reading data\n");
std::vector<std::vector<float>> train_data = readCSV(filename_train);
std::vector<std::vector<float>> test_data = readCSV(filename_test);
for (auto& e : train_data) {
trainData.emplace_back(
batch(to_mat_train(e),
to_mat_test(e))
);
}
for (auto& e : test_data) {
testData.emplace_back(
batch(to_mat_train(e),
to_mat_test(e))
);
}
printf("end reading data\n");
printf("trainData size %lld\n",train_data.size());
printf("testData size %lld\n",test_data.size());
}
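For reference, to_mat_train/to_mat_test assume each CSV row is the digit label followed by the 784 pixel values (0-255), comma-separated with no header row:
//label, pixel_0, pixel_1, ..., pixel_783   -- one image per row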
Finally, the network itself. It is the same algorithm as the earlier Python version, except this one leaves out the bias terms.
//neu.h
#pragma once
#include<vector>
#include"mat.h"
class Dense {
friend class NeuNet;
private:
mat input;
mat output;
mat active_output;
mat net;
mat dW;
mat error;
public:
Dense() = default;
Dense(int inputSize, int outputSize) {
net = rand(inputSize, outputSize);
}
mat forward(mat& inputData){
input = std::move(inputData);// note: the argument is consumed (moved from)
output = matMul(input, net);
active_output = sigmoid(output);
return active_output;
}
//mind the sign convention: error = output - target
mat cacuError(mat& error) {
this->error = std::move(error);
mat prev_error = matMul(this->error, transpose(net));
return prev_error;
}
void backward(float lr) {
dW = matMul(transpose(input), dot(error, d_sigmoid(output)));
net = sub(net, numMul(dW, lr));
}
};
class NeuNet {
private:
mat inputData;
mat targetData;
std::vector<Dense> dense;
float lr;
public:
void addDense(int inputSize, int outputSize) {
dense.emplace_back(Dense(inputSize, outputSize));
}
void addData(mat& inputMat, mat& targetMat) {
inputData = inputMat;
targetData = targetMat;
}
void setLr(double learningRate) {
lr = float(learningRate);
}
void forward() {
mat fwd = inputData;
for (size_t i = 0; i < dense.size(); ++i) {
fwd = dense[i].forward(fwd);
}
}
void backward() {
int backIdx = int(dense.size() - 1);
mat err = sub(dense[backIdx].active_output, targetData);
for (int i = backIdx; i >= 0; --i) {
err = dense[i].cacuError(err);// propagate through the pre-update weights first,
dense[i].backward(this->lr);// then apply this layer's weight update
}
}
mat querry(mat& inputMat) {
mat fwd = inputMat;
for (size_t i = 0; i < dense.size(); ++i) {
fwd = dense[i].forward(fwd);
}
return fwd;
}
};
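In equation form, one Dense layer with input x, weights W, and learning rate \eta does exactly what forward/cacuError/backward implement, where e is the error handed down from the layer above (e = output - target at the top, as computed in NeuNet::backward):

z = xW, \qquad a = \sigma(z)

e_{prev} = e\,W^{T}, \qquad \Delta W = x^{T}\,(e \odot \sigma'(z)), \qquad W \leftarrow W - \eta\,\Delta W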
Finally, the main function puts it all together.
//main.cpp
#include<cstdio>
#include<ctime>
#include"neu.h"
#include"file.h"
inline int findMaxIdx(mat& t) {
float maxVal = t.ptr[0];// start from the first entry rather than assuming non-negative values
int maxIdx = 0;
for (int i = 0; i < t.size; ++i) {
if (t.ptr[i] > maxVal) {
maxVal = t.ptr[i];
maxIdx = i;
}
}
return maxIdx;
}
int main() {
NeuNet net;
net.setLr(0.02);
net.addDense(784, 100);
net.addDense(100, 10);
getData("E:\\dev\\neuralNetwork\\mnist\\mnist_train.csv",
"E:\\dev\\neuralNetwork\\mnist\\mnist_test.csv");
clock_t s = clock();
for (size_t i = 0; i < trainData.size(); i++) {
net.addData(trainData[i].input, trainData[i].output);
net.forward();
net.backward();
if ((i + 1) % 1000 == 0) {
printf("batch %5lld/%5lld finished\n", (i + 1),trainData.size());
}
}
clock_t e = clock();
float meet = 0;
for (size_t i = 0; i < testData.size(); i++) {
mat tmp = net.querry(testData[i].input);
if (findMaxIdx(tmp) == findMaxIdx(testData[i].output)) {
meet += 1;
}
}
float accuracy = meet / float(testData.size());
printf("final accuracy %.4f\n", accuracy);
printf("training had used %d ms", e - s);
}