layers.h
#pragma once
#include "cudnn_wrapper.h"
template<const int k, const int c, const int fsize, const int pad, const int stride = 1>
class ConvLayer {
public:
void init(cudnnHandle_t handle, cudnnTensorDescriptor_t xDesc, cudnnTensorDescriptor_t yDesc) {
checkCUDNN(cudnnSetFilter4dDescriptor(wDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, k, c, fsize, fsize));
checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, pad, pad, stride, stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
checkCUDNN(cudnnGetConvolutionForwardAlgorithm(handle, xDesc, wDesc, convDesc, yDesc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo));
checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(handle, xDesc, wDesc, convDesc, yDesc, algo, &workSpaceSizeInBytes));
checkCudaErrors(cudaMalloc(&workSpace, workSpaceSizeInBytes));
}
int get_yh(const int h) {
return (h + 2 * pad - fsize) / stride + 1;
}
int get_yw(const int w) {
return (w + 2 * pad - fsize) / stride + 1;
}
void get_xdesc(cudnnTensorDescriptor_t xDesc, const int n, const int h, const int w) {
checkCUDNN(cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));
}
void get_ydesc(cudnnTensorDescriptor_t yDesc, const int n, const int h, const int w) {
checkCUDNN(cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, k, h, w));
}
int get_xsize(const int n, const int h, const int w) {
return n * c * h * w * sizeof(float);
}
int get_ysize(const int n, const int h, const int w) {
return n * k * h * w * sizeof(float);
}
void set_param(float* data) {
const size_t size = c * k * fsize * fsize;
checkCudaErrors(cudaMalloc((void**)&W, size * sizeof(float)));
checkCudaErrors(cudaMemcpy(W, data, size * sizeof(float), cudaMemcpyHostToDevice));
}
void operator() (cudnnHandle_t handle, cudnnTensorDescriptor_t xDesc, float* x, cudnnTensorDescriptor_t yDesc, float* y) {
const float alpha = 1.0f;
const float beta = 0.0f;
checkCUDNN(cudnnConvolutionForward(handle, &alpha, xDesc, x, wDesc, W, convDesc, algo, workSpace, workSpaceSizeInBytes, &beta, yDesc, y));
}
private:
CudnnFilterDescriptor wDesc;
CudnnConvolutionDescriptor convDesc;
cudnnConvolutionFwdAlgo_t algo;
size_t workSpaceSizeInBytes;
float* W;
void* workSpace;
};
template<const int c, const int h, const int w>
class Bias {
public:
Bias() {
checkCUDNN(cudnnSetTensor4dDescriptor(biasTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, c, h, w));
}
void set_bias(float* data) {
const size_t size = c * h * w;
checkCudaErrors(cudaMalloc((void**)&b, size * sizeof(float)));
checkCudaErrors(cudaMemcpy(b, data, size * sizeof(float), cudaMemcpyHostToDevice));
}
void operator() (cudnnHandle_t handle, cudnnTensorDescriptor_t xDesc, float* x) {
const float alpha = 1.0f;
const float beta = 1.0f;
checkCUDNN(cudnnAddTensor(handle, &alpha, biasTensorDesc, b, &beta, xDesc, x));
}
private:
CudnnTensorDescriptor biasTensorDesc;
float *b;
};
// In-place ReLU activation via cuDNN (x := max(x, 0), NaNs propagated).
class ReLU {
public:
	ReLU() {
		// The trailing 0.0 is the clipping coefficient, unused for plain ReLU.
		checkCUDNN(cudnnSetActivationDescriptor(activDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0));
	}
	// Apply the activation in place on the tensor described by xDesc.
	void operator() (cudnnHandle_t handle, cudnnTensorDescriptor_t xDesc, float* x) {
		static const float one = 1.0f;   // scale of the activation result
		static const float zero = 0.0f;  // no blending with the prior contents
		checkCUDNN(cudnnActivationForward(handle, activDesc, &one, xDesc, x, &zero, xDesc, x));
	}
private:
	CudnnActivationDescriptor activDesc;
};
template<const int k, const int n>
class Linear {
public:
void get_xdesc(cudnnTensorDescriptor_t xDesc, const int m) {
checkCUDNN(cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, m, k, 1, 1));
}
void get_ydesc(cudnnTensorDescriptor_t yDesc, const int m) {
checkCUDNN(cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, m, n, 1, 1));
}
void set_param(float* data) {
const size_t size = k * n;
checkCudaErrors(cudaMalloc((void**)&W, size * sizeof(float)));
checkCudaErrors(cudaMemcpy(W, data, size * sizeof(float), cudaMemcpyHostToDevice));
}
void operator() (cublasHandle_t handle, const int m, float* x, float* y) {
const float alpha = 1.0f;
const float beta = 0.0f;
checkCublasErrors(cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, n, m, k, &alpha, W, k, x, k, &beta, y, n));
}
private:
float* W;
};
// Square-window max pooling over float NCHW tensors.
// Defaults: non-overlapping windows (stride = window), no padding.
template<const int window, const int stride = window, const int pad = 0>
class MaxPooling2D {
public:
	MaxPooling2D() {
		checkCUDNN(cudnnSetPooling2dDescriptor(poolingDesc, CUDNN_POOLING_MAX, CUDNN_PROPAGATE_NAN, window, window, pad, pad, stride, stride));
	}
	// Pooled output height for an input of height h.
	int get_yh(const int h) {
		return pooled_extent(h);
	}
	// Pooled output width for an input of width w.
	int get_yw(const int w) {
		return pooled_extent(w);
	}
	// Fill desc as an (n, c, h, w) float NCHW tensor descriptor.
	void get_desc(cudnnTensorDescriptor_t desc, const int n, const int c, const int h, const int w) {
		checkCUDNN(cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));
	}
	// Forward pass: y = maxpool(x).
	void operator() (cudnnHandle_t handle, cudnnTensorDescriptor_t xDesc, float* x, cudnnTensorDescriptor_t yDesc, float* y) {
		static const float one = 1.0f;   // scale of the pooling result
		static const float zero = 0.0f;  // no blending with prior y contents
		checkCUDNN(cudnnPoolingForward(handle, poolingDesc, &one, xDesc, x, &zero, yDesc, y));
	}
private:
	// Shared output-extent formula for both spatial dimensions.
	static int pooled_extent(const int in) {
		return (in + 2 * pad - window) / stride + 1;
	}
	CudnnPoolingDescriptor poolingDesc;
};
nn.cpp
#include "nn.h"
#include "npz.h"
// Process-wide cuDNN/cuBLAS handles shared by all NN instances
// (RAII wrapper classes manage creation/destruction).
CudnnHandle NN::cudnnHandle;
CublasHandle NN::cublasHandle;
// Builds tensor descriptors for every stage of the forward pass
//   x -> conv1 -> h1 -> pool -> h2 -> conv2 -> h3 -> pool -> h4 -> l3 -> h5 -> l4 -> y
// and allocates the corresponding device buffers.
NN::NN()
{
	conv1.get_xdesc(xDesc, batch_size, IMAGE_H, IMAGE_W);
	const int h1_h = conv1.get_yh(IMAGE_H);
	const int h1_w = conv1.get_yw(IMAGE_W);
	conv1.get_ydesc(h1Desc, batch_size, h1_h, h1_w);
	const int h2_h = max_pooling_2d.get_yh(h1_h);
	const int h2_w = max_pooling_2d.get_yw(h1_w);
	conv2.get_xdesc(h2Desc, batch_size, h2_h, h2_w);
	// BUG FIX: h3 is conv2's output, so its size must be computed from
	// conv2's filter/pad/stride. The original asked conv1, which is only
	// correct by coincidence when both convolutions share those parameters.
	const int h3_h = conv2.get_yh(h2_h);
	const int h3_w = conv2.get_yw(h2_w);
	conv2.get_ydesc(h3Desc, batch_size, h3_h, h3_w);
	const int h4_h = max_pooling_2d.get_yh(h3_h);
	const int h4_w = max_pooling_2d.get_yw(h3_w);
	max_pooling_2d.get_desc(h4Desc, batch_size, k, h4_h, h4_w);
	l3.get_ydesc(h5Desc, batch_size);
	l4.get_ydesc(yDesc, batch_size);
	// Descriptors are final; pick conv algorithms and workspaces.
	conv1.init(cudnnHandle, xDesc, h1Desc);
	conv2.init(cudnnHandle, h2Desc, h3Desc);
	// Device buffers for the input, each intermediate, and the 10-class output.
	checkCudaErrors(cudaMalloc((void**)&x_dev, conv1.get_xsize(batch_size, IMAGE_H, IMAGE_W)));
	checkCudaErrors(cudaMalloc((void**)&h1_dev, conv1.get_ysize(batch_size, h1_h, h1_w)));
	checkCudaErrors(cudaMalloc((void**)&h2_dev, conv2.get_xsize(batch_size, h2_h, h2_w)));
	checkCudaErrors(cudaMalloc((void**)&h3_dev, conv2.get_ysize(batch_size, h3_h, h3_w)));
	checkCudaErrors(cudaMalloc((void**)&h4_dev, batch_size * k * h4_h * h4_w * sizeof(float)));
	checkCudaErrors(cudaMalloc((void**)&h5_dev, batch_size * fcl * sizeof(float)));
	checkCudaErrors(cudaMalloc((void**)&y_dev, batch_size * 10 * sizeof(float)));
}
// Frees every device buffer allocated by the constructor,
// in reverse order of allocation.
NN::~NN() {
	checkCudaErrors(cudaFree(y_dev));
	checkCudaErrors(cudaFree(h5_dev));
	checkCudaErrors(cudaFree(h4_dev));
	checkCudaErrors(cudaFree(h3_dev));
	checkCudaErrors(cudaFree(h2_dev));
	checkCudaErrors(cudaFree(h1_dev));
	checkCudaErrors(cudaFree(x_dev));
}
※cudnnHandle_tなどの生成破棄のコードは薄いラッパーを記述して、CudnnHandleなどのクラスにしている。