Chainerで学習したモデルを使ってcuDNNで推論する

ディープラーニングを使ったモデルを学習する際、ディープラーニングフレームワークを使うと使わないとでは生産性に大きな差がある。
多少のオーバーヘッドは許容して、ディープラーニングフレームワークを使う方がトータルでメリットがあると思う。

しかし、推論のみ行う実行環境に限っては、フレームワークに依存したコードだとポータビリティが失われてしまう場合がある。
Pythonが前提のフレームワークだと実行環境でPythonのセットアップが必要になる。
Chainerの場合、Pythonが前提になっているため、実行環境に環境構築が必要になってしまう。
TensorFowの場合、C++でスタティックリンクで利用可能だが、実行バイナリのサイズが肥大化してしまう。
Caffeの場合、比較的薄いレイヤーで実行バイナリがビルドできるが、学習はChainerやTensorFlowに比べると多少扱いにくい。

どのフレームワークも結局、NVIDIAのcuDNNを使用しており、直接cuDNNを使用すれば、特にフレームワークの制限に縛られずに済む。推論のみであれば、特に難しい処理は必要ない。
そこで、学習には、ディープラーニングフレームワークを使用して、実行環境での推論に、cuDNNを使用する方法を試してみた。

学習に使用するディープラーニングフレームワークにはChainerを使用して、Chainerで学習したモデルをC++で読み込んで、cuDNNを使用して推論を行うということを、データセットにMNISTを使用して試した。
※cuDNNのMNISTのサンプルコードはNVIDIAから提供されているが、ミニバッチサイズが1という制限があるため、ミニバッチサイズ2以上でも動作するコードにした。

モデルの定義

モデルは2層の畳み込みニューラルネットワークと出力層に全結合層を使用した、以下のようなモデルを定義した。

from chainer import Chain
import chainer.functions as F
import chainer.links as L

# ネットワーク定義
k = 16
fcl = 256
class NN(Chain):
    def __init__(self):
        super(NN, self).__init__()
        with self.init_scope():
            self.conv1 = L.Convolution2D(in_channels = 1, out_channels = k, ksize = 3, pad = 1)
            self.conv2 = L.Convolution2D(in_channels = k, out_channels = k, ksize = 3, pad = 1)
            self.l3    = L.Linear(7*7*k, fcl)
            self.l4    = L.Linear(fcl, 10)

    def __call__(self, x):
        h = self.conv1(F.reshape(x, (len(x), 1, 28, 28)))
        h = F.max_pooling_2d(F.relu(h), 2)
        h = self.conv2(h)
        h = F.max_pooling_2d(F.relu(h), 2)
        h = F.relu(self.l3(h))
        return self.l4(h)

学習

このモデルを以下のコードで学習した。

import numpy as np
import chainer
import chainer.functions as F
from chainer import cuda
from chainer import datasets, iterators, optimizers, serializers

import argparse

from nn import NN

# 引数の定義
parser = argparse.ArgumentParser(description='example: MNIST')
parser.add_argument('--batchsize', '-b', type=int, default=100,
                    help='Number of images in each mini-batch')
parser.add_argument('--epoch', '-e', type=int, default=20,
                    help='Number of sweeps over the dataset to train')
parser.add_argument('--gpu', '-g', type=int, default=-1,
                    help='GPU ID (negative value indicates CPU)')
parser.add_argument('--initmodel', '-m', default='',
                    help='Initialize the model from given file')
parser.add_argument('--resume', '-r', default='',
                    help='Resume the optimization from snapshot')
args = parser.parse_args()

print('GPU: {}'.format(args.gpu))
print('# Minibatch-size: {}'.format(args.batchsize))
print('# epoch: {}'.format(args.epoch))

# モデルの作成
model = NN()
# モデルをGPUに転送
if args.gpu >= 0:
    cuda.get_device_from_id(args.gpu).use()
    model.to_gpu()

# 最適化手法の設定
optimizer = optimizers.SGD()
optimizer.setup(model)

# 保存したモデルを読み込み
if args.initmodel:
    print('Load model from', args.initmodel)
    serializers.load_npz(args.initmodel, model)
# 保存した最適化状態を復元
if args.resume:
    print('Load optimizer state from', args.resume)
    serializers.load_npz(args.resume, optimizer)

# MNISTデータセットを読み込み
train, test = datasets.get_mnist()

train_iter = iterators.SerialIterator(train, args.batchsize)
test_iter = iterators.SerialIterator(test, args.batchsize, shuffle=False)

# 学習ループ
for epoch in range(1, args.epoch + 1):
    # ミニバッチ単位で学習
    sum_loss = 0
    itr = 0
    for i in range(0, len(train), args.batchsize):
        # ミニバッチデータ
        train_batch = train_iter.next()
        x, t = chainer.dataset.concat_examples(train_batch, args.gpu)

        # 順伝播
        y = model(x)

        # 勾配を初期化
        model.cleargrads()
        # 損失計算
        loss = F.softmax_cross_entropy(y, t)
        # 誤差逆伝播
        loss.backward()
        optimizer.update()

        sum_loss += loss.data
        itr += 1

    # 評価
    sum_test_loss = 0
    sum_test_accuracy = 0
    test_itr = 0
    for i in range(0, len(test), args.batchsize):
        # ミニバッチデータ
        test_batch = test_iter.next()
        with chainer.no_backprop_mode():
            with chainer.using_config('train', False):
                x_test, t_test = chainer.dataset.concat_examples(test_batch, args.gpu)

                # 順伝播
                y_test = model(x_test)
                # 損失計算
                sum_test_loss += F.softmax_cross_entropy(y_test, t_test).data
                # 一致率計算
                sum_test_accuracy += F.accuracy(y_test, t_test).data
                test_itr += 1

    print('epoch={}, train loss={}, test loss={}, accuracy={}'.format(
        optimizer.epoch + 1, sum_loss / itr,
        sum_test_loss / test_itr, sum_test_accuracy / test_itr))

    optimizer.new_epoch()

# モデル保存
print('save the model')
serializers.save_npz('model', model)
# 最適化状態保存
print('save the optimizer')
serializers.save_npz('state', optimizer)

推論

この学習したモデルをC++のコードで読み込んで推論を行う。

MNISTテストデータ読み込み

MNISTのデータは、MNISTのページからtest set imagesをダウンロードした。
データフォーマットについては、同ページの下部のTEST SET IMAGE FILE (t10k-images-idx3-ubyte):に記載されている。
このtest set imagesを、C++のコードで読み込みは、以下のようなコードで行う。

mnistCUDNN.cpp

struct msb_unsigned_int_t {
	union {
		unsigned char byte[4];
		unsigned int val;
	};
};
ifstream& operator >> (ifstream& is, msb_unsigned_int_t& d) {
	is.read((char*)&d.byte[3], 1);
	is.read((char*)&d.byte[2], 1);
	is.read((char*)&d.byte[1], 1);
	is.read((char*)&d.byte[0], 1);
	return is;
}

int main()
{
	// read mnist data
	ifstream ifs("data/t10k-images.idx3-ubyte", ios::in | ios::binary);

	// magic number(32 bit integer)
	msb_unsigned_int_t magic_number;
	ifs >> magic_number;
	if (magic_number.val != 2051) {
		cerr << "illegal magic number" << endl;
		return 1;
	}
	// number of images(32 bit integer)
	msb_unsigned_int_t numberOfImages;
	ifs >> numberOfImages;
	// number of rows(32 bit integer)
	msb_unsigned_int_t rows;
	ifs >> rows;
	// number of columns(32 bit integer)
	msb_unsigned_int_t columns;
	ifs >> columns;

	// ...
}

cuDNNでモデル定義

cuDNNでモデルを定義するには、cuDNN Developer Guideを参考にcuDNNのAPIを使用して行う。
畳み込み層と活性化関数のAPIの使用方法は、以前の日記を参照して欲しい。
WindowsでcuDNNを使用して畳み込みを行う - TadaoYamaokaの開発日記
 WindowsでcuDNNを使用して畳み込みを行う(bias追加) - TadaoYamaokaの開発日記
 WindowsでcuDNNを使用して畳み込みを行う(活性化関数追加) - TadaoYamaokaの開発日記

今回は、レイヤーをテンプレートクラスにして、ネットワーク定義をクラス化して、構造がわかりやすく記述してみた。

layers.h

#pragma once

#include "cudnn_wrapper.h"

template<const int k, const int c, const int fsize, const int pad, const int stride = 1>
class ConvLayer {
public:
	void init(cudnnHandle_t handle, cudnnTensorDescriptor_t xDesc, cudnnTensorDescriptor_t yDesc) {
		checkCUDNN(cudnnSetFilter4dDescriptor(wDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, k, c, fsize, fsize));
		checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, pad, pad, stride, stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
		checkCUDNN(cudnnGetConvolutionForwardAlgorithm(handle, xDesc, wDesc, convDesc, yDesc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo));
		checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(handle, xDesc, wDesc, convDesc, yDesc, algo, &workSpaceSizeInBytes));
		checkCudaErrors(cudaMalloc(&workSpace, workSpaceSizeInBytes));
	}

	int get_yh(const int h) {
		return (h + 2 * pad - fsize) / stride + 1;
	}

	int get_yw(const int w) {
		return (w + 2 * pad - fsize) / stride + 1;
	}

	void get_xdesc(cudnnTensorDescriptor_t xDesc, const int n, const int h, const int w) {
		checkCUDNN(cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));
	}

	void get_ydesc(cudnnTensorDescriptor_t yDesc, const int n, const int h, const int w) {
		checkCUDNN(cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, k, h, w));
	}

	int get_xsize(const int n, const int h, const int w) {
		return n * c * h * w * sizeof(float);
	}

	int get_ysize(const int n, const int h, const int w) {
		return n * k * h * w * sizeof(float);
	}

	void set_param(float* data) {
		const size_t size = c * k * fsize * fsize;
		checkCudaErrors(cudaMalloc((void**)&W, size * sizeof(float)));
		checkCudaErrors(cudaMemcpy(W, data, size * sizeof(float), cudaMemcpyHostToDevice));
	}

	void operator() (cudnnHandle_t handle, cudnnTensorDescriptor_t xDesc, float* x, cudnnTensorDescriptor_t yDesc, float* y) {
		const float alpha = 1.0f;
		const float beta = 0.0f;
		checkCUDNN(cudnnConvolutionForward(handle, &alpha, xDesc, x, wDesc, W, convDesc, algo, workSpace, workSpaceSizeInBytes, &beta, yDesc, y));
	}

private:
	CudnnFilterDescriptor wDesc;
	CudnnConvolutionDescriptor convDesc;
	cudnnConvolutionFwdAlgo_t algo;
	size_t workSpaceSizeInBytes;
	float* W;
	void* workSpace;
};

template<const int c, const int h, const int w>
class Bias {
public:
	Bias() {
		checkCUDNN(cudnnSetTensor4dDescriptor(biasTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, c, h, w));
	}

	void set_bias(float* data) {
		const size_t size = c * h * w;
		checkCudaErrors(cudaMalloc((void**)&b, size * sizeof(float)));
		checkCudaErrors(cudaMemcpy(b, data, size * sizeof(float), cudaMemcpyHostToDevice));
	}

	void operator() (cudnnHandle_t handle, cudnnTensorDescriptor_t xDesc, float* x) {
		const float alpha = 1.0f;
		const float beta = 1.0f;
		checkCUDNN(cudnnAddTensor(handle, &alpha, biasTensorDesc, b, &beta, xDesc, x));
	}

private:
	CudnnTensorDescriptor biasTensorDesc;
	float *b;
};

class ReLU {
public:
	ReLU() {
		checkCUDNN(cudnnSetActivationDescriptor(activDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0/*reluCeiling*/));
	}

	void operator() (cudnnHandle_t handle, cudnnTensorDescriptor_t xDesc, float* x) {
		const float alpha = 1.0f;
		const float beta = 0.0f;
		checkCUDNN(cudnnActivationForward(handle, activDesc, &alpha, xDesc, x, &beta, xDesc, x));
	}

private:
	CudnnActivationDescriptor activDesc;
};

template<const int k, const int n>
class Linear {
public:
	void get_xdesc(cudnnTensorDescriptor_t xDesc, const int m) {
		checkCUDNN(cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, m, k, 1, 1));
	}

	void get_ydesc(cudnnTensorDescriptor_t yDesc, const int m) {
		checkCUDNN(cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, m, n, 1, 1));
	}

	void set_param(float* data) {
		const size_t size = k * n;
		checkCudaErrors(cudaMalloc((void**)&W, size * sizeof(float)));
		checkCudaErrors(cudaMemcpy(W, data, size * sizeof(float), cudaMemcpyHostToDevice));
	}

	void operator() (cublasHandle_t handle, const int m, float* x, float* y) {
		const float alpha = 1.0f;
		const float beta = 0.0f;
		// C = α op ( A ) op ( B ) + β C
		// op ( A ) m × k , op ( B ) k × n and C m × n
		checkCublasErrors(cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, n, m, k, &alpha, W, k, x, k, &beta, y, n));
	}

private:
	float* W;
};

template<const int window, const int stride = window, const int pad = 0>
class MaxPooling2D {
public:
	MaxPooling2D() {
		checkCUDNN(cudnnSetPooling2dDescriptor(poolingDesc, CUDNN_POOLING_MAX, CUDNN_PROPAGATE_NAN, window, window, pad, pad, stride, stride));
	}

	int get_yh(const int h) {
		return (h + 2 * pad - window) / stride + 1;
	}

	int get_yw(const int w) {
		return (w + 2 * pad - window) / stride + 1;
	}

	void get_desc(cudnnTensorDescriptor_t desc, const int n, const int c, const int h, const int w) {
		checkCUDNN(cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));
	}

	void operator() (cudnnHandle_t handle, cudnnTensorDescriptor_t xDesc, float* x, cudnnTensorDescriptor_t yDesc, float* y) {
		const float alpha = 1.0f;
		const float beta = 0.0f;
		checkCUDNN(cudnnPoolingForward(handle, poolingDesc, &alpha, xDesc, x, &beta, yDesc, y));
	}

private:
	CudnnPoolingDescriptor poolingDesc;
};

nn.h

#pragma once

#include "layers.h"

const int IMAGE_H = 28;
const int IMAGE_W = 28;

const int batch_size = 2;


class NN {
public:
	typedef float x_t[batch_size][1][IMAGE_H][IMAGE_W];
	typedef float y_t[batch_size][10];

	NN();
	~NN();

	void load_model(const char* filename);

	void foward(x_t x, y_t y);

private:
	static CudnnHandle cudnnHandle;
	static CublasHandle cublasHandle;
	static const int k = 16;
	static const int fcl = 256;

	ConvLayer<k, 1, 3, 1> conv1;
	Bias<k, 1, 1> bias1;
	ConvLayer<k, k, 3, 1> conv2;
	Bias<k, 1, 1> bias2;
	Linear<7 * 7 * k, fcl> l3;
	Bias<fcl, 1, 1> bias3;
	Linear<fcl, 10> l4;
	Bias<10, 1, 1> bias4;

	ReLU relu;
	MaxPooling2D<2> max_pooling_2d;

	CudnnTensorDescriptor xDesc;
	CudnnTensorDescriptor h1Desc;
	CudnnTensorDescriptor h2Desc;
	CudnnTensorDescriptor h3Desc;
	CudnnTensorDescriptor h4Desc;
	CudnnTensorDescriptor h5Desc;
	CudnnTensorDescriptor yDesc;

	float* x_dev;
	float* h1_dev;
	float* h2_dev;
	float* h3_dev;
	float* h4_dev;
	float* h5_dev;
	float* y_dev;
};

nn.cpp

#include "nn.h"
#include "npz.h"

CudnnHandle NN::cudnnHandle;
CublasHandle NN::cublasHandle;

NN::NN()
{
	conv1.get_xdesc(xDesc, batch_size, IMAGE_H, IMAGE_W);

	const int h1_h = conv1.get_yh(IMAGE_H);
	const int h1_w = conv1.get_yw(IMAGE_W);
	conv1.get_ydesc(h1Desc, batch_size, h1_h, h1_w);

	const int h2_h = max_pooling_2d.get_yh(h1_h);
	const int h2_w = max_pooling_2d.get_yw(h1_w);
	conv2.get_xdesc(h2Desc, batch_size, h2_h, h2_w);

	const int h3_h = conv1.get_yh(h2_h);
	const int h3_w = conv1.get_yw(h2_w);
	conv2.get_ydesc(h3Desc, batch_size, h3_h, h3_w);

	const int h4_h = max_pooling_2d.get_yh(h3_h);
	const int h4_w = max_pooling_2d.get_yw(h3_w);
	max_pooling_2d.get_desc(h4Desc, batch_size, k, h4_h, h4_w);

	l3.get_ydesc(h5Desc, batch_size);
	l4.get_ydesc(yDesc, batch_size);

	// init conv layers
	conv1.init(cudnnHandle, xDesc, h1Desc);
	conv2.init(cudnnHandle, h2Desc, h3Desc);

	// malloc
	checkCudaErrors(cudaMalloc((void**)&x_dev, conv1.get_xsize(batch_size, IMAGE_H, IMAGE_W)));
	checkCudaErrors(cudaMalloc((void**)&h1_dev, conv1.get_ysize(batch_size, h1_h, h1_w)));
	checkCudaErrors(cudaMalloc((void**)&h2_dev, conv2.get_xsize(batch_size, h2_h, h2_w)));
	checkCudaErrors(cudaMalloc((void**)&h3_dev, conv2.get_ysize(batch_size, h3_h, h3_w)));
	checkCudaErrors(cudaMalloc((void**)&h4_dev, batch_size * k * h4_h * h4_w * sizeof(float)));
	checkCudaErrors(cudaMalloc((void**)&h5_dev, batch_size * fcl * sizeof(float)));
	checkCudaErrors(cudaMalloc((void**)&y_dev, batch_size * 10 * sizeof(float)));
}

NN::~NN() {
	checkCudaErrors(cudaFree(x_dev));
	checkCudaErrors(cudaFree(h1_dev));
	checkCudaErrors(cudaFree(h2_dev));
	checkCudaErrors(cudaFree(h3_dev));
	checkCudaErrors(cudaFree(h4_dev));
	checkCudaErrors(cudaFree(h5_dev));
	checkCudaErrors(cudaFree(y_dev));
}

※cudnnHandle_tなどの生成破棄のコードは薄いラッパーを記述して、CudnnHandleなどのクラスにしている。

Chainerのモデル読み込み

ChainerのモデルをC++で読み込む方法は以前の日記で書いた方法を使用し、以下のコードのようなコードを記述した。

npz.cpp

#include "npz.h"

#include <zlib.h>
#include <fstream>

using namespace std;

// https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT
struct LocalFileHeader
{
	unsigned long local_file_header_signature; // 4_bytes (0x04034b50)
	unsigned short version_needed_to_extract; // 2_bytes
	unsigned short general_purpose_bit_flag; // 2_bytes
	unsigned short compression_method; // 2_bytes
	unsigned short last_mod_file_time; // 2_bytes
	unsigned short last_mod_file_date; // 2_bytes
	unsigned long crc_32; // 4_bytes
	unsigned long compressed_size; // 4_bytes
	unsigned long uncompressed_size; // 4_bytes
	unsigned short file_name_length; // 2_bytes
	unsigned short extra_field_length; // 2_bytes
									   // ここまで30bytes

	//char* file_name; // (variable_size)
	//char* extra_field; // (variable_size)
};

ifstream& operator >> (ifstream& ifs, LocalFileHeader& lfh) {
	ifs.read((char*)&lfh.local_file_header_signature, sizeof(lfh.local_file_header_signature));
	ifs.read((char*)&lfh.version_needed_to_extract, sizeof(lfh.version_needed_to_extract));
	ifs.read((char*)&lfh.general_purpose_bit_flag, sizeof(lfh.general_purpose_bit_flag));
	ifs.read((char*)&lfh.compression_method, sizeof(lfh.compression_method));
	ifs.read((char*)&lfh.last_mod_file_time, sizeof(lfh.last_mod_file_time));
	ifs.read((char*)&lfh.last_mod_file_date, sizeof(lfh.last_mod_file_date));
	ifs.read((char*)&lfh.crc_32, sizeof(lfh.crc_32));
	ifs.read((char*)&lfh.compressed_size, sizeof(lfh.compressed_size));
	ifs.read((char*)&lfh.uncompressed_size, sizeof(lfh.uncompressed_size));
	ifs.read((char*)&lfh.file_name_length, sizeof(lfh.file_name_length));
	ifs.read((char*)&lfh.extra_field_length, sizeof(lfh.extra_field_length));
	return ifs;
}

void load_npz(const char* file, ParamMap& params)
{
	ifstream infile(file, ios_base::in | ios_base::binary);
	if (!infile)
		return;

	while (true)
	{
		// Local file header
		LocalFileHeader lfh;
		infile >> lfh;

		if (lfh.local_file_header_signature != 0x04034b50)
		{
			break;
		}

		char* file_name = new char[lfh.file_name_length + 1];

		infile.read(file_name, lfh.file_name_length);
		file_name[lfh.file_name_length] = '\0';

		infile.seekg(lfh.extra_field_length, ios_base::cur);

		// File data
		unsigned char* file_data = new unsigned char[lfh.compressed_size];
		infile.read((char*)file_data, lfh.compressed_size);

		NPY npy;
		npy.uncompressed_data = new unsigned char[lfh.uncompressed_size];

		z_stream strm = { 0 };
		inflateInit2(&strm, -MAX_WBITS);

		strm.next_in = file_data;
		strm.avail_in = lfh.compressed_size;
		strm.next_out = npy.uncompressed_data;
		strm.avail_out = lfh.uncompressed_size;
		inflate(&strm, Z_NO_FLUSH);
		inflateEnd(&strm);

		// NPY
		const unsigned short header_len = *(unsigned short*)(npy.uncompressed_data + 8);
		npy.data = (float*)(npy.uncompressed_data + 10 + header_len);

		params.emplace(file_name, std::move(npy));

		delete[]  file_name;
	}
}

nn.cpp

void NN::load_model(const char* filepath)
{
	// load nn params
	ParamMap params;
	load_npz(filepath, params);

	conv1.set_param(params["conv1/W.npy"].data);
	bias1.set_bias(params["conv1/b.npy"].data);
	conv2.set_param(params["conv2/W.npy"].data);
	bias2.set_bias(params["conv2/b.npy"].data);
	l3.set_param(params["l3/W.npy"].data);
	bias3.set_bias(params["l3/b.npy"].data);
	l4.set_param(params["l4/W.npy"].data);
	bias4.set_bias(params["l4/b.npy"].data);
}

推論

layer.hのoperator()で定義した処理を使用して推論を行う。

nn.cpp

void NN::foward(x_t x, y_t y)
{
	// input
	checkCudaErrors(cudaMemcpy(x_dev, x, sizeof(x_t), cudaMemcpyHostToDevice));

	// conv1
	conv1(cudnnHandle, xDesc, x_dev, h1Desc, h1_dev);
	bias1(cudnnHandle, h1Desc, h1_dev);
	relu(cudnnHandle, h1Desc, h1_dev);
	max_pooling_2d(cudnnHandle, h1Desc, h1_dev, h2Desc, h2_dev);

	// conv2
	conv2(cudnnHandle, h2Desc, h2_dev, h3Desc, h3_dev);
	bias2(cudnnHandle, h3Desc, h3_dev);
	relu(cudnnHandle, h3Desc, h3_dev);
	max_pooling_2d(cudnnHandle, h3Desc, h3_dev, h4Desc, h4_dev);

	// fcl
	l3(cublasHandle, batch_size, h4_dev, h5_dev);
	bias3(cudnnHandle, h5Desc, h5_dev);
	relu(cudnnHandle, h5Desc, h5_dev);
	l4(cublasHandle, batch_size, h5_dev, y_dev);
	bias4(cudnnHandle, yDesc, y_dev);

	// output
	checkCudaErrors(cudaMemcpy(y, y_dev, sizeof(y_t), cudaMemcpyDeviceToHost));
}

基本的にcuDNNのAPIをリファレンスの説明の通り使用すれば使用できるが、全結合層については、cuDNNではなくcuBlasを使用するので注意が必要である。
cuBlasはメモリ配置がFortranの形式になっており、C言語の多次元配列のメモリ配置のまま使用する場合は、行列の転置と積の順番の入れ替えを行う必要がある。

C言語では2次元配列A[M][N]は、メモリ上では、A[0][0]～A[0][N-1]のデータの次に、A[1][0]～A[1][N-1]のデータが配置されるが、cuBlasは、A[0][0]、A[1][0]、…、A[M-1][0]という順で行と列が入れ替わった順で配置されていることを前提としている。
詳細は、cuBlasのリファレンスを参照してほしい。

つまり、C言語のメモリ配置の行列AとBの積 ${\bf A}{\bf B}$ は、 ${\bf B^T}{\bf A^T}$ とする必要がある。計算後のメモリ配置も転置されている状態にする必要があることに注意する。

layers.hのclass Linearのoperator()の処理が該当する箇所である。

実行結果

C++でcuDNNを使用して推論した結果と、Chainerで推論をした結果を比較して完全に一致することを確認した。

predict.py

import chainer
from chainer import cuda
from chainer import datasets, iterators, serializers

import argparse

from nn import NN

parser = argparse.ArgumentParser(description='example: MNIST')
parser.add_argument('--batchsize', '-b', type=int, default=2,
                    help='Number of images in each mini-batch')
parser.add_argument('--gpu', '-g', type=int, default=-1,
                    help='GPU ID (negative value indicates CPU)')
parser.add_argument('--initmodel', '-m', default='model',
                    help='Initialize the model from given file')
args = parser.parse_args()

# モデルの作成
model = NN()
# モデルをGPUに転送
if args.gpu >= 0:
    cuda.get_device_from_id(args.gpu).use()
    model.to_gpu()

# 保存したモデルを読み込み
print('Load model from', args.initmodel)
serializers.load_npz(args.initmodel, model)

# MNISTデータセットを読み込み
train, test = datasets.get_mnist()

test_iter = iterators.SerialIterator(test, args.batchsize, shuffle=False)

# ミニバッチデータ
test_batch = test_iter.next()
with chainer.no_backprop_mode():
    with chainer.using_config('train', False):
        x_test, t_test = chainer.dataset.concat_examples(test_batch, args.gpu)

        # 順伝播
        y_test = model(x_test)

print(y_test.data)

mnistCUDNN.cpp

int main()
{
	// ...

	// make minibatch
	NN::x_t x;
	for (int i = 0; i < batch_size; i++) {
		for (int h = 0; h < IMAGE_H; h++) {
			for (int w = 0; w < IMAGE_W; w++) {
				// pixel(unsigned byte)
				unsigned char pixel;
				ifs.read((char*)&pixel, 1);

				x[i][0][h][w] = float(pixel) / 255.0f;
			}
		}
	}

	NN nn;

	nn.load_model("../chainer/model");

	NN::y_t y;
	nn.foward(x, y);

	for (int i = 0; i < batch_size; i++) {
		for (int c = 0; c < 10; c++) {
			if (c > 0)
				cout << "\t";
			cout << y[i][c];
		}
		cout << endl;
	}

	return 0;
}

実行時間の比較は、長くなったので別途記事を作成する予定。

ソースをGitHubで公開しました。
github.com
※layers.hのConvLayerのset_paramの「const size_t size = k * k * fsize * fsize;」は、「const size_t size = c * k * fsize * fsize;」の間違いです。