打开VS2017后,文件——新建——项目;
找到NVIDIA,有的人说自己的VS中没看见NVIDIA这一项啊,那是因为你没有安装CUDA,或者你在安装CUDA的时候参照某教程将Visual Studio Integration 取消勾选安装,其实后来再重新装上就行。
创建一个文件夹名为 cuda_test 的项目,然后我们发现其实里面已经有 .cu 文件了,如下图所示。
然后,我们像C语言一样生成编译文件,最终结果如下:
接下来,我们修改代码如下,并运行以下代码。
#include <stdio.h>
const int N = 16;
const int blocksize = 16;
// Kernel: one thread per character. Each thread adds the integer offset
// b[i] to the character a[i] in place. Launched with N (= 16) threads in
// a single block by the host code below.
__global__ void hello(char *a, int *b)
{
    const int i = threadIdx.x;
    a[i] = a[i] + b[i];
}
// Demo host code: copies "Hello " plus an offset vector to the GPU, runs the
// `hello` kernel (1 block x 16 threads) to add the offsets, then copies the
// transformed string back and prints it.
// NOTE(review): EXIT_SUCCESS requires <stdlib.h>, which this file does not
// include directly — MSVC/CUDA builds may compile via indirect includes; verify.
int main()
{
// The offsets in b transform "Hello " in place ('H'+15='W', 'e'+10='o', ...).
char a[N] = "Hello \0\0\0\0\0\0";
int b[N] = { 15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
char *ad; // device copy of a
int *bd; // device copy of b
const int csize = N * sizeof(char);
const int isize = N * sizeof(int);
printf("%s", a); // prints the string before the kernel modifies it
cudaMalloc((void**)&ad, csize);
cudaMalloc((void**)&bd, isize);
cudaMemcpy(ad, a, csize, cudaMemcpyHostToDevice);
cudaMemcpy(bd, b, isize, cudaMemcpyHostToDevice);
dim3 dimBlock(blocksize, 1); // 16 threads per block
dim3 dimGrid(1, 1); // a single block
hello << <dimGrid, dimBlock >> > (ad, bd);
// Blocking device-to-host copy: also synchronizes with the kernel above.
cudaMemcpy(a, ad, csize, cudaMemcpyDeviceToHost);
cudaFree(ad);
cudaFree(bd);
printf("%s\n", a); // prints the transformed string
return EXIT_SUCCESS;
}
编译运行成功,最终结果如下:
运行以下代码,查看电脑设备。
/*********************************************/
#include <stdio.h>
#include <cuda_runtime.h>
// Print the key properties of a CUDA device.
// BUG FIX: several cudaDeviceProp fields (totalGlobalMem, sharedMemPerBlock,
// memPitch, totalConstMem, textureAlignment) are size_t; printing them with
// %d is undefined behavior and garbles values on 64-bit builds (e.g. any GPU
// with >= 2 GB of global memory). They are now printed with %zu.
void printDeviceProp(const cudaDeviceProp &prop)
{
    printf("Device Name : %s.\n", prop.name);
    printf("totalGlobalMem : %zu.\n", prop.totalGlobalMem);
    printf("sharedMemPerBlock : %zu.\n", prop.sharedMemPerBlock);
    printf("regsPerBlock : %d.\n", prop.regsPerBlock);
    printf("warpSize : %d.\n", prop.warpSize);
    printf("memPitch : %zu.\n", prop.memPitch);
    printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
    printf("maxThreadsDim[0 - 2] : %d %d %d.\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
    printf("maxGridSize[0 - 2] : %d %d %d.\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    printf("totalConstMem : %zu.\n", prop.totalConstMem);
    printf("major.minor : %d.%d.\n", prop.major, prop.minor);
    printf("clockRate : %d.\n", prop.clockRate); // GPU clock frequency, in kHz
    printf("textureAlignment : %zu.\n", prop.textureAlignment);
    printf("deviceOverlap : %d.\n", prop.deviceOverlap);
    printf("multiProcessorCount : %d.\n", prop.multiProcessorCount);
}
// Initialize CUDA: find the first device with compute capability >= 1.x,
// print its properties, and make it the current device.
// Returns false (with a message on stderr) when no suitable device exists.
bool InitCUDA()
{
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);   // number of CUDA-capable devices
    if (deviceCount == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int dev = 0;
    for (; dev < deviceCount; ++dev) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, dev) == cudaSuccess && prop.major >= 1) {
            printDeviceProp(prop);      // dump the chosen device's properties
            break;
        }
    }
    if (dev == deviceCount) {           // loop ran off the end: nothing usable
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(dev);
    return true;
}
// Entry point: report whether CUDA initialization succeeded.
int main(int argc, char const *argv[])
{
    if (InitCUDA())
        printf("CUDA initialized.\n");
    return 0;
}
程序运行结果:
查看当前系统上安装的GPU设备的详细参数
打开文件夹C:\ProgramData\NVIDIA Corporation\CUDA Samples\v9.2\1_Utilities
其中有一个名为deviceQuery的程序,可编译执行,即可打出当前系统上安装的GPU设备的详细参数。结果如下:
CUDA 初始化
/*
CUDA初始化
*/
#include "cuda_runtime.h"
#include <stdio.h>
//CUDA 初始化
// CUDA initialization: select the first device whose compute capability is
// at least 1.x. Returns false when no CUDA device is present or usable.
bool InitCUDA(){
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);   // number of CUDA-capable devices
    if (deviceCount == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int dev = 0;
    while (dev < deviceCount) {
        cudaDeviceProp prop;
        // Stop at the first device that reports its properties successfully
        // and has major version >= 1.
        if (cudaGetDeviceProperties(&prop, dev) == cudaSuccess && prop.major >= 1)
            break;
        ++dev;
    }
    if (dev == deviceCount) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(dev);
    return true;
}
// Entry point: bail out quietly when CUDA cannot be initialized.
int main(){
    if (!InitCUDA())
        return 0;
    printf("CUDA initialized.\n");
    return 0;
}
运行得到结果:说明系统上有支持CUDA的装置。
完整的向量点积CUDA程序
/*
完整的向量点积CUDA程序
a=[a1,a2,…an], b=[b1,b2,…bn]
a*b=a1*b1+a2*b2+…+an*bn
*/
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#define N 10
__global__ void Dot(int *a, int *b, int *c) // kernel: dot product of two N-element vectors
{
__shared__ int temp[N]; // per-block shared scratch: one partial product per thread
temp[threadIdx.x] = a[threadIdx.x] * b[threadIdx.x];
__syncthreads(); // barrier: all products must be written before thread 0 reduces them
if (0 == threadIdx.x)
{
// Thread 0 serially reduces the N partial products into the scalar result.
int sum = 0;
for (int i = 0; i < N; i++)
sum += temp[i];
*c = sum;
printf("sum Calculated on Device:%d\n", *c); // device-side printf (debug output)
}
}
// Fill a[0..n-1] with pseudo-random digits in the range [0, 9].
void random_ints(int *a, int n)
{
    for (int i = 0; i != n; ++i)
        a[i] = rand() % 10;   // rand() yields 0..RAND_MAX; modulo keeps one digit
}
// Host driver for the dot-product kernel: fills two random N-vectors,
// computes a*b on the GPU (single block, N threads), and verifies the
// result against a CPU reference sum.
int main()
{
int *a, *b, *c; // host copies of the vectors and the scalar result
int *d_a, *d_b, *d_c; // device copies
int size = N * sizeof(int);
cudaMalloc((void **)&d_a, size);
cudaMalloc((void **)&d_b, size);
cudaMalloc((void **)&d_c, sizeof(int));
a = (int *)malloc(size); random_ints(a, N);
b = (int *)malloc(size); random_ints(b, N);
c = (int *)malloc(sizeof(int));
printf("Array a[N]:\n");
for (int i = 0; i < N; i++) printf("%d ", a[i]);
printf("\n");
printf("Array b[N]:\n");
for (int i = 0; i < N; i++) printf("%d ", b[i]);
printf("\n\n");
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
Dot << <1, N >> > (d_a, d_b, d_c); // single block, N threads
// Blocking device-to-host copy: also synchronizes with the kernel.
cudaMemcpy(c, d_c, sizeof(int), cudaMemcpyDeviceToHost);
// CPU reference dot product for verification.
int sumHost = 0;
for (int i = 0; i < N; i++)
sumHost += a[i] * b[i];
printf("sum Calculated on Host=%d\n", sumHost);
printf("Device to Host: a*b=%d\n", *c);
free(a); free(b); free(c);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
运行结果如下:计算随机数向量立方和的CUDA程序
/*
计算随机数向量立方和的CUDA程序
*/
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h> // 为了使用rand 函数
#include <malloc.h>
#define DATA_SIZE 1048576 //数据
int data[DATA_SIZE];
//产生大量0-9之间的随机数
// Populate number[0..size-1] with pseudo-random values in [0, 9].
void GenerateNumbers(int *number, int size)
{
    for (int i = 0; i != size; ++i)
        number[i] = rand() % 10;   // keep only a single decimal digit
}
//CUDA 初始化
// CUDA initialization: choose the first device with compute capability
// >= 1.x and make it current. Returns false when none is available.
bool InitCUDA()
{
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);   // count CUDA-capable devices
    if (deviceCount == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int dev = 0;
    for (; dev < deviceCount; ++dev) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, dev) == cudaSuccess && prop.major >= 1)
            break;                      // found a usable device
    }
    if (dev == deviceCount) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(dev);
    return true;
}
// __global__ 函数(GPU上执行) 计算立方和
// Kernel (launched with a single thread): accumulate the cube of every
// element of num[0..DATA_SIZE-1] into *result.
__global__ static void sumOfcubes(int *num, int* result)
{
    int total = 0;
    for (int idx = 0; idx < DATA_SIZE; ++idx) {
        const int v = num[idx];
        total += v * v * v;
    }
    *result = total;
}
int main()
{ // CUDA initialization
if (!InitCUDA()) {
return 0;
}
// Fill data[] with random digits 0-9.
GenerateNumbers(data, DATA_SIZE);
int* gpudata, *result; // device buffers: input array and the scalar sum
int gpusum = 0; // host copy of the GPU-computed sum
cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
cudaMalloc((void**)&result, sizeof(int));
cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice); // copy input from host to device memory
sumOfcubes <<<1,1,0>>> (gpudata, result); // single block, single thread, no dynamic shared memory
cudaMemcpy(&gpusum, result, sizeof(int), cudaMemcpyDeviceToHost); // blocking copy; also synchronizes with the kernel
cudaFree(gpudata);
cudaFree(result);
printf("GPUsum: %d \n", gpusum);
// CPU reference sum for verification.
int sum = 0;
for (int i = 0; i < DATA_SIZE; i++) {
sum += data[i] * data[i] * data[i];
}
printf("CPUsum: %d \n", sum);
return 0;
}
运行结果如下:计算随机数向量立方和的CUDA程序(加入统计时间的函数)
/*
计算随机数向量立方和的CUDA程序
*/
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <time.h> // 为了使用clock_t
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h> // 为了使用rand 函数
#include <malloc.h>
#define DATA_SIZE 1048576 //数据
int data[DATA_SIZE];
//产生大量0-9之间的随机数
// Fill number[0..size-1] with pseudo-random digits in [0, 9].
void GenerateNumbers(int *number, int size)
{
    for (int idx = 0; idx < size; idx++)
        number[idx] = rand() % 10;   // single decimal digit per element
}
//CUDA 初始化
// CUDA initialization: scan for the first device with compute capability
// >= 1.x; select it, or return false if none qualifies.
bool InitCUDA()
{
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);   // how many CUDA devices exist
    if (deviceCount == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int dev = 0;
    while (dev < deviceCount) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, dev) == cudaSuccess && prop.major >= 1)
            break;                      // usable device found
        ++dev;
    }
    if (dev == deviceCount) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(dev);
    return true;
}
// __global__ 函数(GPU上执行) 计算立方和
// Kernel (single thread): sums the cubes of all DATA_SIZE elements and
// reports the elapsed GPU clock ticks in *time. Divide the tick count by the
// device clockRate (reported in kHz) to convert to milliseconds.
__global__ static void sumOfcubes(int *num, int *result, clock_t *time)
{
int sum = 0;
int i;
clock_t start = clock(); // device-side clock(): raw GPU clock ticks
for (i = 0; i < DATA_SIZE; i++) {
sum += num[i] * num[i] * num[i];
}
*result = sum;
*time = clock() - start; // elapsed ticks for the whole loop
}
// Host driver: cube-sum on the GPU (1 block x 1 thread) with device-side
// timing, verified against a CPU loop.
int main()
{ // CUDA initialization
    if (!InitCUDA()) {
        return 0;
    }
    // Fill data[] with random digits 0-9.
    GenerateNumbers(data, DATA_SIZE);
    int* gpudata, *result;  // device buffers: input data and the scalar sum
    int gpusum = 0;         // host copy of the GPU result
    clock_t* time;          // device buffer for the elapsed GPU tick count
    clock_t time_used;      // host copy of the tick count
    cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
    cudaMalloc((void**)&result, sizeof(int));
    cudaMalloc((void**)&time, sizeof(clock_t));
    // Copy the random input from host to device memory.
    cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
    sumOfcubes <<<1,1,0>>> (gpudata, result, time); // single block, single thread
    // Blocking copies below also synchronize with the kernel.
    cudaMemcpy(&gpusum, result, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost);
    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);
    // BUG FIX: clock_t is not int; printing it with %d is undefined behavior
    // where clock_t is 64-bit. Cast to long and print with %ld.
    printf("GPUsum: %d——time:%ld\n", gpusum, (long)time_used);
    // CPU reference sum for verification.
    int sum = 0;
    for (int i = 0; i < DATA_SIZE; i++) {
        sum += data[i] * data[i] * data[i];
    }
    printf("CPUsum: %d \n", sum);
    return 0;
}
结果如下:GTX 1050上运行核函数部分的时间约为:
1992949251/ 1493000 = 1334.8 mS 其中1493000是GPU频率。
重要!!!
也就是clock() 统计出来的 (end - start) / GPU频率 = 毫秒级单位的时间 其中利用 cudaGetDeviceProperties 获取的 clockRate 字段得到的GPU频率单位是 千赫兹,比如我的GTX1050 的频率是 1493000千赫兹。
进一步优化上述求立方和的程序(利用单block多thread)
/*
计算随机数向量立方和的CUDA程序
*/
#include "cuda_runtime.h"
#include <time.h> // 为了使用clock_t
#include <stdio.h>
#include <stdlib.h> // 为了使用rand 函数
//#include "device_launch_parameters.h"
//#include "device_functions.h"
//#include <cuda.h>
//#include <malloc.h>
#define DATA_SIZE 1048576 //数据
#define THREAD_NUM 256 //线程的数量
int data[DATA_SIZE];
//产生大量0-9之间的随机数
// Fill number[0..size-1] with pseudo-random single digits (0-9).
void GenerateNumbers(int *number, int size){
    int idx = 0;
    while (idx < size) {
        number[idx] = rand() % 10;
        ++idx;
    }
}
//CUDA 初始化
// CUDA initialization: pick the first device reporting compute capability
// >= 1.x; returns false (message on stderr) if none is found.
bool InitCUDA(){
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);   // number of CUDA-capable devices
    if (deviceCount == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int dev = 0;
    for (; dev < deviceCount; ++dev) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, dev) == cudaSuccess && prop.major >= 1)
            break;                      // first usable device wins
    }
    if (dev == deviceCount) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(dev);
    return true;
}
// __global__ 函数(GPU上执行) 计算立方和
// Kernel: one block of THREAD_NUM threads. Each thread cube-sums its own
// contiguous slice of DATA_SIZE/THREAD_NUM elements and writes the partial
// sum to result[tid]; the host reduces the partials.
// NOTE(review): each thread reads a contiguous chunk, so adjacent threads
// access addresses far apart — this pattern is NOT coalesced; the optimized
// version later in the file interleaves the accesses instead.
__global__ static void sumOfcubes(int *num, int *result, clock_t *time){
const int tid = threadIdx.x;
const int size = DATA_SIZE / THREAD_NUM;// elements handled per thread
int sum = 0;
int i;
clock_t start; // kernel start tick (meaningful in thread 0 only)
if (tid == 0) start = clock(); // only thread 0 keeps timing state
for (i = tid * size; i < (tid + 1) * size; i++){
sum += num[i] * num[i] * num[i];
}
result[tid] = sum;
// Timing end: thread 0 only.
if (tid == 0) *time = clock() - start;
}
// Host driver: cube sum with one block of THREAD_NUM threads; the per-thread
// partial sums are reduced on the CPU and checked against a host loop.
int main(){
    if (!InitCUDA()) { return 0; }      // CUDA initialization
    GenerateNumbers(data, DATA_SIZE);   // random input digits 0-9
    int* gpudata, *result;              // device buffers: input and partial sums
    int gpusum[THREAD_NUM];             // host copy of the per-thread partial sums
    clock_t* time;                      // device buffer for the GPU tick count
    clock_t time_use;                   // host copy of the tick count
    cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
    cudaMalloc((void**)&time, sizeof(clock_t));
    cudaMalloc((void**)&result, sizeof(int)*THREAD_NUM);
    // Copy the random input from host to device memory.
    cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
    // Launch the kernel: one block, THREAD_NUM threads, no dynamic shared mem.
    sumOfcubes<<<1,THREAD_NUM,0>>> (gpudata, result, time);
    // Copy results back (blocking copies also synchronize with the kernel).
    // The array decays to a pointer: pass gpusum, not &gpusum.
    cudaMemcpy(gpusum, result, sizeof(int) * THREAD_NUM, cudaMemcpyDeviceToHost);
    cudaMemcpy(&time_use, time, sizeof(clock_t), cudaMemcpyDeviceToHost);
    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);
    int final_sum = 0; /* reduce the per-thread partial sums */
    for (int i = 0; i < THREAD_NUM; i++){
        final_sum += gpusum[i];
    }
    // BUG FIX: clock_t printed with %d is undefined on 64-bit clock_t; cast
    // to long and use %ld. (Also removed the unused `time_used` local.)
    printf("GPUsum: %d gputime: %ld\n", final_sum, (long)time_use);
    // CPU reference sum.
    final_sum = 0;
    for (int i = 0; i < DATA_SIZE; i++) {
        final_sum += data[i] * data[i] * data[i];
    }
    printf("CPUsum: %d \n", final_sum);
    return 0;
}
结果如下:然而,上述程序还能够优化,上述代码:
if (tid == 0) start = clock(); //只在thread 0(即threadIdx.x = 0 的时候)进行记录
for (i = tid * size; i < (tid + 1) * size; i++){
sum += num[i] * num[i] * num[i];
}
result[tid] = sum;
//计算时间的动作,只在thread 0(即threadIdx.x = 0 的时候)进行
if (tid == 0) *time = clock() - start;
}
中核函数并不是连续存取的,读取数字完全是跳跃式的读取,这会非常影响内存的存取效率。因此我们下一步要将取数字的过程变成:thread 0 读取第一个数字,thread 1 读取第二个数字,以此类推......
程序部分修改为:
if (tid == 0) start = clock(); //只在thread 0(即threadIdx.x = 0 的时候)进行记录
for (i = tid; i < DATA_SIZE; i+= THREAD_NUM){
sum += num[i] * num[i] * num[i];
}
result[tid] = sum;
//计算时间的动作,只在thread 0(即threadIdx.x = 0 的时候)进行
if (tid == 0) *time = clock() - start;
实验运行结果:???问题来了,老师上课的时候明明说,这样的优化会使得程序运行时间缩短很多倍,但是为啥我的运行时间结果没啥大的改变呢。8746726/8365217=1.04,几乎就是没有改进嘛.....难道是程序出现了问题,于是,我就将两个程序放在服务器上运行了。结果显示 1103530/185263=5.96,有近6倍的提升,说明这样的优化程序是正确的,只是在VS2017的编译运行中没有体现出来,可这又是为啥?
经过一番查询资料,发现VS2017中我在编译运行的时候用的是debug模式,其实需要改成Release模式。
修改完编译器中的模式之后,发现结果果然不一样了,1060761/166149 = 6.38,说明优化后的程序有很大的提升,因此,得出一个结果,VS中的debug模式只能用来修改程序中的错误,但是不能说明真正的程序性能的好坏,谨记,要改为Release模式。
更进一步的优化,多block多thread
CUDA不仅提供了Thread,还提供了Grid和Block以及Share Memory这些非常重要的机制,每个block中Thread极限是1024,但是通过block和Grid,线程的数量还能成倍增长,甚至用几万个线程。/*
计算随机数向量立方和的CUDA程序
*/
#include "cuda_runtime.h"
#include <time.h> // 为了使用clock_t
#include <stdio.h>
#include <stdlib.h> // 为了使用rand 函数
//#include "device_launch_parameters.h"
//#include "device_functions.h"
//#include <cuda.h>
//#include <malloc.h>
#define DATA_SIZE 1048576 //数据
#define THREAD_NUM 256 //thread 数量
#define BLOCK_NUM 32// block 数量
//配置32 个blocks,每个blocks 有256个threads,总共有32*256= 8192个threads。
int data[DATA_SIZE];
//产生大量0-9之间的随机数
// Fill number[0..size-1] with pseudo-random digits in [0, 9].
void GenerateNumbers(int *number, int size) {
    for (int k = 0; k < size; ++k)
        number[k] = rand() % 10;   // one decimal digit per element
}
//CUDA 初始化
// CUDA initialization: select the first device whose compute capability is
// at least 1.x; false when no device qualifies.
bool InitCUDA() {
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);   // number of CUDA-capable devices
    if (deviceCount == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int dev = 0;
    for (; dev < deviceCount; ++dev) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, dev) == cudaSuccess && prop.major >= 1)
            break;
    }
    if (dev == deviceCount) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(dev);
    return true;
}
// __global__ 函数(GPU上执行) 计算立方和
// Kernel: multi-block, multi-thread cube sum with interleaved access —
// thread (bid,tid) starts at bid*THREAD_NUM+tid and strides by the total
// thread count, so adjacent threads read adjacent elements (coalesced).
// time[bid] and time[bid+BLOCK_NUM] hold each block's start and end clock.
__global__ static void sumOfcubes(int *num, int *result, clock_t *time) {
const int tid = threadIdx.x;
const int bid = blockIdx.x;
int sum = 0;
int i;
// Thread 0 of each block records that block's start tick.
if (tid == 0) time[bid] = clock();
// Index combines bid and tid; stride keeps accesses contiguous per warp.
for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM) {
sum += num[i] * num[i] * num[i];
}
// One partial result per thread across all blocks.
result[bid * THREAD_NUM + tid] = sum;
// Thread 0 of each block records that block's end tick.
if (tid == 0) time[bid + BLOCK_NUM] = clock();
}
// Host driver: multi-block, multi-thread cube sum. Reduces the
// THREAD_NUM*BLOCK_NUM partial sums on the CPU, derives the GPU time from
// the earliest block start tick and latest block end tick, and verifies
// against a CPU loop.
int main() {
if (!InitCUDA()) { return 0; } // CUDA initialization
GenerateNumbers(data, DATA_SIZE); // random input digits 0-9
int *gpudata, *result; // device buffers: input and per-thread partial sums
clock_t *time; // device buffer: 2*BLOCK_NUM tick stamps (start/end per block)
cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
cudaMalloc((void**)&time, sizeof(clock_t)*BLOCK_NUM * 2);
cudaMalloc((void**)&result, sizeof(int)*THREAD_NUM*BLOCK_NUM);
// Copy the input from host to device memory.
cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
// Launch the kernel: BLOCK_NUM blocks x THREAD_NUM threads.
sumOfcubes << <BLOCK_NUM, THREAD_NUM, 0 >> > (gpudata, result, time); // no dynamic shared memory
int gpusum[THREAD_NUM*BLOCK_NUM]; // host copy of the partial sums
clock_t time_use[BLOCK_NUM * 2];
// Copy results back from device memory (blocking; synchronizes with kernel).
cudaMemcpy(&gpusum, result, sizeof(int) * THREAD_NUM * BLOCK_NUM, cudaMemcpyDeviceToHost);
cudaMemcpy(&time_use, time, sizeof(clock_t)*BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
cudaFree(gpudata);
cudaFree(result);
cudaFree(time);
int final_sum = 0; /* reduce the per-thread partial sums */
for (int i = 0; i < THREAD_NUM * BLOCK_NUM; i++) {
final_sum += gpusum[i];
}
// Timing: subtract the earliest block start from the latest block end.
clock_t min_start, max_end;
min_start = time_use[0];
max_end = time_use[BLOCK_NUM];
for (int i = 1; i < BLOCK_NUM; i++) { // scan all blocks' start/end stamps
if (min_start > time_use[i])
min_start = time_use[i];
if (max_end < time_use[i + BLOCK_NUM])
max_end = time_use[i + BLOCK_NUM];
}
printf("GPUsum: %d gputime: %ld\n", final_sum, max_end - min_start);
// CPU reference sum.
final_sum = 0;
for (int i = 0; i < DATA_SIZE; i++) {
final_sum += data[i] * data[i] * data[i];
}
printf("CPUsum: %d \n", final_sum);
return 0;
}
会出现以下错误:解决方案:release模式会优化很多东西,但是多重新编译几次,又会成功,真是搞笑,搞不懂这些编译器......... 算了,还是建议去服务器上运行吧,VS中实在有太多无法未知的错误了。
后来修改 THREAD_NUM = 1024 , BLOCK_NUM = 128 ,总共 1024 * 128 = 131072 个threads,性能并没有很大提升了。
#define THREAD_NUM 1024 //thread 数量
#define BLOCK_NUM 128 // block 数量
为什么不用极限的1024个线程呢?
- 如果1024个线程全部用上,那样就是32*1024 = 32768 个线程,难道不是更好吗?从线程运行的原理来看,线程数量达到一定大小后,再一味地增加线程也不会取得性能的提升,反而有可能会让性能下降。线程数达到隐藏各种latency的程度后,之后线程数量的提升就没有太大的作用了。
- 程序中加和部分是在CPU上进行的,越多的线程意味着越多的结果,而这也意味着CPU上的运算压力会越来越大。
Thread 的同步
一个block内的所有的 thread 可以有共享的内存,也可以进行同步,因此,还可以让每个block 内所有 thread 将自己的计算结果相加起来。代码可以进一步修改:
#include "cuda_runtime.h"
#include <time.h> // 为了使用clock_t
#include <stdio.h>
#include <stdlib.h> // 为了使用rand 函数
//#include "device_launch_parameters.h"
//#include "device_functions.h"
//#include <cuda.h>
//#include <malloc.h>
#define DATA_SIZE 1048576 //数据
#define THREAD_NUM 256 //thread 数量
#define BLOCK_NUM 32// block 数量
//配置32 个blocks,每个blocks 有256个threads,总共有32*256= 8192个threads。
int data[DATA_SIZE];
//产生大量0-9之间的随机数
// Fill number[0..size-1] with pseudo-random digits 0-9.
void GenerateNumbers(int *number, int size) {
    for (int pos = 0; pos < size; ++pos)
        number[pos] = rand() % 10;
}
//CUDA 初始化
// CUDA initialization: take the first device with compute capability >= 1.x.
// Returns false (with a message on stderr) when no such device exists.
bool InitCUDA() {
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);   // devices supporting CUDA
    if (deviceCount == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int dev = 0;
    while (dev < deviceCount) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, dev) == cudaSuccess && prop.major >= 1)
            break;
        ++dev;
    }
    if (dev == deviceCount) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(dev);
    return true;
}
// __global__ 函数(GPU上执行) 计算立方和
// Kernel: per-block cube sum using shared memory. Each thread accumulates a
// strided partial sum into its own shared slot; after the barrier, thread 0
// of each block reduces the block's THREAD_NUM partials and writes ONE
// result per block. Requires sizeof(int)*THREAD_NUM bytes of dynamic shared
// memory (passed as the third launch parameter).
__global__ static void sumOfcubes(int *num, int *result, clock_t *time) {
    extern __shared__ int shared[];
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    int i;
    // Thread 0 of each block records the block's start tick.
    if (tid == 0) time[bid] = clock();
    shared[tid] = 0;
    // Interleaved access: consecutive threads read consecutive elements.
    for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM) {
        shared[tid] += num[i] * num[i] * num[i];
    }
    __syncthreads();   // all partials must be in shared[] before the reduction
    if (tid == 0) {
        for (i = 1; i < THREAD_NUM; i++) {
            shared[0] += shared[i];
        }
        // BUG FIX: the original wrote result[tid] — with tid == 0 that is
        // result[0] in EVERY block, so all blocks raced on one slot and
        // result[1..BLOCK_NUM-1] stayed uninitialized, while the host sums
        // all BLOCK_NUM slots. Each block must write its own result[bid].
        result[bid] = shared[0];
    }
    // Thread 0 of each block records the block's end tick.
    if (tid == 0) time[bid + BLOCK_NUM] = clock();
}
// Host driver for the shared-memory cube-sum kernel: each of the BLOCK_NUM
// blocks delivers one pre-reduced sum; the host adds the BLOCK_NUM values,
// times the kernel via per-block clock stamps, and verifies on the CPU.
int main() {
if (!InitCUDA()) { return 0; } // CUDA initialization
GenerateNumbers(data, DATA_SIZE); // random input digits 0-9
int *gpudata, *result; // device buffers: input and one sum per block
clock_t *time; // device buffer: 2*BLOCK_NUM tick stamps (start/end per block)
cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
cudaMalloc((void**)&time, sizeof(clock_t)*BLOCK_NUM * 2);
cudaMalloc((void**)&result, sizeof(int)*BLOCK_NUM);
cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
// Launch the kernel: BLOCK_NUM blocks x THREAD_NUM threads, with
// sizeof(int)*THREAD_NUM bytes of dynamic shared memory per block.
sumOfcubes << <BLOCK_NUM, THREAD_NUM, sizeof(int)* THREAD_NUM >> > (gpudata, result, time); // device-side parameters
int gpusum[BLOCK_NUM]; // host copy of the per-block sums
clock_t time_use[BLOCK_NUM * 2];
// Blocking copies: also synchronize with the kernel.
cudaMemcpy(&gpusum, result, sizeof(int) * BLOCK_NUM, cudaMemcpyDeviceToHost);
cudaMemcpy(&time_use, time, sizeof(clock_t)*BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
cudaFree(gpudata);
cudaFree(result);
cudaFree(time);
int final_sum = 0; /* reduce the per-block sums */
for (int i = 0; i <BLOCK_NUM; i++) {
final_sum += gpusum[i];
}
// Timing: earliest block start tick vs latest block end tick.
clock_t min_start, max_end;
min_start = time_use[0];
max_end = time_use[BLOCK_NUM];
for (int i = 1; i < BLOCK_NUM; i++) { // scan all blocks' start/end stamps
if (min_start > time_use[i])
min_start = time_use[i];
if (max_end < time_use[i + BLOCK_NUM])
max_end = time_use[i + BLOCK_NUM];
}
printf("GPUsum: %d gputime: %ld\n", final_sum, max_end - min_start);
// CPU reference sum.
final_sum = 0;
for (int i = 0; i < DATA_SIZE; i++) {
final_sum += data[i] * data[i] * data[i];
}
printf("CPUsum: %d \n", final_sum);
return 0;
}
树状加法
参考:
http://hpc.pku.edu.cn/docs/20170829223804057249.pdf 深入浅出谈CUDA
#include "device_functions.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <stdio.h>
#include <cuda.h>
#include <cublas.h>
#define BLOCK_DIM 16
/**
* Computes the squared Euclidean distance matrix between the query points and the reference points.
*
* @param ref reference points stored in the global memory
* @param ref_width number of reference points
* @param ref_pitch pitch of the reference points array in number of column
* @param query query points stored in the global memory
* @param query_width number of query points
* @param query_pitch pitch of the query points array in number of columns
* @param height dimension of points = height of texture `ref` and of the array `query`
* @param dist array containing the query_width x ref_width computed distances
*/
__global__ void compute_distances(float * ref,
int ref_width,
int ref_pitch,
float * query,
int query_width,
int query_pitch,
int height,
float * dist) {
// Declaration of the shared memory arrays As and Bs used to store the sub-matrix of A and B
__shared__ float shared_A[BLOCK_DIM][BLOCK_DIM];
__shared__ float shared_B[BLOCK_DIM][BLOCK_DIM];
// Sub-matrix of A (begin, step, end) and sub-matrix of B (begin, step).
// NOTE(review): every thread writes the same value into these __shared__
// scalars below without a barrier; the values are uniform per block so this
// appears benign, but plain per-thread locals would be cleaner — confirm.
__shared__ int begin_A;
__shared__ int begin_B;
__shared__ int step_A;
__shared__ int step_B;
__shared__ int end_A;
// Thread index within the BLOCK_DIM x BLOCK_DIM tile
int tx = threadIdx.x;
int ty = threadIdx.y;
// Initialization of the SSD (sum of squared differences) for the current thread
float ssd = 0.f;
// Loop parameters: row offsets into the pitched ref/query arrays
begin_A = BLOCK_DIM * blockIdx.y;
begin_B = BLOCK_DIM * blockIdx.x;
step_A = BLOCK_DIM * ref_pitch;
step_B = BLOCK_DIM * query_pitch;
end_A = begin_A + (height - 1) * ref_pitch;
// Conditions guarding the ragged right edges of the matrices
int cond0 = (begin_A + tx < ref_width); // used to write in shared memory
int cond1 = (begin_B + tx < query_width); // used to write in shared memory & to computations and to write in output array
int cond2 = (begin_A + ty < ref_width); // used to computations and to write in output matrix
// Loop over all the sub-matrices of A and B required to compute the block sub-matrix
for (int a = begin_A, b = begin_B; a <= end_A; a += step_A, b += step_B) {
// Load the matrices from device memory to shared memory; each thread loads one element of each matrix
if (a / ref_pitch + ty < height) {
shared_A[ty][tx] = (cond0) ? ref[a + ref_pitch * ty + tx] : 0;
shared_B[ty][tx] = (cond1) ? query[b + query_pitch * ty + tx] : 0;
}
else {
shared_A[ty][tx] = 0; // zero-pad rows beyond the last dimension
shared_B[ty][tx] = 0;
}
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Compute the difference between the two matrices; each thread accumulates one element of the block sub-matrix
if (cond2 && cond1) {
for (int k = 0; k < BLOCK_DIM; ++k) {
float tmp = shared_A[k][ty] - shared_B[k][tx];
ssd += tmp * tmp;
}
}
// Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write the block sub-matrix to device memory; each thread writes one element
if (cond2 && cond1) {
dist[(begin_A + ty) * query_pitch + begin_B + tx] = ssd;
}
}
/**
* Computes the squared Euclidean distance matrix between the query points and the reference points.
*
* @param ref reference points stored in the texture memory
* @param ref_width number of reference points
* @param query query points stored in the global memory
* @param query_width number of query points
* @param query_pitch pitch of the query points array in number of columns
* @param height dimension of points = height of texture `ref` and of the array `query`
* @param dist array containing the query_width x ref_width computed distances
*/
/**
 * Same computation as `compute_distances`, but the reference points are read
 * through a texture object. One thread computes one (query, reference)
 * element of the output distance matrix.
 */
__global__ void compute_distance_texture(cudaTextureObject_t ref,
    int ref_width,
    float * query,
    int query_width,
    int query_pitch,
    int height,
    float* dist) {
    const unsigned int qx = blockIdx.x * blockDim.x + threadIdx.x; // query column
    const unsigned int ry = blockIdx.y * blockDim.y + threadIdx.y; // reference row
    // Threads outside the width x ref_width region do nothing.
    if (qx < query_width && ry < ref_width) {
        float acc = 0.f;
        // Accumulate the squared difference across all `height` dimensions.
        for (int d = 0; d < height; d++) {
            const float diff = tex2D<float>(ref, (float)ry, (float)d) - query[d * query_pitch + qx];
            acc += diff * diff;
        }
        dist[ry * query_pitch + qx] = acc;
    }
}
/**
* For each reference point (i.e. each column) finds the k-th smallest distances
* of the distance matrix and their respective indexes and gathers them at the top
* of the 2 arrays.
*
* Since we only need to locate the k smallest distances, sorting the entire array
* would not be very efficient if k is relatively small. Instead, we perform a
* simple insertion sort by eventually inserting a given distance in the first
* k values.
*
* @param dist distance matrix
* @param dist_pitch pitch of the distance matrix given in number of columns
* @param index index matrix
* @param index_pitch pitch of the index matrix given in number of columns
* @param width width of the distance matrix and of the index matrix
* @param height height of the distance matrix
* @param k number of values to find
*/
__global__ void modified_insertion_sort(float * dist,
int dist_pitch,
int * index,
int index_pitch,
int width,
int height,
int k) {
// Column position: one thread handles one full column of the distance matrix
unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
// Do nothing if we are out of bounds
if (xIndex < width) {
// Pointer shift: p_dist/p_index walk this thread's column via the pitch
float * p_dist = dist + xIndex;
int * p_index = index + xIndex;
// Initialise the first index
p_index[0] = 0;
// Go through all points, maintaining the k smallest seen so far in rows 0..k-1
for (int i = 1; i < height; ++i) {
// Store current distance and associated index
float curr_dist = p_dist[i*dist_pitch];
int curr_index = i;
// Skip the current value if its index is >= k and it is not smaller than the k-th smallest value sorted so far
if (i >= k && curr_dist >= p_dist[(k - 1)*dist_pitch]) {
continue;
}
// Shift values (and indexes) greater than the current distance one slot to the right
int j = min(i, k - 1);
while (j > 0 && p_dist[(j - 1)*dist_pitch] > curr_dist) {
p_dist[j*dist_pitch] = p_dist[(j - 1)*dist_pitch];
p_index[j*index_pitch] = p_index[(j - 1)*index_pitch];
--j;
}
// Write the current distance and index at their sorted position
p_dist[j*dist_pitch] = curr_dist;
p_index[j*index_pitch] = curr_index;
}
}
}
/**
* Computes the square root of the first k lines of the distance matrix.
*
* @param dist distance matrix
* @param width width of the distance matrix
* @param pitch pitch of the distance matrix given in number of columns
* @param k number of values to consider
*/
/**
 * In-place square root of the first k rows of the distance matrix.
 * One thread per element; threads outside the width x k region do nothing.
 */
__global__ void compute_sqrt(float * dist, int width, int pitch, int k) {
    unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
    if (xIndex < width && yIndex < k)
        // sqrtf: explicit single-precision square root — avoids any
        // float -> double -> float round trip from a double-precision sqrt().
        dist[yIndex*pitch + xIndex] = sqrtf(dist[yIndex*pitch + xIndex]);
}
/**
* Computes the squared norm of each column of the input array.
*
* @param array input array
* @param width number of columns of `array` = number of points
* @param pitch pitch of `array` in number of columns
* @param height number of rows of `array` = dimension of the points
* @param norm output array containing the squared norm values
*/
/**
 * One thread per column: accumulate the sum of squares of the column's
 * `height` entries and store it in norm[column].
 */
__global__ void compute_squared_norm(float * array, int width, int pitch, int height, float * norm) {
    const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (col < width) {
        float acc = 0.f;
        for (int row = 0; row < height; row++) {
            const float v = array[row * pitch + col];
            acc += v * v;
        }
        norm[col] = acc;
    }
}
/**
* Add the reference points norm (column vector) to each colum of the input array.
*
* @param array input array
* @param width number of columns of `array` = number of points
* @param pitch pitch of `array` in number of columns
* @param height number of rows of `array` = dimension of the points
* @param norm reference points norm stored as a column vector
*/
// Adds the reference-point norms (one value per row, broadcast along x) to
// each element of `array`. The tx == 0 column of each block stages its rows'
// norm values in shared memory; after the barrier every thread reads its
// row's value from there.
__global__ void add_reference_points_norm(float * array, int width, int pitch, int height, float * norm) {
unsigned int tx = threadIdx.x;
unsigned int ty = threadIdx.y;
unsigned int xIndex = blockIdx.x * blockDim.x + tx;
unsigned int yIndex = blockIdx.y * blockDim.y + ty;
// NOTE(review): size 16 is hard-coded; it must equal blockDim.y of the
// launch (BLOCK_DIM == 16 elsewhere in this file) — confirm if the launch
// configuration ever changes.
__shared__ float shared_vec[16];
if (tx == 0 && yIndex < height)
shared_vec[ty] = norm[yIndex];
__syncthreads(); // the staged norms must be visible to the whole block
if (xIndex < width && yIndex < height)
array[yIndex*pitch + xIndex] += shared_vec[ty];
}
/**
* Adds the query points norm (row vector) to the k first lines of the input
* array and computes the square root of the resulting values.
*
* @param array input array
* @param width number of columns of `array` = number of points
* @param pitch pitch of `array` in number of columns
* @param k number of neighbors to consider
* @param norm query points norm stored as a row vector
*/
/**
 * Adds the query-point norms (row vector) to the first k rows of `array`
 * and takes the square root of each resulting value, producing the final
 * Euclidean distances. One thread per element; out-of-range threads are idle.
 */
__global__ void add_query_points_norm_and_sqrt(float * array, int width, int pitch, int k, float * norm) {
    unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
    if (xIndex < width && yIndex < k)
        // sqrtf: explicit single-precision square root — avoids any
        // float -> double -> float round trip from a double-precision sqrt().
        array[yIndex*pitch + xIndex] = sqrtf(array[yIndex*pitch + xIndex] + norm[xIndex]);
}
// Helper for knn_cuda_global: release all four device buffers in one place.
// cudaFree(NULL) is a no-op, so partially-allocated states are safe.
// (Decomposition: the original repeated this 4-line cleanup in six error paths.)
static void knn_global_cleanup(float * ref_dev, float * query_dev,
                               float * dist_dev, int * index_dev) {
    cudaFree(ref_dev);
    cudaFree(query_dev);
    cudaFree(dist_dev);
    cudaFree(index_dev);
}

/**
 * Brute-force k-nearest-neighbors using global memory.
 *
 * @param ref       reference points, row-major [dim x ref_nb] on the host
 * @param ref_nb    number of reference points
 * @param query     query points, row-major [dim x query_nb] on the host
 * @param query_nb  number of query points
 * @param dim       dimension of the points
 * @param k         number of neighbors to find
 * @param knn_dist  output: k smallest distances per query [k x query_nb]
 * @param knn_index output: corresponding reference indexes [k x query_nb]
 * @return true on success; on any failure prints a message, frees all device
 *         memory, and returns false.
 */
bool knn_cuda_global(const float * ref,
    int ref_nb,
    const float * query,
    int query_nb,
    int dim,
    int k,
    float * knn_dist,
    int * knn_index) {
    // Constants
    const unsigned int size_of_float = sizeof(float);
    const unsigned int size_of_int = sizeof(int);
    // Return variables
    cudaError_t err0, err1, err2, err3;
    // Check that we have at least one CUDA device
    int nb_devices;
    err0 = cudaGetDeviceCount(&nb_devices);
    if (err0 != cudaSuccess || nb_devices == 0) {
        printf("ERROR: No CUDA device found\n");
        return false;
    }
    // Select the first CUDA device as default
    err0 = cudaSetDevice(0);
    if (err0 != cudaSuccess) {
        printf("ERROR: Cannot set the chosen CUDA device\n");
        return false;
    }
    // Allocate pitched global memory so each row of the 2D arrays is aligned
    float * ref_dev = NULL;
    float * query_dev = NULL;
    float * dist_dev = NULL;
    int * index_dev = NULL;
    size_t ref_pitch_in_bytes;
    size_t query_pitch_in_bytes;
    size_t dist_pitch_in_bytes;
    size_t index_pitch_in_bytes;
    err0 = cudaMallocPitch((void**)&ref_dev, &ref_pitch_in_bytes, ref_nb * size_of_float, dim);
    err1 = cudaMallocPitch((void**)&query_dev, &query_pitch_in_bytes, query_nb * size_of_float, dim);
    err2 = cudaMallocPitch((void**)&dist_dev, &dist_pitch_in_bytes, query_nb * size_of_float, ref_nb);
    err3 = cudaMallocPitch((void**)&index_dev, &index_pitch_in_bytes, query_nb * size_of_int, k);
    if (err0 != cudaSuccess || err1 != cudaSuccess || err2 != cudaSuccess || err3 != cudaSuccess) {
        printf("ERROR: Memory allocation error\n");
        knn_global_cleanup(ref_dev, query_dev, dist_dev, index_dev);
        return false;
    }
    // Deduce pitch values in elements (not bytes)
    size_t ref_pitch = ref_pitch_in_bytes / size_of_float;
    size_t query_pitch = query_pitch_in_bytes / size_of_float;
    size_t dist_pitch = dist_pitch_in_bytes / size_of_float;
    size_t index_pitch = index_pitch_in_bytes / size_of_int;
    // The kernels assume the dist/index columns line up with the query columns
    if (query_pitch != dist_pitch || query_pitch != index_pitch) {
        printf("ERROR: Invalid pitch value\n");
        knn_global_cleanup(ref_dev, query_dev, dist_dev, index_dev);
        return false;
    }
    // Copy reference and query data from the host to the device
    err0 = cudaMemcpy2D(ref_dev, ref_pitch_in_bytes, ref, ref_nb * size_of_float, ref_nb * size_of_float, dim, cudaMemcpyHostToDevice);
    err1 = cudaMemcpy2D(query_dev, query_pitch_in_bytes, query, query_nb * size_of_float, query_nb * size_of_float, dim, cudaMemcpyHostToDevice);
    if (err0 != cudaSuccess || err1 != cudaSuccess) {
        printf("ERROR: Unable to copy data from host to device\n");
        knn_global_cleanup(ref_dev, query_dev, dist_dev, index_dev);
        return false;
    }
    // Compute the squared Euclidean distances
    // (BLOCK_DIM x BLOCK_DIM tiles; grid rounded up to cover the matrix)
    dim3 block0(BLOCK_DIM, BLOCK_DIM, 1);
    dim3 grid0(query_nb / BLOCK_DIM, ref_nb / BLOCK_DIM, 1);
    if (query_nb % BLOCK_DIM != 0) grid0.x += 1;
    if (ref_nb % BLOCK_DIM != 0) grid0.y += 1;
    compute_distances << <grid0, block0 >> > (ref_dev, ref_nb, ref_pitch, query_dev, query_nb, query_pitch, dim, dist_dev);
    if (cudaGetLastError() != cudaSuccess) {
        printf("ERROR: Unable to execute kernel\n");
        knn_global_cleanup(ref_dev, query_dev, dist_dev, index_dev);
        return false;
    }
    // Sort the distances with their respective indexes (one thread per query column)
    dim3 block1(256, 1, 1);
    dim3 grid1(query_nb / 256, 1, 1);
    if (query_nb % 256 != 0) grid1.x += 1;
    modified_insertion_sort << <grid1, block1 >> > (dist_dev, dist_pitch, index_dev, index_pitch, query_nb, ref_nb, k);
    if (cudaGetLastError() != cudaSuccess) {
        printf("ERROR: Unable to execute kernel\n");
        knn_global_cleanup(ref_dev, query_dev, dist_dev, index_dev);
        return false;
    }
    // Compute the square root of the k smallest distances
    dim3 block2(16, 16, 1);
    dim3 grid2(query_nb / 16, k / 16, 1);
    if (query_nb % 16 != 0) grid2.x += 1;
    if (k % 16 != 0) grid2.y += 1;
    compute_sqrt << <grid2, block2 >> > (dist_dev, query_nb, query_pitch, k);
    if (cudaGetLastError() != cudaSuccess) {
        printf("ERROR: Unable to execute kernel\n");
        knn_global_cleanup(ref_dev, query_dev, dist_dev, index_dev);
        return false;
    }
    // Copy k smallest distances / indexes from the device to the host
    err0 = cudaMemcpy2D(knn_dist, query_nb * size_of_float, dist_dev, dist_pitch_in_bytes, query_nb * size_of_float, k, cudaMemcpyDeviceToHost);
    err1 = cudaMemcpy2D(knn_index, query_nb * size_of_int, index_dev, index_pitch_in_bytes, query_nb * size_of_int, k, cudaMemcpyDeviceToHost);
    if (err0 != cudaSuccess || err1 != cudaSuccess) {
        printf("ERROR: Unable to copy data from device to host\n");
        knn_global_cleanup(ref_dev, query_dev, dist_dev, index_dev);
        return false;
    }
    // Memory clean-up
    knn_global_cleanup(ref_dev, query_dev, dist_dev, index_dev);
    return true;
}
/**
 * Releases every device resource that knn_cuda_texture may have allocated.
 *
 * The texture object is destroyed BEFORE the cudaArray backing it is freed
 * (the previous code freed the array first, briefly leaving a live texture
 * object referencing released storage). Any argument may be NULL / 0:
 * cudaFree(NULL) is a documented no-op and the array/texture calls are guarded.
 */
static void knn_cuda_texture_cleanup(float *             query_dev,
                                     float *             dist_dev,
                                     int *               index_dev,
                                     cudaArray *         ref_array_dev,
                                     cudaTextureObject_t ref_tex_dev) {
    if (ref_tex_dev)   cudaDestroyTextureObject(ref_tex_dev);
    if (ref_array_dev) cudaFreeArray(ref_array_dev);
    cudaFree(query_dev);
    cudaFree(dist_dev);
    cudaFree(index_dev);
}

/**
 * K-nearest-neighbor search where the reference points are read through a
 * CUDA texture object (backed by a cudaArray) and the query points live in
 * pitched global memory.
 *
 * @param ref       host buffer, reference points (dim rows of ref_nb floats, packed)
 * @param ref_nb    number of reference points
 * @param query     host buffer, query points (dim rows of query_nb floats, packed)
 * @param query_nb  number of query points
 * @param dim       dimensionality of the points
 * @param k         number of nearest neighbors kept per query
 * @param knn_dist  output, k rows of query_nb distances
 * @param knn_index output, k rows of query_nb reference indexes
 * @return true on success, false on any CUDA error (a message is printed)
 */
bool knn_cuda_texture(const float * ref,
                      int           ref_nb,
                      const float * query,
                      int           query_nb,
                      int           dim,
                      int           k,
                      float *       knn_dist,
                      int *         knn_index) {

    const unsigned int size_of_float = sizeof(float);
    const unsigned int size_of_int   = sizeof(int);

    // Check that we have at least one CUDA device
    int nb_devices;
    if (cudaGetDeviceCount(&nb_devices) != cudaSuccess || nb_devices == 0) {
        printf("ERROR: No CUDA device found\n");
        return false;
    }

    // Select the first CUDA device as default
    if (cudaSetDevice(0) != cudaSuccess) {
        printf("ERROR: Cannot set the chosen CUDA device\n");
        return false;
    }

    // Device resources, initialized so the shared cleanup helper is safe to
    // call from any error path regardless of how far allocation progressed.
    float *             query_dev     = NULL;
    float *             dist_dev      = NULL;
    int *               index_dev     = NULL;
    cudaArray *         ref_array_dev = NULL;
    cudaTextureObject_t ref_tex_dev   = 0;

    // Pitched allocations: queries (dim x query_nb), distance matrix
    // (ref_nb x query_nb) and index matrix (k x query_nb).
    size_t query_pitch_in_bytes;
    size_t dist_pitch_in_bytes;
    size_t index_pitch_in_bytes;
    cudaError_t err0 = cudaMallocPitch((void**)&query_dev, &query_pitch_in_bytes, query_nb * size_of_float, dim);
    cudaError_t err1 = cudaMallocPitch((void**)&dist_dev,  &dist_pitch_in_bytes,  query_nb * size_of_float, ref_nb);
    cudaError_t err2 = cudaMallocPitch((void**)&index_dev, &index_pitch_in_bytes, query_nb * size_of_int,   k);
    if (err0 != cudaSuccess || err1 != cudaSuccess || err2 != cudaSuccess) {
        printf("ERROR: Memory allocation error (cudaMallocPitch)\n");
        knn_cuda_texture_cleanup(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
        return false;
    }

    // Convert byte pitches into element pitches used for kernel indexing.
    size_t query_pitch = query_pitch_in_bytes / size_of_float;
    size_t dist_pitch  = dist_pitch_in_bytes  / size_of_float;
    size_t index_pitch = index_pitch_in_bytes / size_of_int;

    // The sort kernel walks dist_dev and index_dev with one pitch value each,
    // so all three query-width pitches must agree.
    if (query_pitch != dist_pitch || query_pitch != index_pitch) {
        printf("ERROR: Invalid pitch value\n");
        knn_cuda_texture_cleanup(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
        return false;
    }

    // Copy query points host -> device (host rows are tightly packed, so the
    // source pitch equals the row width).
    if (cudaMemcpy2D(query_dev, query_pitch_in_bytes, query, query_nb * size_of_float, query_nb * size_of_float, dim, cudaMemcpyHostToDevice) != cudaSuccess) {
        printf("ERROR: Unable to copy data from host to device\n");
        knn_cuda_texture_cleanup(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
        return false;
    }

    // CUDA array for the reference points: ref_nb x dim texels, one 32-bit
    // float channel per texel.
    cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    if (cudaMallocArray(&ref_array_dev, &channel_desc, ref_nb, dim) != cudaSuccess) {
        printf("ERROR: Memory allocation error (cudaMallocArray)\n");
        knn_cuda_texture_cleanup(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
        return false;
    }

    // Copy reference points into the CUDA array. cudaMemcpy2DToArray is the
    // documented replacement for the deprecated cudaMemcpyToArray; the packed
    // host buffer's pitch is its row width, and the copy covers dim full rows.
    if (cudaMemcpy2DToArray(ref_array_dev, 0, 0, ref, ref_nb * size_of_float, ref_nb * size_of_float, dim, cudaMemcpyHostToDevice) != cudaSuccess) {
        printf("ERROR: Unable to copy data from host to device\n");
        knn_cuda_texture_cleanup(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
        return false;
    }

    // Resource descriptor: the texture is backed by the CUDA array above.
    struct cudaResourceDesc res_desc;
    memset(&res_desc, 0, sizeof(res_desc));
    res_desc.resType = cudaResourceTypeArray;
    res_desc.res.array.array = ref_array_dev;

    // Texture descriptor: exact texel fetches (point filtering), clamped
    // un-normalized integer coordinates, raw element reads.
    struct cudaTextureDesc tex_desc;
    memset(&tex_desc, 0, sizeof(tex_desc));
    tex_desc.addressMode[0]   = cudaAddressModeClamp;
    tex_desc.addressMode[1]   = cudaAddressModeClamp;
    tex_desc.filterMode       = cudaFilterModePoint;
    tex_desc.readMode         = cudaReadModeElementType;
    tex_desc.normalizedCoords = 0;

    // Create the texture object the distance kernel reads references through.
    if (cudaCreateTextureObject(&ref_tex_dev, &res_desc, &tex_desc, NULL) != cudaSuccess) {
        printf("ERROR: Unable to create the texture\n");
        knn_cuda_texture_cleanup(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
        return false;
    }

    // Compute the squared Euclidean distances: one thread per (query, ref)
    // entry; ceil-division covers ragged edges when sizes are not multiples
    // of the tile width. cudaGetLastError catches launch errors only; async
    // execution errors surface at the blocking copies below.
    dim3 block0(16, 16, 1);
    dim3 grid0((query_nb + 15) / 16, (ref_nb + 15) / 16, 1);
    compute_distance_texture << <grid0, block0 >> > (ref_tex_dev, ref_nb, query_dev, query_nb, query_pitch, dim, dist_dev);
    if (cudaGetLastError() != cudaSuccess) {
        printf("ERROR: Unable to execute kernel\n");
        knn_cuda_texture_cleanup(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
        return false;
    }

    // Sort the distances with their respective indexes (1-D launch over queries).
    dim3 block1(256, 1, 1);
    dim3 grid1((query_nb + 255) / 256, 1, 1);
    modified_insertion_sort << <grid1, block1 >> > (dist_dev, dist_pitch, index_dev, index_pitch, query_nb, ref_nb, k);
    if (cudaGetLastError() != cudaSuccess) {
        printf("ERROR: Unable to execute kernel\n");
        knn_cuda_texture_cleanup(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
        return false;
    }

    // Compute the square root of the k smallest squared distances.
    dim3 block2(16, 16, 1);
    dim3 grid2((query_nb + 15) / 16, (k + 15) / 16, 1);
    compute_sqrt << <grid2, block2 >> > (dist_dev, query_nb, query_pitch, k);
    if (cudaGetLastError() != cudaSuccess) {
        printf("ERROR: Unable to execute kernel\n");
        knn_cuda_texture_cleanup(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
        return false;
    }

    // Copy the k smallest distances / indexes back to the packed host output
    // buffers; these blocking copies also synchronize with the kernels above.
    err0 = cudaMemcpy2D(knn_dist, query_nb * size_of_float, dist_dev, dist_pitch_in_bytes, query_nb * size_of_float, k, cudaMemcpyDeviceToHost);
    err1 = cudaMemcpy2D(knn_index, query_nb * size_of_int, index_dev, index_pitch_in_bytes, query_nb * size_of_int, k, cudaMemcpyDeviceToHost);
    if (err0 != cudaSuccess || err1 != cudaSuccess) {
        printf("ERROR: Unable to copy data from device to host\n");
        knn_cuda_texture_cleanup(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
        return false;
    }

    // Memory clean-up
    knn_cuda_texture_cleanup(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
    return true;
}
/*
 * K-nearest-neighbor search using cuBLAS for the cross terms.
 * Given the -2.0 scaling of the SGEMM below and the *_norm kernels, this
 * follows the expansion ||q - r||^2 = ||q||^2 + ||r||^2 - 2 q.r: the GEMM
 * writes the -2 q.r terms, the two "norm" kernels add the squared norms,
 * and the square root is applied only to the k retained values per query.
 *
 * NOTE(review): this relies on the legacy cuBLAS API (cublasInit, cublasSgemm
 * with char transpose flags, cublasGetError, cublasShutdown), which is
 * deprecated and absent from recent CUDA toolkits — porting to the
 * handle-based cublas_v2 API should be considered.
 *
 * Returns true on success, false on any CUDA/cuBLAS error (message printed).
 */
bool knn_cublas(const float * ref,
int ref_nb,
const float * query,
int query_nb,
int dim,
int k,
float * knn_dist,
int * knn_index) {
// Constants
const unsigned int size_of_float = sizeof(float);
const unsigned int size_of_int = sizeof(int);
// Return variables
cudaError_t err0, err1, err2, err3, err4, err5;
// Check that we have at least one CUDA device
int nb_devices;
err0 = cudaGetDeviceCount(&nb_devices);
if (err0 != cudaSuccess || nb_devices == 0) {
printf("ERROR: No CUDA device found\n");
return false;
}
// Select the first CUDA device as default
err0 = cudaSetDevice(0);
if (err0 != cudaSuccess) {
printf("ERROR: Cannot set the chosen CUDA device\n");
return false;
}
// Initialize CUBLAS (legacy API; must be matched by the cublasShutdown
// call on every exit path below)
cublasInit();
// Allocate global memory: four pitched matrices plus two linear vectors
// holding the per-point squared norms
float * ref_dev = NULL;
float * query_dev = NULL;
float * dist_dev = NULL;
int * index_dev = NULL;
float * ref_norm_dev = NULL;
float * query_norm_dev = NULL;
size_t ref_pitch_in_bytes;
size_t query_pitch_in_bytes;
size_t dist_pitch_in_bytes;
size_t index_pitch_in_bytes;
err0 = cudaMallocPitch((void**)&ref_dev, &ref_pitch_in_bytes, ref_nb * size_of_float, dim);
err1 = cudaMallocPitch((void**)&query_dev, &query_pitch_in_bytes, query_nb * size_of_float, dim);
err2 = cudaMallocPitch((void**)&dist_dev, &dist_pitch_in_bytes, query_nb * size_of_float, ref_nb);
err3 = cudaMallocPitch((void**)&index_dev, &index_pitch_in_bytes, query_nb * size_of_int, k);
err4 = cudaMalloc((void**)&ref_norm_dev, ref_nb * size_of_float);
err5 = cudaMalloc((void**)&query_norm_dev, query_nb * size_of_float);
if (err0 != cudaSuccess || err1 != cudaSuccess || err2 != cudaSuccess || err3 != cudaSuccess || err4 != cudaSuccess || err5 != cudaSuccess) {
printf("ERROR: Memory allocation error\n");
// cudaFree(NULL) is a documented no-op, so freeing everything
// unconditionally is safe even on partial allocation failure.
cudaFree(ref_dev);
cudaFree(query_dev);
cudaFree(dist_dev);
cudaFree(index_dev);
cudaFree(ref_norm_dev);
cudaFree(query_norm_dev);
cublasShutdown();
return false;
}
// Deduce pitch values (convert byte pitches to element pitches)
size_t ref_pitch = ref_pitch_in_bytes / size_of_float;
size_t query_pitch = query_pitch_in_bytes / size_of_float;
size_t dist_pitch = dist_pitch_in_bytes / size_of_float;
size_t index_pitch = index_pitch_in_bytes / size_of_int;
// Check pitch values: the sort kernel walks dist_dev and index_dev with a
// single pitch each, so the query-width pitches must agree.
if (query_pitch != dist_pitch || query_pitch != index_pitch) {
printf("ERROR: Invalid pitch value\n");
cudaFree(ref_dev);
cudaFree(query_dev);
cudaFree(dist_dev);
cudaFree(index_dev);
cudaFree(ref_norm_dev);
cudaFree(query_norm_dev);
cublasShutdown();
return false;
}
// Copy reference and query data from the host to the device; the host
// buffers are tightly packed, so the source pitch equals the row width.
err0 = cudaMemcpy2D(ref_dev, ref_pitch_in_bytes, ref, ref_nb * size_of_float, ref_nb * size_of_float, dim, cudaMemcpyHostToDevice);
err1 = cudaMemcpy2D(query_dev, query_pitch_in_bytes, query, query_nb * size_of_float, query_nb * size_of_float, dim, cudaMemcpyHostToDevice);
if (err0 != cudaSuccess || err1 != cudaSuccess) {
printf("ERROR: Unable to copy data from host to device\n");
cudaFree(ref_dev);
cudaFree(query_dev);
cudaFree(dist_dev);
cudaFree(index_dev);
cudaFree(ref_norm_dev);
cudaFree(query_norm_dev);
cublasShutdown();
return false;
}
// Compute the squared norm of the reference points (1-D launch over the
// reference points; extra block covers the ragged edge)
dim3 block0(256, 1, 1);
dim3 grid0(ref_nb / 256, 1, 1);
if (ref_nb % 256 != 0) grid0.x += 1;
compute_squared_norm << <grid0, block0 >> > (ref_dev, ref_nb, ref_pitch, dim, ref_norm_dev);
// cudaGetLastError only reports launch-configuration errors here; async
// execution errors would surface at the next blocking call.
if (cudaGetLastError() != cudaSuccess) {
printf("ERROR: Unable to execute kernel\n");
cudaFree(ref_dev);
cudaFree(query_dev);
cudaFree(dist_dev);
cudaFree(index_dev);
cudaFree(ref_norm_dev);
cudaFree(query_norm_dev);
cublasShutdown();
return false;
}
// Compute the squared norm of the query points (same launch shape, over queries)
dim3 block1(256, 1, 1);
dim3 grid1(query_nb / 256, 1, 1);
if (query_nb % 256 != 0) grid1.x += 1;
compute_squared_norm << <grid1, block1 >> > (query_dev, query_nb, query_pitch, dim, query_norm_dev);
if (cudaGetLastError() != cudaSuccess) {
printf("ERROR: Unable to execute kernel\n");
cudaFree(ref_dev);
cudaFree(query_dev);
cudaFree(dist_dev);
cudaFree(index_dev);
cudaFree(ref_norm_dev);
cudaFree(query_norm_dev);
cublasShutdown();
return false;
}
// Computation of query*transpose(reference), scaled by -2, written into
// dist_dev. cuBLAS assumes column-major storage, which is why the element
// pitches are passed as both matrix sizes and leading dimensions: the GEMM
// covers the padded pitch region, and later kernels read only the valid
// query_nb x ref_nb part.
cublasSgemm('n', 't', (int)query_pitch, (int)ref_pitch, dim, (float)-2.0, query_dev, query_pitch, ref_dev, ref_pitch, (float)0.0, dist_dev, query_pitch);
if (cublasGetError() != CUBLAS_STATUS_SUCCESS) {
printf("ERROR: Unable to execute cublasSgemm\n");
cudaFree(ref_dev);
cudaFree(query_dev);
cudaFree(dist_dev);
cudaFree(index_dev);
cudaFree(ref_norm_dev);
cudaFree(query_norm_dev);
cublasShutdown();
return false;
}
// Add reference points norm: one thread per (query, ref) entry of dist_dev
dim3 block2(16, 16, 1);
dim3 grid2(query_nb / 16, ref_nb / 16, 1);
if (query_nb % 16 != 0) grid2.x += 1;
if (ref_nb % 16 != 0) grid2.y += 1;
add_reference_points_norm << <grid2, block2 >> > (dist_dev, query_nb, dist_pitch, ref_nb, ref_norm_dev);
if (cudaGetLastError() != cudaSuccess) {
printf("ERROR: Unable to execute kernel\n");
cudaFree(ref_dev);
cudaFree(query_dev);
cudaFree(dist_dev);
cudaFree(index_dev);
cudaFree(ref_norm_dev);
cudaFree(query_norm_dev);
cublasShutdown();
return false;
}
// Sort each column, keeping the k smallest distances with their indexes
// (reuses grid1/block1: the 1-D one-thread-per-query launch from above)
modified_insertion_sort << <grid1, block1 >> > (dist_dev, dist_pitch, index_dev, index_pitch, query_nb, ref_nb, k);
if (cudaGetLastError() != cudaSuccess) {
printf("ERROR: Unable to execute kernel\n");
cudaFree(ref_dev);
cudaFree(query_dev);
cudaFree(dist_dev);
cudaFree(index_dev);
cudaFree(ref_norm_dev);
cudaFree(query_norm_dev);
cublasShutdown();
return false;
}
// Add query norm and compute the square root of the k first elements
// (only the retained k x query_nb region is touched)
dim3 block3(16, 16, 1);
dim3 grid3(query_nb / 16, k / 16, 1);
if (query_nb % 16 != 0) grid3.x += 1;
if (k % 16 != 0) grid3.y += 1;
add_query_points_norm_and_sqrt << <grid3, block3 >> > (dist_dev, query_nb, dist_pitch, k, query_norm_dev);
if (cudaGetLastError() != cudaSuccess) {
printf("ERROR: Unable to execute kernel\n");
cudaFree(ref_dev);
cudaFree(query_dev);
cudaFree(dist_dev);
cudaFree(index_dev);
cudaFree(ref_norm_dev);
cudaFree(query_norm_dev);
cublasShutdown();
return false;
}
// Copy k smallest distances / indexes from the device to the host; these
// blocking copies also synchronize with the kernels launched above.
err0 = cudaMemcpy2D(knn_dist, query_nb * size_of_float, dist_dev, dist_pitch_in_bytes, query_nb * size_of_float, k, cudaMemcpyDeviceToHost);
err1 = cudaMemcpy2D(knn_index, query_nb * size_of_int, index_dev, index_pitch_in_bytes, query_nb * size_of_int, k, cudaMemcpyDeviceToHost);
if (err0 != cudaSuccess || err1 != cudaSuccess) {
printf("ERROR: Unable to copy data from device to host\n");
cudaFree(ref_dev);
cudaFree(query_dev);
cudaFree(dist_dev);
cudaFree(index_dev);
cudaFree(ref_norm_dev);
cudaFree(query_norm_dev);
cublasShutdown();
return false;
}
// Memory clean-up and CUBLAS shutdown
cudaFree(ref_dev);
cudaFree(query_dev);
cudaFree(dist_dev);
cudaFree(index_dev);
cudaFree(ref_norm_dev);
cudaFree(query_norm_dev);
cublasShutdown();
return true;
}