下面是一个利用 CUDA 计算数组元素三次方的 CUDA C 程序。

#include <stdio.h>
#include <stdlib.h>
// __global__ marks this function as a CUDA kernel: it runs on the GPU and
// is launched from host code. This program launches it as a single block
// of 96 threads, so threadIdx.x alone identifies the element to process.
__global__ void cube(float * d_out, float * d_in){
	// One thread per array element: thread i cubes d_in[i].
	int idx = threadIdx.x;
	float v = d_in[idx];
	d_out[idx] = v * v * v;
}

// Abort with a readable message if a CUDA runtime call failed. CUDA errors
// are otherwise silent and only surface at some later, unrelated call.
static void checkCuda(cudaError_t err, const char * what) {
	if (err != cudaSuccess) {
		fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
}

// Entry point: cube 96 floats on the GPU and print the results.
// Flow: init host input -> allocate device buffers -> copy host-to-device ->
// launch kernel -> copy device-to-host -> print -> free device memory.
int main(int argc, char ** argv) {
	const int ARRAY_SIZE = 96;
	const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

	// Generate the input array on the host: h_in[i] = i.
	float h_in[ARRAY_SIZE];
	for (int i = 0; i < ARRAY_SIZE; i++) {
		h_in[i] = float(i);
	}
	float h_out[ARRAY_SIZE]; // host-side output buffer

	// Device (GPU) memory pointers.
	float * d_in;
	float * d_out;

	// Allocate GPU memory for input and output.
	checkCuda(cudaMalloc((void**) &d_in, ARRAY_BYTES), "cudaMalloc d_in");
	checkCuda(cudaMalloc((void**) &d_out, ARRAY_BYTES), "cudaMalloc d_out");

	// Transfer the input array to the GPU (Host = CPU, Device = GPU).
	checkCuda(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice),
	          "cudaMemcpy H2D");

	// Launch the kernel: 1 block of ARRAY_SIZE (96) threads.
	cube<<<1, ARRAY_SIZE>>>(d_out, d_in);
	// Kernel launches return no status; fetch launch-configuration errors
	// explicitly via cudaGetLastError().
	checkCuda(cudaGetLastError(), "cube kernel launch");

	// Copy the result back to the CPU. This blocking cudaMemcpy also waits
	// for the kernel to finish. (All requests are issued by the CPU; the
	// GPU only responds.)
	checkCuda(cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost),
	          "cudaMemcpy D2H");

	// Print the resulting array, four tab-separated values per line.
	for (int i = 0; i < ARRAY_SIZE; i++) {
		printf("%f", h_out[i]);
		printf(((i % 4) != 3) ? "\t" : "\n");
	}

	// Release device memory.
	checkCuda(cudaFree(d_in), "cudaFree d_in");
	checkCuda(cudaFree(d_out), "cudaFree d_out");

	return 0;
}
用 nvcc 对上面的 .cu 文件进行编译

$nvcc -o cube cube.cu

$./cube

0.000000	1.000000	8.000000	27.000000
64.000000	125.000000	216.000000	343.000000
512.000000	729.000000	1000.000000	1331.000000
1728.000000	2197.000000	2744.000000	3375.000000
4096.000000	4913.000000	5832.000000	6859.000000
8000.000000	9261.000000	10648.000000	12167.000000
13824.000000	15625.000000	17576.000000	19683.000000
21952.000000	24389.000000	27000.000000	29791.000000
32768.000000	35937.000000	39304.000000	42875.000000
46656.000000	50653.000000	54872.000000	59319.000000
64000.000000	68921.000000	74088.000000	79507.000000
85184.000000	91125.000000	97336.000000	103823.000000
110592.000000	117649.000000	125000.000000	132651.000000
140608.000000	148877.000000	157464.000000	166375.000000
175616.000000	185193.000000	195112.000000	205379.000000
216000.000000	226981.000000	238328.000000	250047.000000
262144.000000	274625.000000	287496.000000	300763.000000
314432.000000	328509.000000	343000.000000	357911.000000
373248.000000	389017.000000	405224.000000	421875.000000
438976.000000	456533.000000	474552.000000	493039.000000
512000.000000	531441.000000	551368.000000	571787.000000
592704.000000	614125.000000	636056.000000	658503.000000
681472.000000	704969.000000	729000.000000	753571.000000
778688.000000	804357.000000	830584.000000	857375.000000

在上面的cuda程序中,当我们调用kernel函数的时候发生了什么呢?

cube<<<1, ARRAY_SIZE>>>(d_out, d_in);
这一命令,告诉GPU,分配出1个block,并在每个block中分配96个线程。

换句话说,GPU可以同时运行多个block。每个block最多有1024个线程。

所以,如果我们要使用960个线程。

既可以,cube<<<1,960>>>(d_out,d_in);

也可以,cube<<<10,96>>>(d_out,d_in);

实际工程中,我们处理图像数据像素数为128*128.

那么我们的核函数可以为cube<<<128,128>>>(d_out,d_in);

或者cube<<<dim3(8,8,1), dim3(16,16,1)>>>(d_out,d_in);

cuda支持三维表示的block,和三维表示的thread数量。

cube<<<dim3(1,1,1),dim3(96,1,1)>>>(d_out,d_in);等价于cube<<<1,96>>>(d_out,d_in);。

核函数还有更加高级的表示。

cube<<<dim3(block_x,block_y,block_z),dim3(thread_x,thread_y,thread_z), shared_memory_in_bytes>>>(d_out,d_in);

shared_memory_in_bytes 表示每个block可使用的动态共享内存字节数,其默认值为 0(即默认参数 shmem=0)。

threadIdx: 当前线程在其block内的索引。有threadIdx.x,threadIdx.y, threadIdx.z属性。

blockDim:size of a block。

blockIdx:当前block在grid中的索引(block index within grid)。  blockIdx.x blockIdx.y blockIdx.z

gridDim:size of grid



MAP(elements, function)

elements - set of elements to process

function - function to run on each elements

啊哈,与python的一模一样。

Spark的思想也许来源于此吧。

GPU适合map操作的原因有二:

1.GPU has many parallel processors

2.GPU optimize for throughput

在CUDA中,图片中的像素由以下数据结构表示:

// Layout of uchar4, CUDA's built-in 4-component byte vector type, shown
// here as it is used to represent one RGBA pixel (one unsigned char per
// channel). NOTE(review): uchar4 is predefined by the CUDA headers — this
// listing is illustrative, not a definition you add yourself.
struct uchar4
{
    // red channel
    unsigned char x;
    // green channel
    unsigned char y;
    // blue channel
    unsigned char z;
    // transparency (alpha) channel
    unsigned char w;
};

rgba2grey.cu
#include <stdio.h>


// Kernel: convert an RGBA image (flat array of numRows*numCols uchar4
// pixels) to greyscale using the BT.601 luma weights
// (0.299 R + 0.587 G + 0.114 B).
// Expects a 2D launch. The launcher in this file over-provisions the grid,
// so each thread must bounds-check its flattened index before touching
// memory.
__global__ void rgba_to_greyscale(const uchar4* const rgbaImage,
                       unsigned char* const greyImage,
                       int numRows, int numCols)
{
  int index_x = blockIdx.x * blockDim.x + threadIdx.x;
  int index_y = blockIdx.y * blockDim.y + threadIdx.y;
  // Flatten the 2D thread coordinate into a unique per-thread index.
  // grid_width comes from the grid, not numCols, because the launcher maps
  // the grid's x dimension over rows; the mapping is still a bijection
  // from threads onto 0..(total threads - 1).
  int grid_width = gridDim.x * blockDim.x;
  int idx = index_y * grid_width + index_x;

  // Guard: more threads are launched than there are pixels. Without this
  // check the surplus threads read and write out of bounds.
  if (idx >= numRows * numCols) return;

  uchar4 rgba = rgbaImage[idx];
  greyImage[idx] = .299f * rgba.x + .587f * rgba.y + .114f * rgba.z;
}

// Host wrapper: choose the launch geometry and run the greyscale kernel.
// A single block of numRows*numCols threads would be WRONG — a block holds
// at most 1024 threads — so the image is tiled with 32x32 blocks.
void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
                            unsigned char* const d_greyImage, size_t numRows, size_t numCols)
{
  const int m = 32;                 // block edge: 32*32 = 1024 threads, the per-block maximum
  const dim3 blockSize(m, m, 1);
  // Ceiling division: just enough blocks to cover every pixel. The previous
  // form (numRows / m + 1) always added a spare block row/column, even when
  // the image size was an exact multiple of m.
  const dim3 gridSize((numRows + m - 1) / m, (numCols + m - 1) / m, 1);
  rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);

  // Wait for the kernel, then surface launch and execution errors.
  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
}


add a good picture to show threadidx in cuda.


add a good markdown to understand indexing in cuda threads.

https://github.com/andreajeka/CUDAThreadIndexing