CUDA笔记（二）内存操作

// One interleaved record: the four members of an element sit next to each
// other in memory (array-of-structures layout).
typedef struct
{
    int a;
    int b;
    int c;
    int d;
} MY_TYPE_T;

// Record type used by the interleaved add functions, which index an
// INTERLEAVED_T pointer and access the members a, b, c and d of each element.
// (The previous declaration had the typedef operands reversed, which named an
// undeclared type and redeclared MY_TYPE_T.)
typedef MY_TYPE_T INTERLEAVED_T;

// Storage type for a complete interleaved data set of 1024 records.
typedef INTERLEAVED_T INTERLEAVED_ARRAY_T[1024];

// One member column of the non-interleaved layout: 1024 consecutive ints.
typedef int ARRAY_T[1024];

// Non-interleaved (structure-of-arrays) layout: every member has its own
// contiguous array, so element i is spread over a[i], b[i], c[i] and d[i].
typedef struct
{
    ARRAY_T a, b, c, d;
} NON_INTERLEAVED_T;

// Host reference for the non-interleaved (structure-of-arrays) layout.
//
// For every index in [0, num_elements) the source members a, b, c and d are
// added to the matching destination members, and that addition is repeated
// iter times.
//
//   host_dest_ptr  accumulator, updated in place
//   host_src_ptr   values to add, never written
//   iter           number of repeated additions per element
//   num_elements   number of elements to process; it indexes the member
//                  arrays directly, so it must not exceed their length
//
// num_elements previously had no type specifier ("const num_elements");
// implicit int is not valid C++/CUDA, so the type is now spelled out.
__host__ void add_test_non_interleaved_cpu(NON_INTERLEAVED_T * const host_dest_ptr,
                                           const NON_INTERLEAVED_T * const host_src_ptr,
                                           const int iter,
                                           const int num_elements)
{
    for(int index = 0; index < num_elements; index++)
    {
        for(int i = 0; i < iter; i++)
        {
            host_dest_ptr->a[index] += host_src_ptr->a[index];
            host_dest_ptr->b[index] += host_src_ptr->b[index];
            host_dest_ptr->c[index] += host_src_ptr->c[index];
            host_dest_ptr->d[index] += host_src_ptr->d[index];
        }
    }
}

// Host reference for the interleaved (array-of-structures) layout.
//
// For every index in [0, num_elements) the members a, b, c and d of the source
// record are added to the same members of the destination record, and that
// addition is repeated iter times.
//
//   host_dest_ptr  first destination record, updated in place; records
//                  [0, num_elements) are accessed
//   host_src_ptr   first source record, never written
//   iter           number of repeated additions per record
//   num_elements   number of records to process
//
// num_elements previously had no type specifier ("const num_elements");
// implicit int is not valid C++/CUDA, so the type is now spelled out.
__host__ void add_test_interleaved_cpu(INTERLEAVED_T * const host_dest_ptr,
                                       const INTERLEAVED_T * const host_src_ptr,
                                       const int iter,
                                       const int num_elements)
{
    for(int index = 0; index < num_elements; index++)
    {
        for(int i = 0; i < iter; i++)
        {
            host_dest_ptr[index].a += host_src_ptr[index].a;
            host_dest_ptr[index].b += host_src_ptr[index].b;
            host_dest_ptr[index].c += host_src_ptr[index].c;
            host_dest_ptr[index].d += host_src_ptr[index].d;
        }
    }
}

这两个加和函数明显类似，每个函数都对列表中的所有元素迭代iter次，从源数据结构中读取一个值，然后加和到目标数据结构中。利用CPU系统时间统计这两个函数分别运行的时间可以发现
“非交错内存访问方式的执行时间比交错访问方式的时间多出3~4倍。”
这是意料之中的，因为在交错访问的例子中，元素a、b、c和d位于同一个缓存行中，CPU访问元素a的同时会将结构体中的元素b、c和d一并读入缓存。然而非交错版本则需要对4个独立的物理内存区域进行访问，也就是说存储事务的数目为交错版本的4倍，并且CPU使用的预读策略不会起作用。
我们再看一下GPU版本的代码：

// Device version of the non-interleaved (structure-of-arrays) add.
//
// Each thread derives one global element index from its block and thread
// indices (x dimension only) and, if that index is below num_elements, adds
// the source members a, b, c and d at that index to the destination members,
// repeating the addition iter times.
//
//   gpu_dest_ptr   accumulator, updated in place
//   gpu_src_ptr    values to add, never written
//   iter           number of repeated additions per element
//   num_elements   number of valid elements; threads beyond it do nothing
__global__ void add_non_test_interleaved_kernel(NON_INTERLEAVED_T * const gpu_dest_ptr, const NON_INTERLEAVED_T * const gpu_src_ptr, const int iter, const int num_elements)
{
    const int element = threadIdx.x + (blockDim.x * blockIdx.x);

    // Surplus threads past the end of the data have no element to process.
    if(element >= num_elements)
    {
        return;
    }

    // Count down instead of up; the body still runs iter times (none if iter <= 0).
    for(int remaining = iter; remaining > 0; --remaining)
    {
        gpu_dest_ptr->a[element] += gpu_src_ptr->a[element];
        gpu_dest_ptr->b[element] += gpu_src_ptr->b[element];
        gpu_dest_ptr->c[element] += gpu_src_ptr->c[element];
        gpu_dest_ptr->d[element] += gpu_src_ptr->d[element];
    }
}

// Device version of the interleaved (array-of-structures) add.
//
// Each thread derives one global record index from its block and thread
// indices (x dimension only) and, if that index is below num_elements, adds
// the members a, b, c and d of the source record to the destination record,
// repeating the addition iter times. Because every thread indexes a whole
// record, the same member accessed by neighbouring threads is one record
// apart in memory rather than adjacent.
//
//   gpu_dest_ptr   first destination record, updated in place
//   gpu_src_ptr    first source record, never written
//   iter           number of repeated additions per record
//   num_elements   number of valid records; threads beyond it do nothing
//
// num_elements previously had no type specifier ("const num_elements");
// implicit int is not valid C++/CUDA, so the type is now spelled out.
__global__ void add_test_interleaved_kernel(INTERLEAVED_T * const gpu_dest_ptr,
                                            const INTERLEAVED_T * const gpu_src_ptr,
                                            const int iter,
                                            const int num_elements)
{
    const int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    if(tid < num_elements)
    {
        for(int i = 0; i < iter; i++)
        {
            gpu_dest_ptr[tid].a += gpu_src_ptr[tid].a;
            gpu_dest_ptr[tid].b += gpu_src_ptr[tid].b;
            gpu_dest_ptr[tid].c += gpu_src_ptr[tid].c;
            gpu_dest_ptr[tid].d += gpu_src_ptr[tid].d;
        }
    }
}

   这两个函数与CPU版本的类似，不过在GPU上，每个线程迭代iter次，只计算一个元素。利用GPU系统时间分别统计这两个函数的运行时间，可以发现与CPU版本不同，在GPU上
    “交错内存访问方式的执行时间比非交错内存访问方式的时间多出3~4倍。”

因为在GPU上，相比于交错的访问方式，非交错访问使我们得到了4个合并的访问（所有线程访问连续且对齐的内存块），使全局内存带宽保持最优。因此，在使用GPU全局内存时，我们要注意采用连续、合并的内存访问方式，从而使全局内存带宽得到最优利用。