ASC20 & NVIDIA DLI CUDA C/C++ Fundamentals Training Notes

Day 2: NVIDIA DLI Fundamentals of Accelerated Computing with CUDA C/C++

System Management Interface

Command to query information about this GPU:

nvidia-smi

Writing Application Code for the GPU

#include <stdio.h>

void CPUFunction()
{
  printf("This function is defined to run on the CPU.\n");
}

__global__ void GPUFunction()
{
  printf("This function is defined to run on the GPU.\n");
}

int main()
{
  CPUFunction();

  GPUFunction<<<1, 1>>>();
  cudaDeviceSynchronize();
}
  • __global__ void GPUFunction()
    • Functions defined with the __global__ keyword must return void
  • GPUFunction<<<1, 1>>>()
    • The <<< ... >>> syntax provides the execution configuration
    • The execution configuration specifies the kernel's thread hierarchy (see the sketch after this list)
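A minimal sketch (the launch sizes 2 and 4 are arbitrary assumptions, not from the course) of an execution configuration that describes more than one block and thread:

#include <stdio.h>

__global__ void GPUFunction()
{
  // Each thread reports its own position in the launch hierarchy.
  printf("Hello from block %d, thread %d\n", blockIdx.x, threadIdx.x);
}

int main()
{
  GPUFunction<<<2, 4>>>(); // Execution configuration: 2 blocks of 4 threads each, 8 threads total.
  cudaDeviceSynchronize(); // Block the host until all launched GPU work has completed.
}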

Compiling and Running Accelerated CUDA Code

nvcc -arch=sm_70 -o hello-gpu 01-hello/01-hello-gpu.cu -run

Block Dimensions

The idiomatic expression threadIdx.x + blockIdx.x * blockDim.x gives each thread a unique index across the entire grid (see the sketch below).
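
A minimal sketch (the kernel name initWith and its parameters are assumptions) of a kernel in which each thread uses this expression to handle exactly one array element:

__global__ void initWith(int *a, int N)
{
  // Combine the thread's position inside its block with the block's position
  // inside the grid to obtain a globally unique index.
  int i = threadIdx.x + blockIdx.x * blockDim.x;

  if (i < N) // Guard against indices beyond the end of the array.
  {
    a[i] = i;
  }
}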

Allocating Memory

// CPU-only

int N = 2<<20;
size_t size = N * sizeof(int);

int *a;
a = (int *)malloc(size);

// Use `a` in CPU-only program.

free(a);

// Accelerated

int N = 2<<20;
size_t size = N * sizeof(int);

int *a;
// Note the address of `a` is passed as first argument.
cudaMallocManaged(&a, size);

// Use `a` on the CPU and/or on any GPU in the accelerated system.

cudaFree(a);

Manual Memory Allocation and Copying

  • cudaMalloc allocates memory directly on the GPU
    • Avoids GPU page faults
  • cudaMallocHost allocates memory on the CPU
    • The memory is pinned, i.e. page-locked
    • Pinned memory is freed with cudaFreeHost
  • cudaMemcpy copies (rather than migrates) memory in either direction between host and device
int *host_a, *device_a;        // Define host-specific and device-specific arrays.
cudaMalloc(&device_a, size); // `device_a` is immediately available on the GPU.
cudaMallocHost(&host_a, size); // `host_a` is immediately available on CPU, and is page-locked, or pinned.

initializeOnHost(host_a, N); // No CPU page faulting since memory is already allocated on the host.

// `cudaMemcpy` takes the destination, source, size, and a CUDA-provided variable for the direction of the copy.
cudaMemcpy(device_a, host_a, size, cudaMemcpyHostToDevice);

kernel<<<blocks, threads, 0, someStream>>>(device_a, N);

// `cudaMemcpy` can also copy data from device to host.
cudaMemcpy(host_a, device_a, size, cudaMemcpyDeviceToHost);

verifyOnHost(host_a, N);

cudaFree(device_a);
cudaFreeHost(host_a); // Free pinned memory like this.

Using Streams to Overlap Data Transfers and Code Execution

  • cudaMemcpyAsync
    • Asynchronous only with respect to the host
    • By default it runs in the default stream, so it is a blocking operation with respect to other CUDA operations executing on the GPU
    • Passing a non-default stream as the optional 5th argument lets the copy execute concurrently with other CUDA operations
int N = 2<<24;
int size = N * sizeof(int);

int *host_array;
int *device_array;

cudaMallocHost(&host_array, size); // Pinned host memory allocation.
cudaMalloc(&device_array, size); // Allocation directly on the active GPU device.

initializeData(host_array, N); // Assume this application needs to initialize on the host.

const int numberOfSegments = 4; // This example demonstrates slicing the work into 4 segments.
int segmentN = N / numberOfSegments; // A value for a segment's worth of `N` is needed.
size_t segmentSize = size / numberOfSegments; // A value for a segment's worth of `size` is needed.

// For each of the 4 segments...
for (int i = 0; i < numberOfSegments; ++i)
{
  // Calculate the index where this particular segment should operate within the larger arrays.
  int segmentOffset = i * segmentN;

  // Create a stream for this segment's worth of copy and work.
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Asynchronously copy segment's worth of pinned host memory to device over non-default stream.
  cudaMemcpyAsync(&device_array[segmentOffset], // Take care to access correct location in array.
                  &host_array[segmentOffset],   // Take care to access correct location in array.
                  segmentSize,                  // Only copy a segment's worth of memory.
                  cudaMemcpyHostToDevice,
                  stream);                      // Provide optional argument for non-default stream.

  // Execute segment's worth of work over same non-default stream as memory copy.
  kernel<<<number_of_blocks, threads_per_block, 0, stream>>>(&device_array[segmentOffset], segmentN);

  // `cudaStreamDestroy` will return immediately (is non-blocking), but will not actually destroy stream until
  // all stream operations are complete.
  cudaStreamDestroy(stream);
}

Grid Size Work Amount Mismatch

Make sure that threads do not attempt to access data elements that are out of range:

__global__ void some_kernel(int N)
{
  int idx = threadIdx.x + blockIdx.x * blockDim.x;

  if (idx < N) // Check to make sure `idx` maps to some value within `N`.
  {
    // Only do work if it does.
  }
}

Data Sets Larger than the Grid

Grid-stride loop:

__global__ void kernel(int *a, int N)
{
  int indexWithinTheGrid = threadIdx.x + blockIdx.x * blockDim.x;
  int gridStride = gridDim.x * blockDim.x;

  for (int i = indexWithinTheGrid; i < N; i += gridStride)
  {
    // do work on a[i];
  }
}

Error Handling

cudaError_t err;
err = cudaMallocManaged(&a, N); // Assume the existence of `a` and `N`.

if (err != cudaSuccess) // `cudaSuccess` is provided by CUDA.
{
  printf("Error: %s\n", cudaGetErrorString(err)); // `cudaGetErrorString` is provided by CUDA.
}
/*
 * This launch should cause an error, but the kernel itself
 * cannot return it.
 */

someKernel<<<1, -1>>>(); // -1 is not a valid number of threads.

cudaError_t err;
err = cudaGetLastError(); // `cudaGetLastError` will return the error from above.
if (err != cudaSuccess)
{
  printf("Error: %s\n", cudaGetErrorString(err));
}

CUDA Error Handling Function

Create a macro (implemented below as an inline function) that wraps CUDA function calls and checks their results:

#include <stdio.h>
#include <assert.h>

inline cudaError_t checkCuda(cudaError_t result)
{
  if (result != cudaSuccess) {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
  }
  return result;
}

int main()
{
  /*
   * The macro can be wrapped around any function returning
   * a value of type `cudaError_t`.
   */

  checkCuda( cudaDeviceSynchronize() );
}

Grids and Blocks of 2 and 3 Dimensions

Grids and thread blocks can be defined with up to 3 dimensions. To define a two- or three-dimensional grid or block, use CUDA's dim3 type:

dim3 threads_per_block(16, 16, 1);
dim3 number_of_blocks(16, 16, 1);
someKernel<<<number_of_blocks, threads_per_block>>>();

Inside someKernel, the variables gridDim.x, gridDim.y, blockDim.x, and blockDim.y will all equal 16 (a 2D indexing sketch follows below).
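
A minimal sketch (the kernel parameters and the row-major matrix layout are assumptions) of how someKernel could turn its 2D thread coordinates into a matrix index under this configuration:

__global__ void someKernel(int *matrix, int width, int height)
{
  // Combine block and thread coordinates in each dimension.
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int row = blockIdx.y * blockDim.y + threadIdx.y;

  if (row < height && col < width) // Guard against out-of-range coordinates.
  {
    matrix[row * width + col] = row * width + col; // Row-major element index.
  }
}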

Profile an Application with nvprof

nvcc -arch=sm_70 -o single-thread-vector-add 01-vector-add/01-vector-add.cu -run
nvprof ./single-thread-vector-add

Querying GPU Device Properties

int deviceId;
cudaGetDevice(&deviceId);

cudaDeviceProp props;
cudaGetDeviceProperties(&props, deviceId);

/*
* `props` now contains several properties about the current device.
*/

int computeCapabilityMajor = props.major;
int computeCapabilityMinor = props.minor;
int multiProcessorCount = props.multiProcessorCount;
int warpSize = props.warpSize;

Asynchronous Memory Prefetching

Use asynchronous memory prefetching to reduce the cost of page faults and on-demand memory migration:

int deviceId;
cudaGetDevice(&deviceId); // The ID of the currently active GPU device.

cudaMemPrefetchAsync(pointerToSomeUMData, size, deviceId); // Prefetch to GPU device.
cudaMemPrefetchAsync(pointerToSomeUMData, size, cudaCpuDeviceId); // Prefetch to host. `cudaCpuDeviceId` is a
// built-in CUDA variable.

Creating, Utilizing, and Destroying Non-Default CUDA Streams

  • CUDA stream behavior
    • Operations within a given stream execute in order
    • There is no guarantee about the order in which operations in different non-default streams execute relative to one another
    • The default stream blocks other streams from running until it has itself finished executing
cudaStream_t stream;       // CUDA streams are of type `cudaStream_t`.
cudaStreamCreate(&stream); // Note that a pointer must be passed to `cudaStreamCreate`.

someKernel<<<number_of_blocks, threads_per_block, 0, stream>>>(); // `stream` is passed as 4th EC argument.

cudaStreamDestroy(stream); // Note that a value, not a pointer, is passed to `cudaStreamDestroy`.

GPU Programming Model & Hardware Mapping

Model            Hardware
Thread           SP, aka CUDA Core
Block            SM / SMX
Grid             GPU, aka Device
Global Memory    DRAM
Shared Memory    On-chip memory inside the SM

Note: a thread is not an actual hardware execution unit.

Warp

  • Each warp consists of 32 threads
  • Threads in a warp share instructions
  • Warps are dynamically scheduled by the SM
  • A warp is executed when its operands are ready
  • Warps never appear when writing a program, i.e. they are transparent to the programmer

Potential performance loss: warp divergence (see the sketch below)
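
A minimal sketch (the kernel name divergentKernel and the branch condition are assumptions) of code that triggers warp divergence: threads within the same 32-thread warp take different branches, so the hardware executes the two paths one after the other instead of in lockstep:

__global__ void divergentKernel(int *out)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;

  // Even- and odd-indexed threads of the same warp follow different branches,
  // so the warp serializes the two paths.
  if (i % 2 == 0)
  {
    out[i] = 2 * i;
  }
  else
  {
    out[i] = 2 * i + 1;
  }
}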