// Unified Memory example: a single pointer usable from both CPU and GPU.
// (In the original extraction the allocation call had been swallowed into the
// comment on the first line, leaving it commented out.)
int *a;

cudaMallocManaged(&a, size); // Note the address of `a` is passed as first argument.

// Use `a` on the CPU and/or on any GPU in the accelerated system.

cudaFree(a);
Manual Memory Allocation and Copying
cudaMalloc 为 GPU 分配内存
防止 GPU 分页错误
cudaMallocHost 为 CPU 分配内存
钉固内存或锁页内存
cudaFreeHost 命令释放钉固内存
cudaMemcpy 命令可在主机与设备之间拷贝(而非传输)内存
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
// Manual (non-managed) allocation and explicit copy. The original extraction
// had swallowed the allocation and copy calls into trailing `//` comments.
int *host_a, *device_a;        // Define host-specific and device-specific arrays.

cudaMalloc(&device_a, size);   // `device_a` is immediately available on the GPU.
cudaMallocHost(&host_a, size); // `host_a` is immediately available on CPU, and is page-locked, or pinned.

initializeOnHost(host_a, N);   // No CPU page faulting since memory is already allocated on the host.

// `cudaMemcpy` takes the destination, source, size, and a CUDA-provided variable
// for the direction of the copy.
cudaMemcpy(device_a, host_a, size, cudaMemcpyHostToDevice);
// Overlap copy and compute by slicing the work into segments, each with its own
// non-default stream. Fixes vs. the original extraction: `constint` -> `const int`,
// `segmentOffset` is now declared, and statements that had been swallowed into
// trailing `//` comments are restored as real code.
cudaMallocHost(&host_array, size); // Pinned host memory allocation.
cudaMalloc(&device_array, size);   // Allocation directly on the active GPU device.

initializeData(host_array, N);     // Assume this application needs to initialize on the host.

const int numberOfSegments = 4;               // This example demonstrates slicing the work into 4 segments.
int segmentN = N / numberOfSegments;          // A value for a segment's worth of `N` is needed.
size_t segmentSize = size / numberOfSegments; // A value for a segment's worth of `size` is needed.

// For each of the 4 segments...
for (int i = 0; i < numberOfSegments; ++i)
{
    // Calculate the index where this particular segment should operate within the larger arrays.
    int segmentOffset = i * segmentN; // Declared here; the original used it without a declaration.

    // Create a stream for this segment's worth of copy and work.
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Asynchronously copy segment's worth of pinned host memory to device over non-default stream.
    cudaMemcpyAsync(&device_array[segmentOffset], // Take care to access correct location in array.
                    &host_array[segmentOffset],   // Take care to access correct location in array.
                    segmentSize,                  // Only copy a segment's worth of memory.
                    cudaMemcpyHostToDevice,
                    stream);                      // Provide optional argument for non-default stream.

    // Execute segment's worth of work over same non-default stream as memory copy.
    kernel<<<number_of_blocks, threads_per_block, 0, stream>>>(&device_array[segmentOffset], segmentN);

    // `cudaStreamDestroy` will return immediately (is non-blocking), but will not
    // actually destroy the stream until all stream operations are complete.
    cudaStreamDestroy(stream);
}
if (idx < N) // Check to make sure `idx` maps to some value within `N` { // Only do work if it does } }
Data Sets Larger than the Grid
网格跨度循环
1 2 3 4 5 6 7 8 9 10
// Grid-stride loop: each thread processes indexWithinTheGrid, +stride, +2*stride, ...
// so correctness does not depend on the grid being large enough to cover all of `N`.
// Fixes vs. the original: `__global` -> `__global__`, `voidkernel` -> `void kernel`.
__global__ void kernel(int *a, int N)
{
    int indexWithinTheGrid = threadIdx.x + blockIdx.x * blockDim.x;
    int gridStride = gridDim.x * blockDim.x;

    for (int i = indexWithinTheGrid; i < N; i += gridStride)
    {
        // do work on a[i];
    }
}
Error Handling
1 2 3 4 5 6 7
// Synchronous error checking: every CUDA runtime call returns a `cudaError_t`.
// Fixes vs. the original extraction: the missing semicolon after the
// `cudaMallocManaged` call, and the `if` that had been swallowed into a comment.
cudaError_t err;
err = cudaMallocManaged(&a, N); // Assume the existence of `a` and `N`.

if (err != cudaSuccess) // `cudaSuccess` is provided by CUDA.
{
    printf("Error: %s\n", cudaGetErrorString(err)); // `cudaGetErrorString` is provided by CUDA.
}
1 2 3 4 5 6 7 8 9 10 11 12 13
/*
 * This launch should cause an error, but the kernel itself
 * cannot return it: launch-configuration errors surface via
 * `cudaGetLastError`, not via the launch statement.
 */

someKernel<<<1, -1>>>(); // -1 is not a valid number of threads.

cudaError_t err;
err = cudaGetLastError(); // `cudaGetLastError` will return the error from above.
if (err != cudaSuccess)
{
    printf("Error: %s\n", cudaGetErrorString(err));
}
// NOTE(review): the original fragment read `props` without ever declaring or
// filling it — the query lines below are restored from the standard pattern;
// confirm against the full document.
int deviceId;
cudaGetDevice(&deviceId); // The ID of the currently active GPU device.

cudaDeviceProp props;
cudaGetDeviceProperties(&props, deviceId);

/*
 * `props` now contains several properties about the current device.
 */

int computeCapabilityMajor = props.major;
int computeCapabilityMinor = props.minor;
int multiProcessorCount = props.multiProcessorCount;
int warpSize = props.warpSize;
Asynchronous Memory Prefetching
异步内存预取,来减少页错误和按需内存迁移成本
1 2 3 4 5 6
// Asynchronous prefetching of Unified Memory reduces page faults and on-demand
// migration cost. (The second prefetch call had been swallowed into a `//`
// comment in the original extraction.)
int deviceId;
cudaGetDevice(&deviceId); // The ID of the currently active GPU device.

cudaMemPrefetchAsync(pointerToSomeUMData, size, deviceId);        // Prefetch to GPU device.
cudaMemPrefetchAsync(pointerToSomeUMData, size, cudaCpuDeviceId); // Prefetch to host. `cudaCpuDeviceId` is a
                                                                  // built-in CUDA variable.
Creating, Utilizing, and Destroying Non-Default CUDA Streams
CUDA 流行为
给定流中的操作会按序执行
就不同非默认流中的操作而言,无法保证其会按彼此之间的任何特定顺序执行
默认流较为特殊:默认流中的操作会阻碍其他流的运行,直至其自身已运行完毕
1 2 3 4 5 6
// Creating, using, and destroying a non-default CUDA stream. Fixes vs. the
// original: the `cudaStreamCreate` call had been swallowed into a comment, and
// the comments referred to nonexistent APIs (`cudaCreateStream`/`cudaDestroyStream`).
cudaStream_t stream;       // CUDA streams are of type `cudaStream_t`.
cudaStreamCreate(&stream); // Note that a pointer must be passed to `cudaStreamCreate`.

someKernel<<<number_of_blocks, threads_per_block, 0, stream>>>(); // `stream` is passed as the 4th
                                                                  // execution-configuration argument.

cudaStreamDestroy(stream); // Note that a value, not a pointer, is passed to `cudaStreamDestroy`.