CUDA Programming¶
Enter your name and student ID.
- Name:
- Student ID:
1. CUDA¶
- CUDA is an extension to C++ specific to NVIDIA GPUs
- It is the most basic, native programming model for NVIDIA GPUs
2. Compilers¶
- We use NVIDIA CUDA Toolkit (
nvcc
) for CUDA compilers - LLVM ver. 18.1.8 (
clang
andclang++
) and NVIDA's C/C++ compilers (nvc
andnvc++
) we used for OpenMP also support CUDA, but they fail to compile some of our code, so we stick to more traditionalnvcc
2-1. Set up NVIDIA CUDA and HPC SDK¶
Execute this before you use NVIDIA HPC SDK
export PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/24.9/compilers/bin:$PATH
export PATH=/usr/local/cuda/bin:$PATH
- Check if it works
- make sure the full path of nvcc is shown as
/usr/local/...
, not/opt/nvidia/...
- make sure the full path of nvcc is shown as
- We do not recommend nvc/nvc++ for this exercise, but you might give them a try if you like
which nvcc
which nvc
which nvc++
nvcc --version
nvc --version
2-2. LLVM¶
- We do not recommend it for this exercise, but you might give them a try if you like
- Execute this before you use LLVM
export PATH=/home/share/llvm/bin:$PATH
export LD_LIBRARY_PATH=/home/share/llvm/lib:/home/share/llvm/lib/x86_64-unknown-linux-gnu:$LD_LIBRARY_PATH
- Check if it works (check if full paths of clang/clang++ are shown)
which clang
which clang++
clang --version
3. Check host and GPU¶
- First check if you are using the right host, tauleg000, not taulec
hostname
hostname | grep tauleg || echo "Oh, you are not on the right host, access https://tauleg.zapto.org/ instead"
- Check if GPU is alive by nvidia-smi
- Do
nvidia-smi --help
or see manual (man nvidia-smi
on terminal) for more info
nvidia-smi
%%writefile cuda_hello.cu
#include <assert.h>
#include <stdio.h>
__global__ void cuda_thread_fun(int n) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
int nthreads = gridDim.x * blockDim.x;
if (i < n) {
printf("hello I am CUDA thread %d out of %d\n", i, nthreads);
}
}
int main(int argc, char ** argv) {
int n = (argc > 1 ? atoi(argv[1]) : 100);
int thread_block_sz = (argc > 2 ? atoi(argv[2]) : 64);
int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;
printf("%d threads/block * %d blocks\n", thread_block_sz, n_thread_blocks);
// launch a kernel
cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(n);
// wait for them to complete
cudaDeviceSynchronize();
return 0;
}
nvcc -o cuda_hello cuda_hello.cu
./cuda_hello
- You should see 100 lines of "hello I am CUDA thread ??? out of 128"
- Alternatively, you can have a source file with an ordinary C++ extension
.cc
(or.cpp
) and give-x cu
. - It is useful when you want to have a single source file for OpenMP and CUDA programs
ln -sf cuda_hello.cu cuda_hello.cc
nvcc -o cuda_hello -x cu cuda_hello.cc
./cuda_hello
4-2. With nvc++ (NVIDIA HPC SDK C++ compiler)¶
- Just to demonstrate
nvc++
supports CUDA, too
nvc++ -Wall -o cuda_hello cuda_hello.cu
nvc++ -Wall -o cuda_hello -x cu cuda_hello.cc
4-3. With clang++ (LLVM)¶
- Just to demonstrate
clang++
supports CUDA, too
clang++ -Wall -o cuda_hello cuda_hello.cu -L/usr/local/cuda/lib64 -lcudart
ln -sf cuda_hello.cu cuda_hello.cc
clang++ -Wall -o cuda_hello -x cu cuda_hello.cc -L/usr/local/cuda/lib64 -lcudart
5. CUDA kernel¶
- The most basic concept of CUDA programming is a CUDA kernel
- Syntactically, a CUDA kernel is a
void
function with__global__
keyword attached to it
__global__ void cuda_thread_fun(int n) { ... }
- A CUDA kernel describes what a single CUDA thread does
- You launch a number of CUDA threads all executing the same kernel by
kernel_func<<<num_of_blocks,num_of_threads_per_block>>>(...);
- We have already seen this in the above code
- See 2. Programming model section for reference
5-1. You'd better always check errors¶
It's not only CUDA programming in which you are strongly advised to check errors after each operation that could potentially go wrong
Just like many C programming APIs (unlike Python scripting, for example), calling CUDA APIs and launching CUDA kernels silently return if something went wrong
You could save a huge amount of time by checking errors
- every time you launch a CUDA kernel and
- every time you call a CUDA API
Here is the same piece of code with checking errors
%%writefile cuda_hello_chk.cu
#include <assert.h>
#include <stdio.h>
/*
you'd better spend time on making sure you always check errors ...
*/
void check_api_error_(cudaError_t e,
const char * msg, const char * file, int line) {
if (e) {
fprintf(stderr, "%s:%d:error: %s %s\n",
file, line, msg, cudaGetErrorString(e));
exit(1);
}
}
#define check_api_error(e) check_api_error_(e, #e, __FILE__, __LINE__)
void check_launch_error_(const char * msg, const char * file, int line) {
cudaError_t e = cudaGetLastError();
if (e) {
fprintf(stderr, "%s:%d:error: %s %s\n",
file, line, msg, cudaGetErrorString(e));
exit(1);
}
}
#define check_launch_error(exp) do { exp; check_launch_error_(#exp, __FILE__, __LINE__); } while (0)
__global__ void cuda_thread_fun(int n) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
int nthreads = gridDim.x * blockDim.x;
if (i < n) {
printf("hello I am CUDA thread %d out of %d\n", i, nthreads);
}
}
int main(int argc, char ** argv) {
int n = (argc > 1 ? atoi(argv[1]) : 100);
int thread_block_sz = (argc > 2 ? atoi(argv[2]) : 64);
int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;
printf("%d threads/block * %d blocks\n", thread_block_sz, n_thread_blocks);
check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(n)));
check_api_error(cudaDeviceSynchronize());
return 0;
}
nvcc -o cuda_hello_chk cuda_hello_chk.cu
# nvc++ -Wall -o cuda_hello_chk cuda_hello_chk.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_hello_chk cuda_hello_chk.cu -L/usr/local/cuda/lib64 -lcudart
./cuda_hello_chk
- I factored out the error-checking code into a header file
"cuda_util.h"
and included it in the directory (check it from the left menu) - The following code is a more concise version using the header file
%%writefile cuda_hello_hdr_chk.cu
#include <assert.h>
#include <stdio.h>
#include "cuda_util.h"
__global__ void cuda_thread_fun(int n) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
int nthreads = gridDim.x * blockDim.x;
if (i < n) {
printf("hello I am CUDA thread %d out of %d\n", i, nthreads);
}
}
int main(int argc, char ** argv) {
int n = (argc > 1 ? atoi(argv[1]) : 100);
int thread_block_sz = (argc > 2 ? atoi(argv[2]) : 64);
int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;
printf("%d threads/block * %d blocks\n", thread_block_sz, n_thread_blocks);
check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(n)));
check_api_error(cudaDeviceSynchronize());
return 0;
}
nvcc -o cuda_hello_hdr_chk cuda_hello_hdr_chk.cu
# nvc++ -Wall -o cuda_hello_hdr_chk cuda_hello_hdr_chk.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_hello_hdr_chk cuda_hello_hdr_chk.cu -L/usr/local/cuda/lib64 -lcudart
./cuda_hello_hdr_chk
6. The number of CUDA threads launched¶
- You specify the number of threads launched by the two parameters in <<<...,...>>>, like
kernel_func<<<num_of_blocks,num_of_threads_per_block>>>(...);
- It will create (num_of_blocks * num_of_threads_per_block) threads in total.
- More precisely, it creates num_of_blocks thread blocks, each of which has num_of_threads_per_block threads.
- It is natural to wonder why you need to specify two parameters instead of just one parameter (the total number of threads) and how to choose num_of_threads_per_block.
- For now, just know that a thread block is the unit of scheduling
- A GPU device fetches a single block at a time and dispatches it to a particular streaming multiprocessor (SM)
- Remember that a single SM is like a CPU core; a single GPU device has a number of SMs just like a single CPU has a number of cores.
Problem 1 : Change the number of threads per block¶
Change the arguments of the following command line in various ways and see what happens
./cuda_hello_hdr_chk 10 3
Answer for trivial work omitted
7. Thread ID¶
7-1. One-dimensional ID¶
- Just like OpenMP, CUDA provides a means for a thread to know its ID as well as the total number of threads launched together
- They are obtained from builtin variables
- Let's say you invoked a kernel with
f<<<12,34>>>(...);
you create 12 thread blocks having 34 threads each (408 threads in total).
gridDim.x
gives the number of thread blocks ($= 12$)blockDim.x
gives the number of threads in a thread block ($= 34$)note: "grid" is the CUDA terminology to mean all the launched thread blocks (a CUDA thread $\in$ thread block $\in$ the entire grid)
blockIdx.x
gives the block ID within the grid ($\in [0,12)$)threadIdx.x
gives the thread ID within a thread block ($\in [0,34)$)If you want to get a single thread ID between 0 to 407 and the total number of threads, you get them by
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int nthreads = gridDim.x * blockDim.x;
You have seen them in the above example.
See 2.1 Kernels for reference
7-2. Two- or three-dimensional ID¶
Each of the above four variables can actually have up to three elements, allowing you to view blocks and threads within a block arranged in an one-, two- or three-dimensional space.
You specify them accordingly when you call a kernel, for which you use a variable of type
dim3
instead of an integer, to specify up to three numbersSee 2.2 Thread Hierarchy for reference
%%writefile cuda_hello_2d.cu
#include <assert.h>
#include <stdio.h>
#include "cuda_util.h"
__global__ void cuda_thread_fun(int n) {
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int nthreads_x = gridDim.x * blockDim.x;
int nthreads_y = gridDim.y * blockDim.y;
int g = x + nthreads_y * y;
if (g < n) {
printf("hello I am CUDA thread (%d,%d) of (%d,%d)\n",
x, y, nthreads_x, nthreads_y);
}
}
int isqrt(int n) {
int i;
for (i = 0; i * i < n; i++) ;
return i;
}
int main(int argc, char ** argv) {
int n = (argc > 1 ? atoi(argv[1]) : 40);
int nx = isqrt(n);
int ny = (n + nx - 1) / nx;
int thread_block_sz_x = (argc > 2 ? atoi(argv[2]) : 2);
int thread_block_sz_y = (argc > 3 ? atoi(argv[3]) : 3);
int n_thread_blocks_x = (nx + thread_block_sz_x - 1) / thread_block_sz_x;
int n_thread_blocks_y = (ny + thread_block_sz_y - 1) / thread_block_sz_y;
printf("(%d * %d) threads/block * (%d * %d) blocks\n",
thread_block_sz_x, thread_block_sz_y,
n_thread_blocks_x, n_thread_blocks_y);
dim3 nb(n_thread_blocks_x, n_thread_blocks_y);
dim3 tpb(thread_block_sz_x, thread_block_sz_y);
check_launch_error((cuda_thread_fun<<<nb,tpb>>>(n)));
check_api_error(cudaDeviceSynchronize());
return 0;
}
nvcc -o cuda_hello_2d cuda_hello_2d.cu
# nvc++ -Wall -o cuda_hello_2d cuda_hello_2d.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_hello_2d cuda_hello_2d.cu -L/usr/local/cuda/lib64 -lcudart
./cuda_hello_2d
Problem 2 : Specify 2D thread blocks and grids¶
- Change the arguments of the following command line in various ways and see what happens
./cuda_hello_2d 40 2 3
Answer for trivial work omitted
8. Passing data between host (CPU) and device (GPU)¶
GPU is a device separate from a host CPU
As such, CPU and GPU do not share memory; you need to explicitly pass data by calling APIs (this is changing but practically remains true for a while)
One simplest way to pass data from a host to device is arguments to a kernel function, but
- it cannot be used for device -> host (recall that kernel functions are always void)
- it is limited to values passed by "call-by-value"; you cannot pass pointers along with values pointed to by them
For anything other than passing arguments by call-by-values, you should use
cudaMalloc
andcudaMemcpy
See 3.2.2. Device Memory for reference
8-1. cudaMalloc¶
void * p;
check_api_error(cudaMalloc(&p, size));
allocates
size
bytes of memory on device andreturns an address valid on the device (not valid on the host) to variable
p
remember that this function should be called on host; no functions are provided in CUDA API for CUDA threads to dynamically allocate memory along the way
8-2. cudaMemcpy¶
- host -> device
check_api_error(cudaMemcpy(p_dev, p_host, size, cudaMemcpyHostToDevice));
- device -> host
check_api_error(cudaMemcpy(p_host, p_dev, size, cudaMemcpyDeviceToHost));
- the first argument is always the destination
- p_dev should be an address on device (i.e., that has been allocated by
cudaMalloc
)
8-3. cudaFree¶
check_api_error(cudaFree(dev_p));
- frees memory allocated by cudaMalloc
8-4. You cannot access malloc-allocated region on the device¶
The following code demonstrates that if the device code accesses a region allocated by malloc (or any other host memory including stacks and global variables), you get a segmentation fault on the device
In practice, you never pass the pointer to malloc-allocated area to a kernel
%%writefile cuda_dev_segfault.cu
#include <assert.h>
#include <stdio.h>
#include "cuda_util.h"
__global__ void cuda_thread_fun(long * p, int n) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < n) {
p[i] = i * i;
}
}
int main(int argc, char ** argv) {
int n = (argc > 1 ? atoi(argv[1]) : 10);
int thread_block_sz = (argc > 2 ? atoi(argv[2]) : 3);
int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;
long * c = (long *)malloc(sizeof(long) * n);
check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(c, n)));
check_api_error(cudaDeviceSynchronize());
for (int i = 0; i < n; i++) {
printf("c[%d] = %ld\n", i, c[i]);
}
free(c);
return 0;
}
nvcc -o cuda_dev_segfault cuda_dev_segfault.cu
# nvc++ -Wall -o cuda_dev_segfault cuda_dev_segfault.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_dev_segfault cuda_dev_segfault.cu -L/usr/local/cuda/lib64 -lcudart
./cuda_dev_segfault
8-5. You cannot access cudaMalloc-allocated region on the host¶
- The following code demonstrates the oppssite; if the host code accesses a region allocated by
cudaMalloc
, you get a segmentation fault on the host
%%writefile cuda_host_segfault.cu
#include <assert.h>
#include <stdio.h>
#include "cuda_util.h"
__global__ void cuda_thread_fun(long * p, int n) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < n) {
p[i] = i * i;
}
}
int main(int argc, char ** argv) {
int n = (argc > 1 ? atoi(argv[1]) : 10);
int thread_block_sz = (argc > 2 ? atoi(argv[2]) : 3);
int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;
long * c_dev;
check_api_error(cudaMalloc(&c_dev, sizeof(long) * n));
check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(c_dev, n)));
check_api_error(cudaDeviceSynchronize());
for (int i = 0; i < n; i++) {
printf("c[%d] = %ld\n", i, c_dev[i]);
}
check_api_error(cudaFree(c_dev));
return 0;
}
nvcc -o cuda_host_segfault cuda_host_segfault.cu
# nvc++ -Wall -o cuda_host_segfault cuda_host_segfault.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_host_segfault cuda_host_segfault.cu -L/usr/local/cuda/lib64 -lcudart
./cuda_host_segfault
- So, to get a result computed on the device back to the host, call
cudaMemcpy
Problem 3 : Getting the result back from the device¶
- Add an appropriate call to
cudaMemcpy
to the following code, so it correctly prints the values computed on the device (i.e., c[i] = i)
%%writefile cuda_dev_to_host.cu
#include <assert.h>
#include <stdio.h>
#include "cuda_util.h"
__global__ void cuda_thread_fun(long * p, int n) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < n) {
p[i] = i * i;
}
}
int main(int argc, char ** argv) {
int n = (argc > 1 ? atoi(argv[1]) : 10);
int thread_block_sz = (argc > 2 ? atoi(argv[2]) : 3);
int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;
long * c = (long *)malloc(sizeof(long) * n);
long * c_dev;
check_api_error(cudaMalloc(&c_dev, sizeof(long) * n));
check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(c_dev, n)));
check_api_error(cudaDeviceSynchronize());
for (int i = 0; i < n; i++) {
printf("c[%d] = %ld\n", i, c[i]);
}
free(c);
check_api_error(cudaFree(c_dev));
return 0;
}
nvcc -o cuda_dev_to_host cuda_dev_to_host.cu
# nvc++ -Wall -o cuda_dev_to_host cuda_dev_to_host.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_dev_to_host cuda_dev_to_host.cu -L/usr/local/cuda/lib64 -lcudart
./cuda_dev_to_host
%%writefile cuda_dev_to_host_ans.cu
#include <assert.h>
#include <stdio.h>
#include "cuda_util.h"
__global__ void cuda_thread_fun(long * p, int n) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < n) {
p[i] = i * i;
}
}
int main(int argc, char ** argv) {
int n = (argc > 1 ? atoi(argv[1]) : 10);
int thread_block_sz = (argc > 2 ? atoi(argv[2]) : 3);
int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;
long * c = (long *)malloc(sizeof(long) * n);
long * c_dev;
check_api_error(cudaMalloc(&c_dev, sizeof(long) * n));
check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(c_dev, n)));
check_api_error(cudaDeviceSynchronize());
check_api_error(cudaMemcpy(c, c_dev, sizeof(long) * n, cudaMemcpyDeviceToHost));
for (int i = 0; i < n; i++) {
printf("c[%d] = %ld\n", i, c[i]);
}
free(c);
check_api_error(cudaFree(c_dev));
return 0;
}
nvcc -o cuda_dev_to_host_ans cuda_dev_to_host_ans.cu
# nvc++ -Wall -o cuda_dev_to_host_ans cuda_dev_to_host_ans.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_dev_to_host_ans cuda_dev_to_host_ans.cu -L/usr/local/cuda/lib64 -lcudart
./cuda_dev_to_host_ans
9. Unified Memory¶
- Unified Memory is a part of memory accessible both by the host and device
- Values written to it by the host are automatically visible to the device and vice versa
- Therefore with unified memory you do not have to call
cudaMemcpy
to move data between host and GPU - All you need to master is
cudaMallocManaged
, which you call in place ofcudaMalloc
- You get a pointer that is valid both on CPU and GPU
Problem 4 : Use Unified Memory¶
- Change the following program so that it uses
cudaMallocManaged
instead ofmalloc
- Make other changes as you find them necessary
%%writefile cuda_malloc_managed.cu
#include <assert.h>
#include <stdio.h>
#include "cuda_util.h"
__global__ void cuda_thread_fun(long * p, int n) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < n) {
p[i] = i * i;
}
}
int main(int argc, char ** argv) {
int n = (argc > 1 ? atoi(argv[1]) : 10);
int thread_block_sz = (argc > 2 ? atoi(argv[2]) : 3);
int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;
long * c = (long *)malloc(sizeof(long) * n);
check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(c, n)));
check_api_error(cudaDeviceSynchronize());
for (int i = 0; i < n; i++) {
printf("c[%d] = %ld\n", i, c[i]);
}
free(c);
return 0;
}
nvcc -o cuda_malloc_managed cuda_malloc_managed.cu
# nvc++ -Wall -o cuda_malloc_managed cuda_malloc_managed.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_malloc_managed cuda_malloc_managed.cu -L/usr/local/cuda/lib64 -lcudart
./cuda_malloc_managed 10 3
%%writefile cuda_malloc_managed_ans.cu
#include <assert.h>
#include <stdio.h>
#include "cuda_util.h"
__global__ void cuda_thread_fun(long * p, int n) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < n) {
p[i] = i * i;
}
}
int main(int argc, char ** argv) {
int n = (argc > 1 ? atoi(argv[1]) : 10);
int thread_block_sz = (argc > 2 ? atoi(argv[2]) : 3);
int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;
long * c;
check_api_error(cudaMallocManaged(&c, sizeof(long) * n));
check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(c, n)));
check_api_error(cudaDeviceSynchronize());
for (int i = 0; i < n; i++) {
printf("c[%d] = %ld\n", i, c[i]);
}
check_api_error(cudaFree(c));
return 0;
}
nvcc -o cuda_malloc_managed_ans cuda_malloc_managed_ans.cu
# nvc++ -Wall -o cuda_malloc_managed_ans cuda_malloc_managed_ans.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_malloc_managed_ans cuda_malloc_managed_ans.cu -L/usr/local/cuda/lib64 -lcudart
./cuda_malloc_managed_ans
10. CUDA device memory model¶
memory blocks allocated by
cudaMalloc
are visiable to (shared by) all threads and called global memorythey persist on device until you release them by cudaFree (or the process finishes), so they can be used not only to pass values between device and host, but also to pass values between different kernel calls (without moving values back and forth between host and device each time you call a kernel)
See Memory Hierarchy for reference
11. Race condition and atomic operation¶
As threads launched in a single kernel call run concurrently, they are subject to the same race condition as OpenMP threads
That is, if two threads access the same variable (or the same array element) and at least one of them is a write, there is a race and the program almost certainly has a bug
In the following program, each thread increments a variable by one; it nevertheles does not print the number of threads launched and prints unpredictable results each time executed.
%%writefile cuda_race.cu
#include <assert.h>
#include <stdio.h>
#include "cuda_util.h"
__global__ void cuda_thread_fun(unsigned long long * p, int n) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < n) {
*p = *p + 1;
}
}
int main(int argc, char ** argv) {
int n = (argc > 1 ? atoi(argv[1]) : 1000);
int thread_block_sz = (argc > 2 ? atoi(argv[2]) : 64);
int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;
unsigned long long c;
unsigned long long * c_dev;
check_api_error(cudaMalloc(&c_dev, sizeof(unsigned long long)));
check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(c_dev, n)));
check_api_error(cudaDeviceSynchronize());
check_api_error(cudaMemcpy(&c, c_dev, sizeof(unsigned long long), cudaMemcpyDeviceToHost));
check_api_error(cudaFree(c_dev));
printf("c = %lu\n", c);
return 0;
}
nvcc -o cuda_race cuda_race.cu
# nvc++ -Wall -o cuda_race cuda_race.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_race cuda_race.cu -L/usr/local/cuda/lib64 -lcudart
./cuda_race
Problem 5 : Observe race condition¶
Execute the above program many times and observe the results; try changing parameters.
./cuda_race 1000 64
Answer for trivial work omitted
- OpenMP had three basic tools --- critical, atomic and reduction --- to resolve race conditions depending on the situation.
- Roughly, CUDA only has an analogue to atomic and does not have critical or reduction.
11-1. Atomic add¶
- CUDA has
atomicAdd(T* p, T x);
function for various types of T.
It performs *p = *p + x
atomically, meaning that it is guaranteed that *p
is not updated between the point *p
is read and the point *p
is written.
- See atomicAdd for reference
Problem 6 : Use atomicAdd
¶
- Change the following program to resolve the race condition using
atomicAdd
and make sure the result always matches the number of threads launched.
%%writefile cuda_race_atomic_add.cu
#include <assert.h>
#include <stdio.h>
#include "cuda_util.h"
__global__ void cuda_thread_fun(unsigned long long * p, int n) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < n) {
*p = *p + 1;
}
}
int main(int argc, char ** argv) {
int n = (argc > 1 ? atoi(argv[1]) : 1000);
int thread_block_sz = (argc > 2 ? atoi(argv[2]) : 64);
int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;
unsigned long long c;
unsigned long long * c_dev;
check_api_error(cudaMalloc(&c_dev, sizeof(unsigned long long)));
check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(c_dev, n)));
check_api_error(cudaDeviceSynchronize());
check_api_error(cudaMemcpy(&c, c_dev, sizeof(unsigned long long), cudaMemcpyDeviceToHost));
check_api_error(cudaFree(c_dev));
printf("c = %lu\n", c);
return 0;
}
- To compile programs using
atomicAdd
, you need to give--generate-code arch=compute_80,code=sm_80
tonvcc
--generate-code
specifies which GPU architectures/instruction setnvcc
generates code for, so it might affect generated code in other ways including performance
nvcc --generate-code arch=compute_80,code=sm_80 -o cuda_race_atomic_add cuda_race_atomic_add.cu
# nvc++ -Wall -gpu=cc80 -o cuda_race_atomic_add cuda_race_atomic_add.cu
# clang++ -Wall -Wno-unknown-cuda-version --cuda-gpu-arch=sm_80 -o cuda_race_atomic_add cuda_race_atomic_add.cu -L/usr/local/cuda/lib64 -lcudart
./cuda_race_atomic_add 10000 64
./cuda_race_atomic_add 100000 64
%%writefile cuda_race_atomic_add_ans.cu
#include <assert.h>
#include <stdio.h>
#include "cuda_util.h"
__global__ void cuda_thread_fun(unsigned long long * p, int n) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < n) {
atomicAdd(p, 1L);
}
}
int main(int argc, char ** argv) {
int n = (argc > 1 ? atoi(argv[1]) : 1000);
int thread_block_sz = (argc > 2 ? atoi(argv[2]) : 64);
int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;
unsigned long long c;
unsigned long long * c_dev;
check_api_error(cudaMalloc(&c_dev, sizeof(unsigned long long)));
check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(c_dev, n)));
check_api_error(cudaDeviceSynchronize());
check_api_error(cudaMemcpy(&c, c_dev, sizeof(unsigned long long), cudaMemcpyDeviceToHost));
check_api_error(cudaFree(c_dev));
printf("c = %lu\n", c);
return 0;
}
nvcc --generate-code arch=compute_80,code=sm_80 -o cuda_race_atomic_add_ans cuda_race_atomic_add_ans.cu
# nvc++ -Wall -gpu=cc80 -o cuda_race_atomic_add_ans cuda_race_atomic_add_ans.cu
# clang++ -Wall -Wno-unknown-cuda-version --cuda-gpu-arch=sm_80 -o cuda_race_atomic_add_ans cuda_race_atomic_add_ans.cu -L/usr/local/cuda/lib64 -lcudart
./cuda_race_atomic_add_ans 10000 64
./cuda_race_atomic_add_ans 100000 64
12. Barrier synchronization of threads¶
- Recent CUDA has the notion of cooperative groups, with which you can build a barrier synchronization between threads
- setup
#include <cooperative_groups.h>
namespace cg = cooperative_groups; // save typing
- create data representing a grouup
cg::grid_group g = cg::this_grid(); // all threads
- perform barrier synchronization when necessary (ensure no threads execute
<after>
until all threads finish<before>
)
<before>
g.sync();
<after>
- You need to launch such kernels by
void * args[] = { a0, a1, ... };
cudaLaunchCooperativeKernel((void *)f, nb, bs, args);
instead of
f<<<nb,bs>>>(a0, a1, ...);
- See Cooperative Groups for reference
Problem 7 : Use barrier synchronization¶
Change the following program sum_array()
so that it correctly outputs the sum of the array by implementing reduction on barrier synchronization.
%%writefile cuda_sum.cu
#include <assert.h>
#include <stdio.h>
#include "cuda_util.h"
#include <cooperative_groups.h>
//using namespace cooperative_groups;
// Alternatively use an alias to avoid polluting the namespace with collective algorithms
namespace cg = cooperative_groups;
__global__ void sum_array(double * c, long n) {
// should return c[0] + c[1] + ... + c[n-1] in c[0]
// you can destroy other elements of the array
cg::grid_group g = cg::this_grid();
long i = g.thread_rank();
}
int main(int argc, char ** argv) {
long n = (argc > 1 ? atoi(argv[1]) : 10000);
int threads_per_block = (argc > 2 ? atoi(argv[2]) : 64);
int n_thread_blocks = (n + threads_per_block - 1) / threads_per_block;
double * c = (double *)malloc(sizeof(double) * n);
for (long i = 0; i < n; i++) {
c[i] = 1.0;
}
double * c_dev;
check_api_error(cudaMalloc(&c_dev, sizeof(double) * n));
check_api_error(cudaMemcpy(c_dev, c, sizeof(double) * n, cudaMemcpyHostToDevice));
void * args[2] = { (void *)&c_dev, (void *)&n };
check_api_error(cudaLaunchCooperativeKernel((void*)sum_array,
n_thread_blocks,
threads_per_block,
args));
check_api_error(cudaDeviceSynchronize());
check_api_error(cudaMemcpy(c, c_dev, sizeof(double) * n, cudaMemcpyDeviceToHost));
check_api_error(cudaFree(c_dev));
printf("sum = %f\n", c[0]);
assert(c[0] == n);
return 0;
}
nvcc -o cuda_sum cuda_sum.cu
# nvc++ -Wall -o cuda_sum cuda_sum.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_sum cuda_sum.cu -L/usr/local/cuda/lib64 -lcudart
./cuda_sum
%%writefile cuda_sum_ans.cu
#include <assert.h>
#include <stdio.h>
#include "cuda_util.h"
#include <cooperative_groups.h>
//using namespace cooperative_groups;
// Alternatively use an alias to avoid polluting the namespace with collective algorithms
namespace cg = cooperative_groups;
__global__ void sum_array(double * c, long n) {
// should return c[0] + c[1] + ... + c[n-1] in c[0]
// you can destroy other elements of the array
cg::grid_group g = cg::this_grid();
long i = g.thread_rank();
long h;
for (int m = n; m > 1; m = h) {
h = (m + 1) / 2;
if (i + h < m) {
c[i] += c[i + h];
}
g.sync();
}
}
int main(int argc, char ** argv) {
long n = (argc > 1 ? atoi(argv[1]) : 10000);
int threads_per_block = (argc > 2 ? atoi(argv[2]) : 64);
int n_thread_blocks = (n + threads_per_block - 1) / threads_per_block;
double * c = (double *)malloc(sizeof(double) * n);
for (long i = 0; i < n; i++) {
c[i] = 1.0;
}
double * c_dev;
check_api_error(cudaMalloc(&c_dev, sizeof(double) * n));
check_api_error(cudaMemcpy(c_dev, c, sizeof(double) * n, cudaMemcpyHostToDevice));
void * args[2] = { (void *)&c_dev, (void *)&n };
check_api_error(cudaLaunchCooperativeKernel((void*)sum_array,
n_thread_blocks,
threads_per_block,
args));
check_api_error(cudaDeviceSynchronize());
check_api_error(cudaMemcpy(c, c_dev, sizeof(double) * n, cudaMemcpyDeviceToHost));
check_api_error(cudaFree(c_dev));
printf("sum = %f\n", c[0]);
assert(c[0] == n);
return 0;
}
nvcc -o cuda_sum_ans cuda_sum_ans.cu
# nvc++ -Wall -o cuda_sum_ans cuda_sum_ans.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_sum_ans cuda_sum_ans.cu -L/usr/local/cuda/lib64 -lcudart
./cuda_sum_ans
13. Visualizing threads executing on the device¶
- When you call a kernel (function f) with
f<<<nb,bs>>>();
it creates (nb * bs) CUDA threads.
More precisely, it creates nb thread blocks, each of which has bs CUDA threads.
The following code is a program that records how threads are executed on GPU.
It creates many threads repeating a trivial (useless) computation x = a * x + b many times.
Each thread occasionally records the clock to record when and where these threads progress over time.
Specifically,
./cuda_sched_rec NB BS N M
creates approximately (NB $\times$ BS) threads, with BS threads in each thread block
Each thread repeats x = A x + B, ($N \times M$) times.
Each thread records the clock $N$ times (every $M$ iterations).
At the end of execution, it dumps the results in the following format for each line.
thread=<idx> x=<ans> sm0=<starting SM> sm1=<ending SM> t0 t1 t2 ... t_{n-1}
%%writefile cuda_sched_rec.cu
#include <assert.h>
#include <stdio.h>
// error check utility (check_api_error and check_launch_error)
#include "cuda_util.h"
// record of execution
typedef struct {
double x; // a (meaningless) answer
int sm[2]; // SM on which a thread got started/ended
} record_t;
/* this thread repeats x = a x + b (N * M) times.
it records the clock N times (every M iterations of x = a x + b)
to array T.
final result of x = a x + b, as well as SM each thread was executed
on are recorded to R. */
__global__ void cuda_thread_fun(double a, double b, record_t * R,
long * T, long n, long m,
int nthreads) {
// my thread index
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx >= nthreads) return;
// initial value (not important)
double x = idx;
// where clocks are recorded
T = &T[idx * n];
// record starting SM
R[idx].sm[0] = get_smid();
// main thing. repeat a x + b many times,
// occasionally recording the clock
for (long i = 0; i < n; i++) {
T[i] = clock64();
for (long j = 0; j < m; j++) {
x = a * x + b;
}
}
// record ending SM (must be = sm0)
R[idx].sm[1] = get_smid();
// record result, just so that the computation is not
// eliminated by the compiler
R[idx].x = x;
}
void dump(record_t * R, long * T, long nthreads, long M) {
long t0 = LONG_MAX;
long k = 0;
assert(nthreads * M > 0);
// find min clock
for (long idx = 0; idx < nthreads; idx++) {
for (long j = 0; j < M; j++) {
t0 = (T[k] < t0 ? T[k] : t0);
k++;
}
}
assert(t0 < LONG_MAX);
k = 0;
for (long idx = 0; idx < nthreads; idx++) {
printf("thread=%ld x=%f sm0=%u sm1=%u",
idx, R[idx].x, R[idx].sm[0], R[idx].sm[1]);
for (long j = 0; j < M; j++) {
printf(" %ld", T[k]);
k++;
}
printf("\n");
}
}
/* usage
./cuda_sched N_THREAD_BLOCKS THREADS_PER_BLOCK N M S A B
creates about N_THREAD_BLOCKS * THREADS_PER_BLOCK threads,
with THREADS_PER_BLOCK threads in each thread block.
each thread repeats x = A x + B (N * M) times.
shm_sz is the shared memory allocated for each thread block
(just to control the number of thread blocks simultaneously
scheduled on an SM). shared memory is not actually used at all.
*/
int main(int argc, char ** argv) {
int i = 1;
int n_thread_blocks = (argc > i ? atoi(argv[i]) : 3); i++;
int threads_per_block = (argc > i ? atoi(argv[i]) : 64); i++;
long M = (argc > i ? atoll(argv[i]) : 100); i++;
long N = (argc > i ? atoll(argv[i]) : 100); i++;
int shm_sz = (argc > i ? atoi(argv[i]) : 0); i++;
int D = (argc > i ? atoll(argv[i]) : 1); i++;
double a = (argc > i ? atof(argv[i]) : 0.99); i++;
double b = (argc > i ? atof(argv[i]) : 1.00); i++;
printf("%d blocks * %d threads/block\n", n_thread_blocks, threads_per_block);
int nthreads = n_thread_blocks * threads_per_block;
// allocate record_t array (both on host and device)
long R_sz = sizeof(record_t) * nthreads;
record_t * R = (record_t *)calloc(R_sz, 1);
record_t * R_dev;
check_api_error(cudaMalloc(&R_dev, R_sz));
check_api_error(cudaMemcpy(R_dev, R, R_sz, cudaMemcpyHostToDevice));
// allocate clock array (both on host and device)
long T_sz = sizeof(long) * M * nthreads;
long * T = (long *)calloc(T_sz, 1);
long * T_dev;
check_api_error(cudaMalloc(&T_dev, T_sz));
check_api_error(cudaMemcpy(T_dev, T, T_sz, cudaMemcpyHostToDevice));
int shm_elems = shm_sz / sizeof(double);
int shm_size = shm_elems * sizeof(double);
// call the kernel
check_launch_error((cuda_thread_fun<<<n_thread_blocks,threads_per_block,shm_size>>>
(a, b, R_dev, T_dev, M, N, nthreads)));
check_api_error(cudaDeviceSynchronize());
// get back the results and clocks
check_api_error(cudaMemcpy(R, R_dev, R_sz, cudaMemcpyDeviceToHost));
check_api_error(cudaMemcpy(T, T_dev, T_sz, cudaMemcpyDeviceToHost));
// dump the for visualization
dump(R, T, nthreads, M);
return 0;
}
- Read the code carefully and understand what it is doing.
nvcc --generate-code arch=compute_80,code=sm_80 -o cuda_sched_rec cuda_sched_rec.cu
./cuda_sched_rec 2 32 10 100 | head -10
The following python code parses and visualizes the output of cuda_sched_rec.
The code is shown below for your information; you don't have to understand how it works.
Let's visualize a few configurations.
13-1. one thread ($1 \times 1$)¶
./cuda_sched_rec 1 1 100 1000 > cs_1_1.dat
import cuda_sched_vis
cuda_sched_vis.cuda_sched_plt(["cs_1_1.dat"], start_t=0, end_t=float("inf"), start_thread=0, end_thread=float("inf"))
- you can change
start_t
andend_t
to zoom into a narrower time interval and changestart_thread
andend_thread
to zoom into a range of threads - or, you can open
sched.svg
generated along with the visualization and magnify anywhere you want to look into, either by the browser or any SVG viewer on your PC
13-2. 1 thread block $\times$ $T$ threads/block ($1 \times T$)¶
- play with changing $T$ to other values
T=8
./cuda_sched_rec 1 ${T} 100 1000 > cs_1_T.dat
import cuda_sched_vis
cuda_sched_vis.cuda_sched_plt(["cs_1_T.dat"], start_t=0, end_t=float("inf"), start_thread=0, end_thread=float("inf"))
Observe that they are always executed on the same SM. You are not utilizing multiple SMs at all.
Compare the time for $T=1$ and $T=8$; incease $T$ until the time starts to increase.
Increase $T$ until the execution time starts to increase almost linearly with $T$. Why doesn't it immediately happen with $T$>1?
There is a hardwired limit on the number of threads per block. Try to find it and then confirm it with Google.
With a modest value of $T$ (say 100), zoom in at either end of the execution and observe whether there is any difference on when they started or finished execution. If you look carefully, you will notice that a number of consecutive threads start and end exactly the same clock. Those threads are called a warp and they share an instruction pointer. It is very analogous to SIMD instruction found in CPUs that apply the same operation on multiple operands. Guess the number of threads of a warp from the experimental results and confirm it by Google.
13-4. many thread blocks $\times$ many threads/block ($B \times T$)¶
- play with changing $B$ and $T$ to other values
B=3
T=64
./cuda_sched_rec ${B} ${T} 100 1000 > cs_B_T.dat
import cuda_sched_vis
cuda_sched_vis.cuda_sched_plt(["cs_B_T.dat"], start_t=0, end_t=float("inf"), start_thread=0, end_thread=float("inf"), show_every=1)
show_every
parameter specifies that only visualize every this number of threads- it reduces the time to visualize when the number of threads is so large
- Try to find the maximum number of threads that does not increase the execution time.
- use
show_every=32
to reduce the time for visualization
- use
14. Thread blocks¶
A thread block is the unit of dispatching to a streaming multiprocessor (SM), which is like a physical core of a CPU. Threads within a thread block are always dispatched together to the same SM and once dispatched stay on the same SM until finished.
see CUDA C++ Programming Guide: A Scalable Programming Model
An SM is a highly multithreaded processor, which can accommodate many threads at the same time and interleave them by hardware. For example, it can easily hold, say, 500 threads and interleave their execution without involving software. In terms of hardware capability, it is somewhat similar to simultaneous multithreading (SMT) of CPUs. The degree of multithreading is very different, however; Intel CPUs normally support only two hardware threads (virtual cores) on each physical core. Moreover, software (either operating system or user-level software) needs to designate which virtual core you want to run a thread on. In a sense, CPU exposes each virtual core as a single-threaded machine. If you put more than one OpenMP (OS-level) thread on the same virtual core, software (i.e., OS) switches between them from time to time. A streaming multiprocessor of a GPU, in contrast, is a machine that literally takes many threads and concurrently executes them by hardware. Determining the SM a thread block executes on is done by hardware.
How many thread blocks are scheduled on an SM at the same time? It depends; it depends on how much "resources" a single thread block requires. Here, "resources" mean two things.
- registers
- shared memory (see below)
Registers are used for holding local variables and intermediate results of computation. How many registers a thread block uses is not something you can reliably determine by looking at your code; it depends on the code generated by the compiler. You can know it by passing
-Xptxas -v
to nvcc and looking at the compiler message.Shared memory is a scratch-pad memory only shared within a single thread block. Physically, you can consider it to be a small fast memory attached to each SM. The name "shared memory" is clearly a misnomer; ordinary memory you get by
cudaMalloc
is shared by all threads (called "global memory"). In contrast, shared memory is, contrary to its name, shared only among threads within a single thread block. "Local memory" (as opposed to global memory) would have been a better name for it, IMO.Both registers and shared memory for a thread block are kept on physical registers/memory of an SM throughout the lifetime of the thread block. Thus, accommodating a larger number of thread blocks at the same time requires a proportionally larger amount of registers/shared memory, which is subject to the physical resource limit of an SM.
Each SM has the following physical resources.
registers | shared memory | |
---|---|---|
Pascal | 32 bit x 65536 | 64KB |
Volta | 32 bit x 65536 | up to 96KB (*) |
Ampere | 32 bit x 65536 | up to 163KB |
(*) configurable subject to L1 cache + shared memory <= 128KB and shared memory <= 96KB
By default, a thread does not use shared memory at all.
Let's observe how many registers a thread uses.
nvcc --generate-code arch=compute_80,code=sm_80 -Xptxas -v -o cuda_sched_rec cuda_sched_rec.cu
- Since the computation is very simple, register usage will not be a limiting factor for this computation.
- Also, since it does not use shared memory at all, it won't be a limiting factor either.
- Only the hardwired limit is the limiting factor.
15. Shared memory¶
- Let's use shared memory to observe how it affects the number of thread blocks simultaneously executed. You specify the size of shared memory per thread block via the third parameter of kernel call, like this.
f<<<nb,bs,S>>>();
The above kernel launch statement specifies that $S$ bytes of shared memory should be allocated to each thread block. Each SM can therefore execute only up to (SHARED_MEMORY_SIZE_PER_SM / $S$) thread blocks simultaneously.
You can get a pointer to the part of the shared memory allocated to each thread via the following strange syntax within your kernel function, though it is not necessary in our current experiment.
extern __shared__ T shmem[];
With that,
shmem
points to the start of the shared memory for the thread block. The name can be arbitrary.cuda_sched_rec.cu
is already written to take the size of the shared memory per thread block as a parameter.The following code creates $B$ thread blocks each of which has only one thread
It allocates 8KB for each thread block
B=3
S=$((8 * 1024))
./cuda_sched_rec ${B} 1 100 1000 ${S} > cs_B_1_S.dat
import cuda_sched_vis
cuda_sched_vis.cuda_sched_plt(["cs_B_1_S.dat"], start_t=0, end_t=float("inf"), start_thread=0, end_thread=float("inf"))
- Increase $S$ to find the maximum shared memory size you can allocate for each thread block
- Fix $S$ to that maximum value and increase $B$; predict when the execution time starts to increase
- Hint : Ampere architecture has 108 streaming multiprocessors (SMs), each having 164KB shared memory; each SM can simultaneously execute only as many thread blocks as will fit within its available shared memory
16. Warp¶
- Consecutively numbered 32 threads within a thread block makes a warp and they can execute only one same instruction at a time.
- That is, it's not possible, within a single cycle, for some threads to execute an instruction A while others in the same warp execute another instruction B. All the GPU can do is simply to keep some threads from executing instructions that they should not execute.
- A typical example is an "if" statement. e.g.,
if (thread_idx % 2 == 0) {
A;
} else {
B;
}
If there are any thread executing A and any thread executing B within a warp, the time the warp takes is the time to execute A plus the time to execute B.
An important performance implication is you'd better not have threads branching differently within the same warp.
In the following code, each thread executes
if ((idx / D) is even) {
for (long i = 0; i < n; i++) {
T[i] = clock64();
for (long j = 0; j < m; j++) {
x = a * x + b;
}
}
} else {
for (long i = 0; i < n; i++) {
T[i] = clock64();
for (long j = 0; j < m / 2; j++) {
x = a * x + b;
}
}
}
- Given D, the first $D$ threads (thread idx=$0, 1, ..., D-1$) execute the "then" clause, the next $D$ threads (thread idx=$D, D+1, ..., 2D-1$, the "else" clause, and so on
%%writefile cuda_sched_rec_warp.cu
#include <assert.h>
#include <stdio.h>
// error check utility (check_api_error and check_launch_error)
#include "cuda_util.h"
// record of execution
typedef struct {
double x; // a (meaningless) answer
int sm[2]; // SM on which a thread got started/ended
} record_t;
/* this thread repeats x = a x + b (N * M) times.
it records the clock N times (every M iterations of x = a x + b)
to array T.
final result of x = a x + b, as well as SM each thread was executed
on are recorded to R. */
__global__ void cuda_thread_fun(double a, double b, record_t * R,
long * T, long n, long m,
int D,
int nthreads) {
// my thread index
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx >= nthreads) return;
// initial value (not important)
double x = idx;
// where clocks are recorded
T = &T[idx * n];
// record starting SM
R[idx].sm[0] = get_smid();
// main thing. repeat a x + b many times,
// occasionally recording the clock
if ((idx / D) % 2 == 0) {
for (long i = 0; i < n; i++) {
T[i] = clock64();
for (long j = 0; j < m; j++) {
x = a * x + b;
}
}
} else {
for (long i = 0; i < n; i++) {
T[i] = clock64();
for (long j = 0; j < m / 2; j++) {
x = a * x + b;
}
}
}
// record ending SM (must be = sm0)
R[idx].sm[1] = get_smid();
// record result, just so that the computation is not
// eliminated by the compiler
R[idx].x = x;
}
void dump(record_t * R, long * T, long nthreads, long M) {
long t0 = LONG_MAX;
long k = 0;
assert(nthreads * M > 0);
// find min clock
for (long idx = 0; idx < nthreads; idx++) {
for (long j = 0; j < M; j++) {
t0 = (T[k] < t0 ? T[k] : t0);
k++;
}
}
assert(t0 < LONG_MAX);
k = 0;
for (long idx = 0; idx < nthreads; idx++) {
printf("thread=%ld x=%f sm0=%u sm1=%u",
idx, R[idx].x, R[idx].sm[0], R[idx].sm[1]);
for (long j = 0; j < M; j++) {
printf(" %ld", T[k]);
k++;
}
printf("\n");
}
}
/* usage
./cuda_sched N_THREAD_BLOCKS THREADS_PER_BLOCK N M S A B
creates about N_THREAD_BLOCKS * THREADS_PER_BLOCK threads,
with THREADS_PER_BLOCK threads in each thread block.
each thread repeats x = A x + B (N * M) times.
shm_sz is the shared memory allocated for each thread block
(just to control the number of thread blocks simultaneously
scheduled on an SM). shared memory is not actually used at all.
*/
int main(int argc, char ** argv) {
int i = 1;
int n_thread_blocks = (argc > i ? atoi(argv[i]) : 3); i++;
int threads_per_block = (argc > i ? atoi(argv[i]) : 64); i++;
long M = (argc > i ? atoll(argv[i]) : 100); i++;
long N = (argc > i ? atoll(argv[i]) : 100); i++;
int shm_sz = (argc > i ? atoi(argv[i]) : 0); i++;
int D = (argc > i ? atoll(argv[i]) : 1); i++;
double a = (argc > i ? atof(argv[i]) : 0.99); i++;
double b = (argc > i ? atof(argv[i]) : 1.00); i++;
printf("%d blocks * %d threads/block\n", n_thread_blocks, threads_per_block);
int nthreads = n_thread_blocks * threads_per_block;
// allocate record_t array (both on host and device)
long R_sz = sizeof(record_t) * nthreads;
record_t * R = (record_t *)calloc(R_sz, 1);
record_t * R_dev;
check_api_error(cudaMalloc(&R_dev, R_sz));
check_api_error(cudaMemcpy(R_dev, R, R_sz, cudaMemcpyHostToDevice));
// allocate clock array (both on host and device)
long T_sz = sizeof(long) * M * nthreads;
long * T = (long *)calloc(T_sz, 1);
long * T_dev;
check_api_error(cudaMalloc(&T_dev, T_sz));
check_api_error(cudaMemcpy(T_dev, T, T_sz, cudaMemcpyHostToDevice));
int shm_elems = shm_sz / sizeof(double);
int shm_size = shm_elems * sizeof(double);
// call the kernel
check_launch_error((cuda_thread_fun<<<n_thread_blocks,threads_per_block,shm_size>>>
(a, b, R_dev, T_dev, M, N, D, nthreads)));
check_api_error(cudaDeviceSynchronize());
// get back the results and clocks
check_api_error(cudaMemcpy(R, R_dev, R_sz, cudaMemcpyDeviceToHost));
check_api_error(cudaMemcpy(T, T_dev, T_sz, cudaMemcpyDeviceToHost));
// dump the for visualization
dump(R, T, nthreads, M);
return 0;
}
nvcc --generate-code arch=compute_80,code=sm_80 -o cuda_sched_rec_warp cuda_sched_rec_warp.cu
- Execute the code with various D's (and perhaps other parameters) to visualize the effect of warps and its performance implication
- Predict what the visualization looks like
B=4
T=32
D=1
./cuda_sched_rec_warp ${B} ${T} 100 1000 0 ${D} > cs_warp.dat
import cuda_sched_vis
cuda_sched_vis.cuda_sched_plt(["cs_warp.dat"], start_t=0, end_t=float("inf"), start_thread=0, end_thread=float("inf"))
- Observe the following in the visualization
- When $D < 32$ or is not a multiple of 32, threads within a warp may take both branches (diverge)
- When $D$ is a multiple of 32 and $\geq 32$, all threads within a warp take the same branch
Problem 8 : Putting them together: calculating an integral¶
- Write a CUDA program that calculates
$$ \int \int_D \sqrt{1 - x^2 - y^2}\,dx\,dy $$
where
$$ D = \{\;(x, y)\;|\;0\leq x \leq 1, 0\leq y \leq 1, x^2 + y^2 \leq 1 \}$$
- Note: an alternative way to put it is to calculate
$$ \int_0^1 \int_0^1 f(x)\,dx\,dy $$
where
$$ f(x) = \left\{\begin{array}{ll}\sqrt{1 - x^2 - y^2} & (x^2 + y^2 \leq 1) \\ 0 & (\mbox{otherwise}) \end{array}\right. $$
Write a CUDA kernel that computes the integrand on a single point
And launch it with as many threads as the number of points you compute the integrand at
The result should be close to $\pi/6$ (1/8 of the volume of the unit ball)
Play with the number of infinitesimal intervals for integration and the number of threads so that you can observe a speedup
Measure the time not just for the entire computation, but the time of each step including cudaMalloc, cudaMemcpy to initialize variables on the device, kernel and cudaMemcpy to get the result back
Try atomicAdd as well as reduction
Play with unified memory also
Take the number of thread blocks and the number of threads per block in the command line
Compare the execution speed of OpenMP (CPU) and CUDA (GPU) in various settings
- a single CPU thread vs single CUDA thread
- a single CPU thread vs multiple CUDA threads in a single thread block
- multiple CPU threads vs multiple CUDA threads in multiple thread blocks
%%writefile cuda_integral.cu
// Write Your Code Here
nvcc --generate-code arch=compute_80,code=sm_80 -o cuda_integral cuda_integral.cu
# nvc++ -Wall -o cuda_integral cuda_integral.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_integral cuda_integral.cu -L/usr/local/cuda/lib64 -lcudart
./cuda_integral
%%writefile cuda_integral_ans.cu
#include <stdio.h>
#include <unistd.h>
#include <math.h>
#include <time.h>
#include "cuda_util.h"
double cur_time() {
struct timespec tp[1];
clock_gettime(CLOCK_REALTIME, tp);
return tp->tv_sec + tp->tv_nsec * 1.0e-9;
}
__global__ void cuda_thread_fun(int n, double xa, double ya, double dx, double dy, double * sp) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
int j = blockDim.y * blockIdx.y + threadIdx.y;
if (i < n && j < n) {
double x = xa + i * dx;
double y = ya + j * dy;
double z2 = 1 - x * x - y * y;
if (z2 > 0) {
atomicAdd(sp, sqrt(z2) * dx * dy);
}
}
}
int main(int argc, char ** argv) {
double xa = 0.0;
double xb = 1.0;
double ya = 0.0;
double yb = 1.0;
int n = 10000;
double dx = (xb - xa) / n;
double dy = (yb - ya) / n;
// thread configuration
int nx = n;
int ny = n;
int thread_block_sz_x = (argc > 1 ? atoi(argv[1]) : 8);
int thread_block_sz_y = thread_block_sz_x;
int n_thread_blocks_x = (nx + thread_block_sz_x - 1) / thread_block_sz_x;
int n_thread_blocks_y = (ny + thread_block_sz_y - 1) / thread_block_sz_y;
double s = 0.0;
double * s_dev;
double t0 = cur_time();
check_api_error(cudaMalloc(&s_dev, sizeof(double)));
double t1 = cur_time();
check_api_error(cudaMemcpy(s_dev, &s, sizeof(double), cudaMemcpyHostToDevice));
double t2 = cur_time();
dim3 nb(n_thread_blocks_x, n_thread_blocks_y);
dim3 tpb(thread_block_sz_x, thread_block_sz_y);
check_launch_error((cuda_thread_fun<<<nb,tpb>>>(n, xa, ya, dx, dy, s_dev)));
check_api_error(cudaDeviceSynchronize());
double t3 = cur_time();
check_api_error(cudaMemcpy(&s, s_dev, sizeof(double), cudaMemcpyDeviceToHost));
double t4 = cur_time();
printf("s = %.9f (err = %e)\n", s, fabs(s - M_PI/6));
printf(" cudaMalloc : %f sec\n", t1 - t0);
printf(" host -> dev : %f sec\n", t2 - t1);
printf(" kernel : %f sec\n", t3 - t2);
printf(" host <- dev : %f sec\n", t4 - t3);
printf("---------------------------\n");
printf("total : %f sec\n", t4 - t0);
return 0;
}
nvcc --generate-code arch=compute_80,code=sm_80 -o cuda_integral_ans cuda_integral_ans.cu
# nvc++ -Wall -o cuda_integral_ans cuda_integral_ans.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_integral_ans cuda_integral_ans.cu -L/usr/local/cuda/lib64 -lcudart
./cuda_integral_ans