# Put the NVIDIA HPC SDK compilers and the CUDA toolkit binaries first on PATH.
export PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/24.9/compilers/bin:$PATH
export PATH=/usr/local/cuda/bin:$PATH

# Show which nvcc / nvc / nvc++ are picked up and print the nvcc and nvc versions.
which nvcc
which nvc
which nvc++
nvcc --version
nvc --version

# Prepend the LLVM installation under /home/share/llvm to the executable
# and shared-library search paths.
export PATH=/home/share/llvm/bin:$PATH
export LD_LIBRARY_PATH=/home/share/llvm/lib:/home/share/llvm/lib/x86_64-unknown-linux-gnu:$LD_LIBRARY_PATH

# Show which clang / clang++ are picked up and print the clang version.
which clang
which clang++
clang --version

# Print the host name, and a warning when it does not contain "tauleg".
hostname
hostname | grep tauleg || echo "Oh, you are not on the right host, access https://tauleg.zapto.org/ instead"

# List the GPUs visible on this host.
nvidia-smi

%%writefile cuda_hello.cu
#include <assert.h>
#include <stdio.h>

/* Kernel for a 1-D launch: every thread whose global index is below n
   prints one greeting containing its index and the number of threads
   launched. */
__global__ void cuda_thread_fun(int n) {
  const int tid   = blockDim.x * blockIdx.x + threadIdx.x;  // global 1-D index
  const int total = gridDim.x * blockDim.x;                 // threads launched
  // surplus threads of the last block print nothing
  if (tid >= n) return;
  printf("hello I am CUDA thread %d out of %d\n", tid, total);
}

/* usage: ./cuda_hello [N [THREADS_PER_BLOCK]]

   Launches enough blocks of THREADS_PER_BLOCK threads (default 64) to
   cover N threads (default 100) and waits for the kernel to finish.
   Returns 0 on success and 1 when the block size is not positive or
   when CUDA reports an error for the launch or the execution. */
int main(int argc, char ** argv) {
  int n               = (argc > 1 ? atoi(argv[1]) : 100);
  int thread_block_sz = (argc > 2 ? atoi(argv[2]) : 64);
  if (thread_block_sz <= 0) {
    // the ceil-division below would otherwise divide by zero
    fprintf(stderr, "threads/block must be positive (got %d)\n",
            thread_block_sz);
    return 1;
  }
  int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;
  printf("%d threads/block * %d blocks\n", thread_block_sz, n_thread_blocks);

  // launch a kernel
  cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(n);
  // a launch does not return a status; a rejected configuration is
  // only visible through cudaGetLastError
  cudaError_t e = cudaGetLastError();
  if (e == cudaSuccess) {
    // wait for the threads to complete; its return value carries
    // errors raised while the kernel was running
    e = cudaDeviceSynchronize();
  }
  if (e != cudaSuccess) {
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(e));
    return 1;
  }
  return 0;
}

# Build cuda_hello with nvcc.
nvcc -o cuda_hello cuda_hello.cu

./cuda_hello

# Same source under a .cc name; -x cu makes the compiler treat it as CUDA.
ln -sf cuda_hello.cu cuda_hello.cc
nvcc -o cuda_hello -x cu cuda_hello.cc

./cuda_hello

# Build the .cu file with nvc++.
nvc++ -Wall -o cuda_hello cuda_hello.cu

# Build the .cc link with nvc++, again forcing CUDA with -x cu.
nvc++ -Wall -o cuda_hello -x cu cuda_hello.cc

# Build with clang++; the CUDA runtime library is linked explicitly.
clang++ -Wall -o cuda_hello cuda_hello.cu -L/usr/local/cuda/lib64 -lcudart

# clang++ on the .cc link, forcing CUDA with -x cu.
ln -sf cuda_hello.cu cuda_hello.cc
clang++ -Wall -o cuda_hello -x cu cuda_hello.cc -L/usr/local/cuda/lib64 -lcudart

%%writefile cuda_hello_chk.cu
#include <assert.h>
#include <stdio.h>

/*
  you'd better spend time on making sure you always check errors ...
*/

/* Terminate the program with a diagnostic when a CUDA API call failed.
   e          : status returned by the call
   msg        : text printed before the CUDA error string (the macro
                below passes the source text of the call)
   file, line : location printed at the start of the message
   Returns normally only when e is cudaSuccess; otherwise prints to
   stderr and calls exit(1). */
void check_api_error_(cudaError_t e,
                      const char * msg, const char * file, int line) {
  if (e == cudaSuccess) return;
  fprintf(stderr, "%s:%d:error: %s %s\n",
          file, line, msg, cudaGetErrorString(e));
  exit(1);
}

// wrap a CUDA API call, e.g. check_api_error(cudaDeviceSynchronize());
#define check_api_error(e) check_api_error_(e, #e, __FILE__, __LINE__)

/* Fetch the most recent CUDA error with cudaGetLastError and terminate
   the program with a diagnostic when there is one.
   msg        : text printed before the CUDA error string (the macro
                below passes the source text of the launch)
   file, line : location printed at the start of the message */
void check_launch_error_(const char * msg, const char * file, int line) {
  const cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess) return;
  fprintf(stderr, "%s:%d:error: %s %s\n",
          file, line, msg, cudaGetErrorString(status));
  exit(1);
}

// evaluate exp (a kernel launch wrapped in parentheses), then check for an error
#define check_launch_error(exp) do { exp; check_launch_error_(#exp, __FILE__, __LINE__); } while (0)


/* Kernel for a 1-D launch: threads with global index < n each print a
   greeting with their index and the total number of launched threads. */
__global__ void cuda_thread_fun(int n) {
  const int me  = blockDim.x * blockIdx.x + threadIdx.x;
  const int all = gridDim.x * blockDim.x;
  if (me >= n) return;  // beyond the requested count: stay silent
  printf("hello I am CUDA thread %d out of %d\n", me, all);
}

/* usage: ./cuda_hello_chk [N [THREADS_PER_BLOCK]]
   (defaults: N = 100, THREADS_PER_BLOCK = 64)

   Launches ceil(N / THREADS_PER_BLOCK) blocks and waits for them; both
   the launch and the wait go through the error-check macros. */
int main(int argc, char ** argv) {
  int n = 100;
  int thread_block_sz = 64;
  if (argc > 1) { n = atoi(argv[1]); }
  if (argc > 2) { thread_block_sz = atoi(argv[2]); }
  // round up so that the last, partially used block is included
  const int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;
  printf("%d threads/block * %d blocks\n", thread_block_sz, n_thread_blocks);

  // launch and report a rejected launch right away
  check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(n)));
  // wait for the kernel and report an error returned by the wait
  check_api_error(cudaDeviceSynchronize());
  return 0;
}

nvcc -o cuda_hello_chk cuda_hello_chk.cu
# nvc++ -Wall -o cuda_hello_chk cuda_hello_chk.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_hello_chk cuda_hello_chk.cu -L/usr/local/cuda/lib64 -lcudart

./cuda_hello_chk

%%writefile cuda_hello_hdr_chk.cu
#include <assert.h>
#include <stdio.h>

#include "cuda_util.h"

/* Kernel for a 1-D launch: each thread computes its global index and,
   when that index is below n, prints it together with the number of
   threads in the whole grid. */
__global__ void cuda_thread_fun(int n) {
  const int gid      = blockDim.x * blockIdx.x + threadIdx.x;
  const int launched = gridDim.x * blockDim.x;
  if (gid >= n) return;
  printf("hello I am CUDA thread %d out of %d\n", gid, launched);
}

/* usage: ./cuda_hello_hdr_chk [N [THREADS_PER_BLOCK]]
   (defaults: N = 100, THREADS_PER_BLOCK = 64)

   Same program as the hello example, with the launch and the wait
   checked through the macros pulled in from cuda_util.h. */
int main(int argc, char ** argv) {
  int n = 100;
  int thread_block_sz = 64;
  if (argc > 1) { n = atoi(argv[1]); }
  if (argc > 2) { thread_block_sz = atoi(argv[2]); }
  // number of blocks needed to cover n threads (rounded up)
  const int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;
  printf("%d threads/block * %d blocks\n", thread_block_sz, n_thread_blocks);

  check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(n)));
  check_api_error(cudaDeviceSynchronize());
  return 0;
}

nvcc -o cuda_hello_hdr_chk cuda_hello_hdr_chk.cu
# nvc++ -Wall -o cuda_hello_hdr_chk cuda_hello_hdr_chk.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_hello_hdr_chk cuda_hello_hdr_chk.cu -L/usr/local/cuda/lib64 -lcudart

./cuda_hello_hdr_chk

./cuda_hello_hdr_chk 10 3

%%writefile cuda_hello_2d.cu
#include <assert.h>
#include <stdio.h>

#include "cuda_util.h"

/* Kernel for a 2-D launch (2-D grid of 2-D blocks).
   Each thread computes its (x, y) position in the whole grid, turns it
   into a row-major linear index g, and prints its position when g < n,
   so that exactly n threads print when the grid holds at least n
   threads. */
__global__ void cuda_thread_fun(int n) {
  int x          = blockDim.x * blockIdx.x + threadIdx.x;
  int y          = blockDim.y * blockIdx.y + threadIdx.y;
  int nthreads_x = gridDim.x * blockDim.x;
  int nthreads_y = gridDim.y * blockDim.y;
  // one row of the grid holds nthreads_x threads, so that is the stride
  // between rows; using nthreads_y here gives several (x, y) the same g
  int g          = x + nthreads_x * y;
  if (g < n) {
    printf("hello I am CUDA thread (%d,%d) of (%d,%d)\n",
           x, y, nthreads_x, nthreads_y);
  }
}

/* Return the smallest non-negative i with i * i >= n, i.e. the ceiling
   of the square root of n; returns 0 when n <= 0. */
int isqrt(int n) {
  int i = 0;
  // square in 64 bits: i * i in int overflows for n close to INT_MAX
  while ((long long)i * i < n) {
    i++;
  }
  return i;
}

/* usage: ./cuda_hello_2d [N [BLOCK_SZ_X [BLOCK_SZ_Y]]]
   (defaults: N = 40, BLOCK_SZ_X = 2, BLOCK_SZ_Y = 3)

   Arranges at least N threads on an nx * ny rectangle with
   nx = isqrt(N), covers it with BLOCK_SZ_X * BLOCK_SZ_Y thread blocks
   and launches the kernel on that 2-D grid. */
int main(int argc, char ** argv) {
  // command line
  const int n                 = (argc > 1) ? atoi(argv[1]) : 40;
  const int thread_block_sz_x = (argc > 2) ? atoi(argv[2]) : 2;
  const int thread_block_sz_y = (argc > 3) ? atoi(argv[3]) : 3;
  // rectangle of threads: nx columns, enough rows to reach n
  const int nx = isqrt(n);
  const int ny = (n + nx - 1) / nx;
  // blocks needed along each axis (rounded up)
  const int n_thread_blocks_x = (nx + thread_block_sz_x - 1) / thread_block_sz_x;
  const int n_thread_blocks_y = (ny + thread_block_sz_y - 1) / thread_block_sz_y;
  printf("(%d * %d) threads/block * (%d * %d) blocks\n",
         thread_block_sz_x, thread_block_sz_y,
         n_thread_blocks_x, n_thread_blocks_y);

  dim3 tpb(thread_block_sz_x, thread_block_sz_y);   // threads per block
  dim3 nb(n_thread_blocks_x, n_thread_blocks_y);    // blocks in the grid
  check_launch_error((cuda_thread_fun<<<nb,tpb>>>(n)));
  check_api_error(cudaDeviceSynchronize());
  return 0;
}

nvcc -o cuda_hello_2d cuda_hello_2d.cu
# nvc++ -Wall -o cuda_hello_2d cuda_hello_2d.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_hello_2d cuda_hello_2d.cu -L/usr/local/cuda/lib64 -lcudart

./cuda_hello_2d

./cuda_hello_2d 40 2 3

%%writefile cuda_dev_segfault.cu
#include <assert.h>
#include <stdio.h>
#include "cuda_util.h"

/* Kernel for a 1-D launch: the thread with global index i < n stores
   i * i into p[i].  The kernel dereferences p, so p has to be an
   address the device can write. */
__global__ void cuda_thread_fun(long * p, int n) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < n) {
    // widen before multiplying: i * i in int overflows once i > 46340
    p[i] = (long)i * i;
  }
}

/* usage: ./cuda_dev_segfault [N [THREADS_PER_BLOCK]]
   (defaults: N = 10, THREADS_PER_BLOCK = 3)

   NOTE(review): c is obtained from malloc, i.e. ordinary host memory,
   and is passed to the kernel unchanged.  Judging by the file name this
   misuse is the point of the example; the check_* macros are what
   report a failure of the launch or of the synchronization. */
int main(int argc, char ** argv) {
  int n = 10;
  int thread_block_sz = 3;
  if (argc > 1) { n = atoi(argv[1]); }
  if (argc > 2) { thread_block_sz = atoi(argv[2]); }
  const int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;
  // host allocation handed to the device as is
  long * c = (long *)malloc(sizeof(long) * n);
  check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(c, n)));
  check_api_error(cudaDeviceSynchronize());
  // print what ended up in the host array
  int i = 0;
  while (i < n) {
    printf("c[%d] = %ld\n", i, c[i]);
    i++;
  }
  free(c);
  return 0;
}

nvcc -o cuda_dev_segfault cuda_dev_segfault.cu
# nvc++ -Wall -o cuda_dev_segfault cuda_dev_segfault.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_dev_segfault cuda_dev_segfault.cu -L/usr/local/cuda/lib64 -lcudart

./cuda_dev_segfault

%%writefile cuda_host_segfault.cu
#include <assert.h>
#include <stdio.h>
#include "cuda_util.h"

/* Kernel for a 1-D launch: the thread with global index i < n stores
   i * i into p[i].  The kernel dereferences p, so p has to be an
   address the device can write. */
__global__ void cuda_thread_fun(long * p, int n) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < n) {
    // widen before multiplying: i * i in int overflows once i > 46340
    p[i] = (long)i * i;
  }
}

/* usage: ./cuda_host_segfault [N [THREADS_PER_BLOCK]]
   (defaults: N = 10, THREADS_PER_BLOCK = 3)

   NOTE(review): the print loop reads c_dev[i] directly on the host even
   though c_dev was obtained from cudaMalloc.  Judging by the file name
   this misuse is the point of the example. */
int main(int argc, char ** argv) {
  int n = 10;
  int thread_block_sz = 3;
  if (argc > 1) { n = atoi(argv[1]); }
  if (argc > 2) { thread_block_sz = atoi(argv[2]); }
  const int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;
  // device array filled by the kernel
  long * c_dev;
  check_api_error(cudaMalloc(&c_dev, sizeof(long) * n));
  check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(c_dev, n)));
  check_api_error(cudaDeviceSynchronize());
  // dereferences the cudaMalloc'ed pointer from host code
  int i = 0;
  while (i < n) {
    printf("c[%d] = %ld\n", i, c_dev[i]);
    i++;
  }
  check_api_error(cudaFree(c_dev));
  return 0;
}

nvcc -o cuda_host_segfault cuda_host_segfault.cu
# nvc++ -Wall -o cuda_host_segfault cuda_host_segfault.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_host_segfault cuda_host_segfault.cu -L/usr/local/cuda/lib64 -lcudart

./cuda_host_segfault

%%writefile cuda_dev_to_host.cu
#include <assert.h>
#include <stdio.h>
#include "cuda_util.h"

/* Kernel for a 1-D launch: the thread with global index i < n stores
   i * i into p[i].  The kernel dereferences p, so p has to be an
   address the device can write. */
__global__ void cuda_thread_fun(long * p, int n) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < n) {
    // widen before multiplying: i * i in int overflows once i > 46340
    p[i] = (long)i * i;
  }
}

/* usage: ./cuda_dev_to_host [N [THREADS_PER_BLOCK]]
   (defaults: N = 10, THREADS_PER_BLOCK = 3)

   The kernel fills the device array c_dev, but the values printed come
   from the host array c, and nothing in this function copies c_dev
   into c, so the printed values are the uninitialized malloc contents.
   NOTE(review): presumably adding that copy is the exercise;
   cuda_dev_to_host_ans.cu contains a version with the cudaMemcpy. */
int main(int argc, char ** argv) {
  int n = 10;
  int thread_block_sz = 3;
  if (argc > 1) { n = atoi(argv[1]); }
  if (argc > 2) { thread_block_sz = atoi(argv[2]); }
  const int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;
  // host array (printed) and device array (written by the kernel)
  long * c = (long *)malloc(sizeof(long) * n);
  long * c_dev;
  check_api_error(cudaMalloc(&c_dev, sizeof(long) * n));
  check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(c_dev, n)));
  check_api_error(cudaDeviceSynchronize());
  int i = 0;
  while (i < n) {
    printf("c[%d] = %ld\n", i, c[i]);
    i++;
  }
  free(c);
  check_api_error(cudaFree(c_dev));
  return 0;
}

nvcc -o cuda_dev_to_host cuda_dev_to_host.cu
# nvc++ -Wall -o cuda_dev_to_host cuda_dev_to_host.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_dev_to_host cuda_dev_to_host.cu -L/usr/local/cuda/lib64 -lcudart

./cuda_dev_to_host

%%writefile cuda_dev_to_host_ans.cu
#include <assert.h>
#include <stdio.h>
#include "cuda_util.h"

/* Kernel for a 1-D launch: the thread with global index i < n stores
   i * i into p[i].  The kernel dereferences p, so p has to be an
   address the device can write. */
__global__ void cuda_thread_fun(long * p, int n) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < n) {
    // widen before multiplying: i * i in int overflows once i > 46340
    p[i] = (long)i * i;
  }
}

/* usage: ./cuda_dev_to_host_ans [N [THREADS_PER_BLOCK]]
   (defaults: N = 10, THREADS_PER_BLOCK = 3)

   Fills a device array with the kernel, copies it into a host array
   with cudaMemcpy and prints the host copy. */
int main(int argc, char ** argv) {
  int n = 10;
  int thread_block_sz = 3;
  if (argc > 1) { n = atoi(argv[1]); }
  if (argc > 2) { thread_block_sz = atoi(argv[2]); }
  const int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;
  // host buffer that receives the result, device buffer the kernel writes
  long * c = (long *)malloc(sizeof(long) * n);
  long * c_dev;
  check_api_error(cudaMalloc(&c_dev, sizeof(long) * n));
  check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(c_dev, n)));
  check_api_error(cudaDeviceSynchronize());
  // bring the result back to the host before reading it
  check_api_error(cudaMemcpy(c, c_dev, sizeof(long) * n, cudaMemcpyDeviceToHost));
  int i = 0;
  while (i < n) {
    printf("c[%d] = %ld\n", i, c[i]);
    i++;
  }
  free(c);
  check_api_error(cudaFree(c_dev));
  return 0;
}

nvcc -o cuda_dev_to_host_ans cuda_dev_to_host_ans.cu
# nvc++ -Wall -o cuda_dev_to_host_ans cuda_dev_to_host_ans.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_dev_to_host_ans cuda_dev_to_host_ans.cu -L/usr/local/cuda/lib64 -lcudart

./cuda_dev_to_host_ans

%%writefile cuda_malloc_managed.cu
#include <assert.h>
#include <stdio.h>
#include "cuda_util.h"

/* Kernel for a 1-D launch: the thread with global index i < n stores
   i * i into p[i].  The kernel dereferences p, so p has to be an
   address the device can write. */
__global__ void cuda_thread_fun(long * p, int n) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < n) {
    // widen before multiplying: i * i in int overflows once i > 46340
    p[i] = (long)i * i;
  }
}

/* usage: ./cuda_malloc_managed [N [THREADS_PER_BLOCK]]
   (defaults: N = 10, THREADS_PER_BLOCK = 3)

   NOTE(review): c still comes from malloc and is passed to the kernel
   unchanged.  Presumably this is the starting point of an exercise;
   cuda_malloc_managed_ans.cu allocates c with cudaMallocManaged
   instead. */
int main(int argc, char ** argv) {
  int n = 10;
  int thread_block_sz = 3;
  if (argc > 1) { n = atoi(argv[1]); }
  if (argc > 2) { thread_block_sz = atoi(argv[2]); }
  const int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;
  // host allocation handed to the device as is
  long * c = (long *)malloc(sizeof(long) * n);
  check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(c, n)));
  check_api_error(cudaDeviceSynchronize());
  int i = 0;
  while (i < n) {
    printf("c[%d] = %ld\n", i, c[i]);
    i++;
  }
  free(c);
  return 0;
}

nvcc -o cuda_malloc_managed cuda_malloc_managed.cu
# nvc++ -Wall -o cuda_malloc_managed cuda_malloc_managed.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_malloc_managed cuda_malloc_managed.cu -L/usr/local/cuda/lib64 -lcudart

./cuda_malloc_managed 10 3

%%writefile cuda_malloc_managed_ans.cu
#include <assert.h>
#include <stdio.h>
#include "cuda_util.h"

/* Kernel for a 1-D launch: the thread with global index i < n stores
   i * i into p[i].  The kernel dereferences p, so p has to be an
   address the device can write. */
__global__ void cuda_thread_fun(long * p, int n) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < n) {
    // widen before multiplying: i * i in int overflows once i > 46340
    p[i] = (long)i * i;
  }
}

/* usage: ./cuda_malloc_managed_ans [N [THREADS_PER_BLOCK]]
   (defaults: N = 10, THREADS_PER_BLOCK = 3)

   Allocates the array with cudaMallocManaged, lets the kernel fill it,
   and reads the same pointer on the host after synchronizing. */
int main(int argc, char ** argv) {
  int n = 10;
  int thread_block_sz = 3;
  if (argc > 1) { n = atoi(argv[1]); }
  if (argc > 2) { thread_block_sz = atoi(argv[2]); }
  const int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;
  // one pointer used by both the kernel and the host loop below
  long * c;
  check_api_error(cudaMallocManaged(&c, sizeof(long) * n));
  check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(c, n)));
  // the kernel must have finished before the host reads c
  check_api_error(cudaDeviceSynchronize());
  int i = 0;
  while (i < n) {
    printf("c[%d] = %ld\n", i, c[i]);
    i++;
  }
  check_api_error(cudaFree(c));
  return 0;
}

nvcc -o cuda_malloc_managed_ans cuda_malloc_managed_ans.cu
# nvc++ -Wall -o cuda_malloc_managed_ans cuda_malloc_managed_ans.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_malloc_managed_ans cuda_malloc_managed_ans.cu -L/usr/local/cuda/lib64 -lcudart

./cuda_malloc_managed_ans

%%writefile cuda_race.cu
#include <assert.h>
#include <stdio.h>

#include "cuda_util.h"

/* Kernel for a 1-D launch: every thread with global index < n adds 1
   to *p with a plain load followed by a plain store.  Nothing makes
   the pair atomic, so concurrent threads can overwrite each other's
   update. */
__global__ void cuda_thread_fun(unsigned long long * p, int n) {
  const int tid = blockDim.x * blockIdx.x + threadIdx.x;
  if (tid >= n) return;
  const unsigned long long seen = *p;  // read the shared counter
  *p = seen + 1;                       // write it back, incremented
}

/* usage: ./cuda_race [N [THREADS_PER_BLOCK]]
   (defaults: N = 1000, THREADS_PER_BLOCK = 64)

   Lets N threads increment one counter in device memory and prints the
   value the counter has afterwards. */
int main(int argc, char ** argv) {
  int n               = (argc > 1 ? atoi(argv[1]) : 1000);
  int thread_block_sz = (argc > 2 ? atoi(argv[2]) : 64);
  int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;

  unsigned long long c = 0;
  unsigned long long * c_dev;
  check_api_error(cudaMalloc(&c_dev, sizeof(unsigned long long)));
  // cudaMalloc does not clear the memory it returns: start counting from 0
  check_api_error(cudaMemset(c_dev, 0, sizeof(unsigned long long)));
  check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(c_dev, n)));
  check_api_error(cudaDeviceSynchronize());
  check_api_error(cudaMemcpy(&c, c_dev, sizeof(unsigned long long), cudaMemcpyDeviceToHost));
  check_api_error(cudaFree(c_dev));
  // %llu is the conversion that matches unsigned long long
  printf("c = %llu\n", c);
  return 0;
}

nvcc -o cuda_race cuda_race.cu
# nvc++ -Wall -o cuda_race cuda_race.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_race cuda_race.cu -L/usr/local/cuda/lib64 -lcudart

./cuda_race

./cuda_race 1000 64

%%writefile cuda_race_atomic_add.cu
#include <assert.h>
#include <stdio.h>

#include "cuda_util.h"

/* Kernel for a 1-D launch: every thread with global index < n adds 1
   to *p with a plain load followed by a plain store (no atomic
   operation is used in this version). */
__global__ void cuda_thread_fun(unsigned long long * p, int n) {
  const int tid = blockDim.x * blockIdx.x + threadIdx.x;
  if (tid >= n) return;
  const unsigned long long seen = *p;  // read the shared counter
  *p = seen + 1;                       // write it back, incremented
}

/* usage: ./cuda_race_atomic_add [N [THREADS_PER_BLOCK]]
   (defaults: N = 1000, THREADS_PER_BLOCK = 64)

   Lets N threads increment one counter in device memory and prints the
   value the counter has afterwards. */
int main(int argc, char ** argv) {
  int n               = (argc > 1 ? atoi(argv[1]) : 1000);
  int thread_block_sz = (argc > 2 ? atoi(argv[2]) : 64);
  int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;

  unsigned long long c = 0;
  unsigned long long * c_dev;
  check_api_error(cudaMalloc(&c_dev, sizeof(unsigned long long)));
  // cudaMalloc does not clear the memory it returns: start counting from 0
  check_api_error(cudaMemset(c_dev, 0, sizeof(unsigned long long)));
  check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(c_dev, n)));
  check_api_error(cudaDeviceSynchronize());
  check_api_error(cudaMemcpy(&c, c_dev, sizeof(unsigned long long), cudaMemcpyDeviceToHost));
  check_api_error(cudaFree(c_dev));
  // %llu is the conversion that matches unsigned long long
  printf("c = %llu\n", c);
  return 0;
}

nvcc --generate-code arch=compute_80,code=sm_80 -o cuda_race_atomic_add cuda_race_atomic_add.cu
# nvc++ -Wall -gpu=cc80 -o cuda_race_atomic_add cuda_race_atomic_add.cu
# clang++ -Wall -Wno-unknown-cuda-version --cuda-gpu-arch=sm_80 -o cuda_race_atomic_add cuda_race_atomic_add.cu -L/usr/local/cuda/lib64 -lcudart

./cuda_race_atomic_add 10000 64
./cuda_race_atomic_add 100000 64

%%writefile cuda_race_atomic_add_ans.cu
#include <assert.h>
#include <stdio.h>

#include "cuda_util.h"

/* Kernel for a 1-D launch: every thread with global index < n adds 1
   to *p with atomicAdd, so that no increment is lost. */
__global__ void cuda_thread_fun(unsigned long long * p, int n) {
  const int tid = blockDim.x * blockIdx.x + threadIdx.x;
  if (tid >= n) return;
  // the increment is spelled with the counter's own type
  atomicAdd(p, 1ULL);
}

/* usage: ./cuda_race_atomic_add_ans [N [THREADS_PER_BLOCK]]
   (defaults: N = 1000, THREADS_PER_BLOCK = 64)

   Lets N threads atomically increment one counter in device memory and
   prints the value the counter has afterwards. */
int main(int argc, char ** argv) {
  int n               = (argc > 1 ? atoi(argv[1]) : 1000);
  int thread_block_sz = (argc > 2 ? atoi(argv[2]) : 64);
  int n_thread_blocks = (n + thread_block_sz - 1) / thread_block_sz;

  unsigned long long c = 0;
  unsigned long long * c_dev;
  check_api_error(cudaMalloc(&c_dev, sizeof(unsigned long long)));
  // cudaMalloc does not clear the memory it returns: start counting from 0
  check_api_error(cudaMemset(c_dev, 0, sizeof(unsigned long long)));
  check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz>>>(c_dev, n)));
  check_api_error(cudaDeviceSynchronize());
  check_api_error(cudaMemcpy(&c, c_dev, sizeof(unsigned long long), cudaMemcpyDeviceToHost));
  check_api_error(cudaFree(c_dev));
  // %llu is the conversion that matches unsigned long long
  printf("c = %llu\n", c);
  return 0;
}

nvcc --generate-code arch=compute_80,code=sm_80 -o cuda_race_atomic_add_ans cuda_race_atomic_add_ans.cu
# nvc++ -Wall -gpu=cc80 -o cuda_race_atomic_add_ans cuda_race_atomic_add_ans.cu
# clang++ -Wall -Wno-unknown-cuda-version --cuda-gpu-arch=sm_80 -o cuda_race_atomic_add_ans cuda_race_atomic_add_ans.cu -L/usr/local/cuda/lib64 -lcudart

./cuda_race_atomic_add_ans 10000 64
./cuda_race_atomic_add_ans 100000 64

%%writefile cuda_sum.cu
#include <assert.h>
#include <stdio.h>

#include "cuda_util.h"

#include <cooperative_groups.h>

//using namespace cooperative_groups;
// Alternatively use an alias to avoid polluting the namespace with collective algorithms
namespace cg = cooperative_groups;

/* Unfinished kernel: as written it only obtains the grid-wide group and
   this thread's rank in it; it neither reads nor writes c.
   NOTE(review): presumably an exercise stub to be completed so that it
   meets the goal stated in the comments below. */
__global__ void sum_array(double * c, long n) {
  // should return c[0] + c[1] + ... + c[n-1] in c[0]
  // you can destroy other elements of the array
  cg::grid_group g = cg::this_grid();
  // rank of this thread within the whole grid (not used yet)
  long i = g.thread_rank();
}

/* usage: ./cuda_sum [N [THREADS_PER_BLOCK]]
   (defaults: N = 10000, THREADS_PER_BLOCK = 64)

   Fills an array of N doubles with 1.0, copies it to the device, runs
   sum_array as a cooperative launch with one thread per element, copies
   the array back, prints c[0] and asserts that it equals N. */
int main(int argc, char ** argv) {
  long n                = (argc > 1 ? atoi(argv[1]) : 10000);
  int threads_per_block = (argc > 2 ? atoi(argv[2]) : 64);
  int n_thread_blocks = (n + threads_per_block - 1) / threads_per_block;

  double * c = (double *)malloc(sizeof(double) * n);
  for (long i = 0; i < n; i++) {
    c[i] = 1.0;
  }
  double * c_dev;
  check_api_error(cudaMalloc(&c_dev, sizeof(double) * n));
  check_api_error(cudaMemcpy(c_dev, c, sizeof(double) * n, cudaMemcpyHostToDevice));
  // cudaLaunchCooperativeKernel takes the kernel arguments as an array
  // of pointers to them
  void * args[2] = { (void *)&c_dev, (void *)&n };
  check_api_error(cudaLaunchCooperativeKernel((void*)sum_array,
                                              n_thread_blocks,
                                              threads_per_block,
                                              args));
  check_api_error(cudaDeviceSynchronize());
  check_api_error(cudaMemcpy(c, c_dev, sizeof(double) * n, cudaMemcpyDeviceToHost));
  check_api_error(cudaFree(c_dev));
  // keep the result, then release the host buffer (it used to be leaked)
  double sum = c[0];
  free(c);
  printf("sum = %f\n", sum);
  assert(sum == n);
  return 0;
}

nvcc -o cuda_sum cuda_sum.cu
# nvc++ -Wall -o cuda_sum cuda_sum.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_sum cuda_sum.cu -L/usr/local/cuda/lib64 -lcudart

./cuda_sum

%%writefile cuda_sum_ans.cu
#include <assert.h>
#include <stdio.h>

#include "cuda_util.h"

#include <cooperative_groups.h>

//using namespace cooperative_groups;
// Alternatively use an alias to avoid polluting the namespace with collective algorithms
namespace cg = cooperative_groups;

/* Leave c[0] + c[1] + ... + c[n-1] in c[0]; other elements of c are
   overwritten with partial sums.

   Uses the grid-wide barrier g.sync(), so the kernel has to be started
   as a cooperative launch, and every thread of the grid has to reach
   the barrier in every round (hence no early return).  The thread with
   grid rank i works on c[i]; at least n / 2 threads are needed. */
__global__ void sum_array(double * c, long n) {
  cg::grid_group g = cg::this_grid();
  long i = g.thread_rank();
  long h;
  // m = number of elements still holding partial sums; it is a long
  // like n so that a large n is not truncated
  for (long m = n; m > 1; m = h) {
    // fold the upper part c[h..m-1] onto c[0..m-h-1]
    h = (m + 1) / 2;
    if (i + h < m) {
      c[i] += c[i + h];
    }
    // all additions of this round must be done before the next one
    g.sync();
  }
}

/* usage: ./cuda_sum_ans [N [THREADS_PER_BLOCK]]
   (defaults: N = 10000, THREADS_PER_BLOCK = 64)

   Fills an array of N doubles with 1.0, copies it to the device, runs
   sum_array as a cooperative launch with one thread per element, copies
   the array back, prints c[0] and asserts that it equals N. */
int main(int argc, char ** argv) {
  long n                = (argc > 1 ? atoi(argv[1]) : 10000);
  int threads_per_block = (argc > 2 ? atoi(argv[2]) : 64);
  int n_thread_blocks = (n + threads_per_block - 1) / threads_per_block;

  double * c = (double *)malloc(sizeof(double) * n);
  for (long i = 0; i < n; i++) {
    c[i] = 1.0;
  }
  double * c_dev;
  check_api_error(cudaMalloc(&c_dev, sizeof(double) * n));
  check_api_error(cudaMemcpy(c_dev, c, sizeof(double) * n, cudaMemcpyHostToDevice));
  // cudaLaunchCooperativeKernel takes the kernel arguments as an array
  // of pointers to them
  void * args[2] = { (void *)&c_dev, (void *)&n };
  check_api_error(cudaLaunchCooperativeKernel((void*)sum_array,
                                              n_thread_blocks,
                                              threads_per_block,
                                              args));
  check_api_error(cudaDeviceSynchronize());
  check_api_error(cudaMemcpy(c, c_dev, sizeof(double) * n, cudaMemcpyDeviceToHost));
  check_api_error(cudaFree(c_dev));
  // keep the result, then release the host buffer (it used to be leaked)
  double sum = c[0];
  free(c);
  printf("sum = %f\n", sum);
  assert(sum == n);
  return 0;
}

nvcc -o cuda_sum_ans cuda_sum_ans.cu
# nvc++ -Wall -o cuda_sum_ans cuda_sum_ans.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_sum_ans cuda_sum_ans.cu -L/usr/local/cuda/lib64 -lcudart

./cuda_sum_ans

%%writefile cuda_sched_rec.cu
#include <assert.h>
#include <stdio.h>

// error check utility (check_api_error and check_launch_error)
#include "cuda_util.h"

// record of execution; the kernel fills one entry per thread (R[idx])
typedef struct {
  double x;                     // final value of x = a x + b (a meaningless answer)
  int sm[2];                     // sm[0] / sm[1]: get_smid() at thread start / end
} record_t;

/* Each thread applies x = a x + b a total of (n * m) times.
   Before every run of m updates it stores clock64() into its own
   n-element slice of T (slice idx starts at T[idx * n]).
   R[idx] receives the final x and the get_smid() values read before
   and after the loop.  Threads with index >= nthreads do nothing. */
__global__ void cuda_thread_fun(double a, double b, record_t * R,
                                long * T, long n, long m,
                                int nthreads) {
  // global 1-D index of this thread
  const int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx < nthreads) {
    record_t * rec    = &R[idx];
    long *     stamps = T + idx * n;   // this thread's clock samples
    // starting value (not important)
    double x = idx;
    // SM at the start
    rec->sm[0] = get_smid();
    // n samples, m updates after each sample
    for (long s = 0; s < n; s++) {
      stamps[s] = clock64();
      for (long k = 0; k < m; k++) {
        x = a * x + b;
      }
    }
    // SM at the end (must be = sm[0])
    rec->sm[1] = get_smid();
    // store the result so that the compiler cannot drop the computation
    rec->x = x;
  }
}

/* Print one line per thread to stdout:
     thread=IDX x=X sm0=SM sm1=SM CLOCK CLOCK ...
   R        : nthreads records
   T        : nthreads * M clock samples, M consecutive ones per thread
   The minimum sample is computed first, but it is only used by the
   sanity assertion; the samples are printed unmodified. */
void dump(record_t * R, long * T, long nthreads, long M) {
  assert(nthreads * M > 0);
  // pass 1: smallest clock value over all samples
  long t0 = LONG_MAX;
  const long * p = T;
  for (long idx = 0; idx < nthreads; idx++) {
    for (long j = 0; j < M; j++, p++) {
      if (*p < t0) {
        t0 = *p;
      }
    }
  }
  assert(t0 < LONG_MAX);
  // pass 2: one output line per thread
  p = T;
  for (long idx = 0; idx < nthreads; idx++) {
    const record_t * rec = &R[idx];
    printf("thread=%ld x=%f sm0=%u sm1=%u",
           idx, rec->x, rec->sm[0], rec->sm[1]);
    for (long j = 0; j < M; j++, p++) {
      printf(" %ld", *p);
    }
    printf("\n");
  }
}


/* usage
   ./cuda_sched_rec N_THREAD_BLOCKS THREADS_PER_BLOCK M N SHM_SZ D A B
   (defaults: 3 64 100 100 0 1 0.99 1.00)

   launches N_THREAD_BLOCKS * THREADS_PER_BLOCK threads,
   with THREADS_PER_BLOCK threads in each thread block.
   each thread repeats x = A x + B (M * N) times and records the clock
   M times (once before every N repetitions).

   SHM_SZ is the shared memory requested for each thread block
   (just to control the number of thread blocks simultaneously
   scheduled on an SM). shared memory is not actually used at all.

   D is parsed only so that A and B sit at the same argument positions
   as in cuda_sched_rec_warp; this program does not use it.
 */
int main(int argc, char ** argv) {
  int i = 1;
  int n_thread_blocks   = (argc > i ? atoi(argv[i]) : 3);   i++;
  int threads_per_block = (argc > i ? atoi(argv[i]) : 64);  i++;
  long M             = (argc > i ? atoll(argv[i]) : 100);  i++;
  long N             = (argc > i ? atoll(argv[i]) : 100);  i++;
  int shm_sz          = (argc > i ? atoi(argv[i])  : 0);    i++;
  int D               = (argc > i ? atoll(argv[i]) : 1);    i++;
  double a            = (argc > i ? atof(argv[i])  : 0.99); i++;
  double b            = (argc > i ? atof(argv[i])  : 1.00); i++;
  (void)D;                      // position holder only, see above

  printf("%d blocks * %d threads/block\n", n_thread_blocks, threads_per_block);
  int nthreads = n_thread_blocks * threads_per_block;

  // allocate record_t array (both on host and device)
  long R_sz = sizeof(record_t) * nthreads;
  record_t * R = (record_t *)calloc(R_sz, 1);
  record_t * R_dev;
  check_api_error(cudaMalloc(&R_dev, R_sz));
  check_api_error(cudaMemcpy(R_dev, R, R_sz, cudaMemcpyHostToDevice));

  // allocate clock array (both on host and device): M samples per thread
  long T_sz = sizeof(long) * M * nthreads;
  long * T = (long *)calloc(T_sz, 1);
  long * T_dev;
  check_api_error(cudaMalloc(&T_dev, T_sz));
  check_api_error(cudaMemcpy(T_dev, T, T_sz, cudaMemcpyHostToDevice));

  // round the requested shared memory down to a whole number of doubles
  int shm_elems = shm_sz / sizeof(double);
  int shm_size = shm_elems * sizeof(double);

  // call the kernel (its parameters n, m receive M, N)
  check_launch_error((cuda_thread_fun<<<n_thread_blocks,threads_per_block,shm_size>>>
                      (a, b, R_dev, T_dev, M, N, nthreads)));
  check_api_error(cudaDeviceSynchronize());

  // get back the results and clocks
  check_api_error(cudaMemcpy(R, R_dev, R_sz, cudaMemcpyDeviceToHost));
  check_api_error(cudaMemcpy(T, T_dev, T_sz, cudaMemcpyDeviceToHost));
  // dump them for visualization
  dump(R, T, nthreads, M);

  // release the device and host buffers (they used to be leaked)
  check_api_error(cudaFree(R_dev));
  check_api_error(cudaFree(T_dev));
  free(R);
  free(T);
  return 0;
}

nvcc --generate-code arch=compute_80,code=sm_80 -o cuda_sched_rec cuda_sched_rec.cu

./cuda_sched_rec 2 32 10 100 | head -10

./cuda_sched_rec 1 1 100 1000 > cs_1_1.dat

import cuda_sched_vis
cuda_sched_vis.cuda_sched_plt(["cs_1_1.dat"], start_t=0, end_t=float("inf"), start_thread=0, end_thread=float("inf"))

T=8
./cuda_sched_rec 1 ${T} 100 1000 > cs_1_T.dat

import cuda_sched_vis
cuda_sched_vis.cuda_sched_plt(["cs_1_T.dat"], start_t=0, end_t=float("inf"), start_thread=0, end_thread=float("inf"))

B=3
T=64
./cuda_sched_rec ${B} ${T} 100 1000 > cs_B_T.dat

import cuda_sched_vis
cuda_sched_vis.cuda_sched_plt(["cs_B_T.dat"], start_t=0, end_t=float("inf"), start_thread=0, end_thread=float("inf"), show_every=1)

nvcc --generate-code arch=compute_80,code=sm_80 -Xptxas -v -o cuda_sched_rec cuda_sched_rec.cu

B=3
S=$((8 * 1024))
./cuda_sched_rec ${B} 1 100 1000 ${S} > cs_B_1_S.dat

import cuda_sched_vis
cuda_sched_vis.cuda_sched_plt(["cs_B_1_S.dat"], start_t=0, end_t=float("inf"), start_thread=0, end_thread=float("inf"))

%%writefile cuda_sched_rec_warp.cu
#include <assert.h>
#include <stdio.h>

// error check utility (check_api_error and check_launch_error)
#include "cuda_util.h"

// record of execution; the kernel fills one entry per thread (R[idx])
typedef struct {
  double x;                     // final value of x = a x + b (a meaningless answer)
  int sm[2];                     // sm[0] / sm[1]: get_smid() at thread start / end
} record_t;

/* each thread records the clock n times to its own n-element slice of
   T and repeats x = a x + b after every recording:
   m times per recording when (idx / D) is even, m / 2 times when it is
   odd, so consecutive groups of D threads alternate between the two
   branches below.
   final result of x = a x + b, as well as the SM each thread was
   executed on, are recorded to R[idx].
   threads with idx >= nthreads do nothing. */
__global__ void cuda_thread_fun(double a, double b, record_t * R,
                                long * T, long n, long m,
                                int D,
                                int nthreads) {
  // my thread index
  int idx      = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= nthreads) return;
  // initial value (not important)
  double x = idx;
  // where clocks are recorded (slice idx of T)
  T = &T[idx * n];
  // record starting SM
  R[idx].sm[0] = get_smid();
  // main thing. repeat a x + b many times,
  // occasionally recording the clock
  if ((idx / D) % 2 == 0) {
    // even group: m updates after each clock sample
    for (long i = 0; i < n; i++) {
      T[i] = clock64();
      for (long j = 0; j < m; j++) {
        x = a * x + b;
      }
    }
  } else {
    // odd group: same loop with half as many updates per sample
    for (long i = 0; i < n; i++) {
      T[i] = clock64();
      for (long j = 0; j < m / 2; j++) {
        x = a * x + b;
      }
    }
  }    
  // record ending SM (must be = sm0)
  R[idx].sm[1] = get_smid();
  // record result, just so that the computation is not
  // eliminated by the compiler
  R[idx].x = x;
}

/* Print one line per thread to stdout:
     thread=IDX x=X sm0=SM sm1=SM CLOCK CLOCK ...
   R        : nthreads records
   T        : nthreads * M clock samples, M consecutive ones per thread
   The minimum sample is computed first, but it is only used by the
   sanity assertion; the samples are printed unmodified. */
void dump(record_t * R, long * T, long nthreads, long M) {
  assert(nthreads * M > 0);
  // pass 1: smallest clock value over all samples
  long t0 = LONG_MAX;
  const long * p = T;
  for (long idx = 0; idx < nthreads; idx++) {
    for (long j = 0; j < M; j++, p++) {
      if (*p < t0) {
        t0 = *p;
      }
    }
  }
  assert(t0 < LONG_MAX);
  // pass 2: one output line per thread
  p = T;
  for (long idx = 0; idx < nthreads; idx++) {
    const record_t * rec = &R[idx];
    printf("thread=%ld x=%f sm0=%u sm1=%u",
           idx, rec->x, rec->sm[0], rec->sm[1]);
    for (long j = 0; j < M; j++, p++) {
      printf(" %ld", *p);
    }
    printf("\n");
  }
}


/* usage
   ./cuda_sched_rec_warp N_THREAD_BLOCKS THREADS_PER_BLOCK M N SHM_SZ D A B
   (defaults: 3 64 100 100 0 1 0.99 1.00)

   launches N_THREAD_BLOCKS * THREADS_PER_BLOCK threads,
   with THREADS_PER_BLOCK threads in each thread block.
   each thread records the clock M times and repeats x = A x + B after
   every recording; D is passed to the kernel, which uses (idx / D) % 2
   to decide whether a thread does N or N / 2 repetitions per recording.

   SHM_SZ is the shared memory requested for each thread block
   (just to control the number of thread blocks simultaneously
   scheduled on an SM). shared memory is not actually used at all.
 */
int main(int argc, char ** argv) {
  int i = 1;
  int n_thread_blocks   = (argc > i ? atoi(argv[i]) : 3);   i++;
  int threads_per_block = (argc > i ? atoi(argv[i]) : 64);  i++;
  long M             = (argc > i ? atoll(argv[i]) : 100);  i++;
  long N             = (argc > i ? atoll(argv[i]) : 100);  i++;
  int shm_sz          = (argc > i ? atoi(argv[i])  : 0);    i++;
  int D               = (argc > i ? atoll(argv[i]) : 1);    i++;
  double a            = (argc > i ? atof(argv[i])  : 0.99); i++;
  double b            = (argc > i ? atof(argv[i])  : 1.00); i++;

  printf("%d blocks * %d threads/block\n", n_thread_blocks, threads_per_block);
  int nthreads = n_thread_blocks * threads_per_block;

  // allocate record_t array (both on host and device)
  long R_sz = sizeof(record_t) * nthreads;
  record_t * R = (record_t *)calloc(R_sz, 1);
  record_t * R_dev;
  check_api_error(cudaMalloc(&R_dev, R_sz));
  check_api_error(cudaMemcpy(R_dev, R, R_sz, cudaMemcpyHostToDevice));

  // allocate clock array (both on host and device): M samples per thread
  long T_sz = sizeof(long) * M * nthreads;
  long * T = (long *)calloc(T_sz, 1);
  long * T_dev;
  check_api_error(cudaMalloc(&T_dev, T_sz));
  check_api_error(cudaMemcpy(T_dev, T, T_sz, cudaMemcpyHostToDevice));

  // round the requested shared memory down to a whole number of doubles
  int shm_elems = shm_sz / sizeof(double);
  int shm_size = shm_elems * sizeof(double);

  // call the kernel (its parameters n, m receive M, N)
  check_launch_error((cuda_thread_fun<<<n_thread_blocks,threads_per_block,shm_size>>>
                      (a, b, R_dev, T_dev, M, N, D, nthreads)));
  check_api_error(cudaDeviceSynchronize());

  // get back the results and clocks
  check_api_error(cudaMemcpy(R, R_dev, R_sz, cudaMemcpyDeviceToHost));
  check_api_error(cudaMemcpy(T, T_dev, T_sz, cudaMemcpyDeviceToHost));
  // dump them for visualization
  dump(R, T, nthreads, M);

  // release the device and host buffers (they used to be leaked)
  check_api_error(cudaFree(R_dev));
  check_api_error(cudaFree(T_dev));
  free(R);
  free(T);
  return 0;
}

nvcc --generate-code arch=compute_80,code=sm_80 -o cuda_sched_rec_warp cuda_sched_rec_warp.cu

B=4
T=32
D=1
./cuda_sched_rec_warp ${B} ${T} 100 1000 0 ${D} > cs_warp.dat

import cuda_sched_vis
cuda_sched_vis.cuda_sched_plt(["cs_warp.dat"], start_t=0, end_t=float("inf"), start_thread=0, end_thread=float("inf"))

%%writefile cuda_integral.cu

// Write Your Code Here

nvcc --generate-code arch=compute_80,code=sm_80 -o cuda_integral cuda_integral.cu
# nvc++ -Wall -o cuda_integral cuda_integral.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_integral cuda_integral.cu -L/usr/local/cuda/lib64 -lcudart

./cuda_integral

%%writefile cuda_integral_ans.cu
#include <stdio.h>
#include <unistd.h>
#include <math.h>
#include <time.h>

#include "cuda_util.h"

/* Return the current CLOCK_REALTIME reading in seconds, with the
   nanosecond part as the fraction. */
double cur_time() {
  struct timespec now;
  clock_gettime(CLOCK_REALTIME, &now);
  const double whole = now.tv_sec;
  const double frac  = now.tv_nsec * 1.0e-9;
  return whole + frac;
}

/* Kernel for a 2-D launch: thread (i, j) with i, j < n evaluates
   z2 = 1 - x^2 - y^2 at x = xa + i * dx, y = ya + j * dy and, when
   z2 > 0, atomically adds sqrt(z2) * dx * dy to *sp. */
__global__ void cuda_thread_fun(int n, double xa, double ya, double dx, double dy, double * sp) {
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  const int j = blockDim.y * blockIdx.y + threadIdx.y;
  // threads outside the n * n grid of sample points do nothing
  if (i >= n || j >= n) return;
  const double x  = xa + i * dx;
  const double y  = ya + j * dy;
  const double z2 = 1 - x * x - y * y;
  // only points with a positive z2 contribute
  if (z2 > 0) {
    atomicAdd(sp, sqrt(z2) * dx * dy);
  }
}

/* usage: ./cuda_integral_ans [THREAD_BLOCK_SZ]   (default 8)

   Sums sqrt(1 - x^2 - y^2) * dx * dy over an n * n grid (n = 10000) of
   points in [0,1] x [0,1] on the device, using square thread blocks of
   THREAD_BLOCK_SZ * THREAD_BLOCK_SZ threads, prints the sum with its
   distance from pi / 6, and prints the time spent in each phase. */
int main(int argc, char ** argv) {
  double xa = 0.0;
  double xb = 1.0;
  double ya = 0.0;
  double yb = 1.0;
  int n = 10000;
  double dx = (xb - xa) / n;
  double dy = (yb - ya) / n;

  // thread configuration
  int nx                = n;
  int ny                = n;
  int thread_block_sz_x = (argc > 1 ? atoi(argv[1]) : 8);
  int thread_block_sz_y = thread_block_sz_x;
  int n_thread_blocks_x = (nx + thread_block_sz_x - 1) / thread_block_sz_x;
  int n_thread_blocks_y = (ny + thread_block_sz_y - 1) / thread_block_sz_y;

  // the accumulator starts at 0 on the host and is copied to the device
  double s = 0.0;
  double * s_dev;
  double t0 = cur_time();
  check_api_error(cudaMalloc(&s_dev, sizeof(double)));
  double t1 = cur_time();
  check_api_error(cudaMemcpy(s_dev, &s, sizeof(double), cudaMemcpyHostToDevice));
  double t2 = cur_time();
  
  dim3 nb(n_thread_blocks_x, n_thread_blocks_y);
  dim3 tpb(thread_block_sz_x, thread_block_sz_y);
  check_launch_error((cuda_thread_fun<<<nb,tpb>>>(n, xa, ya, dx, dy, s_dev)));
  // the launch returns before the kernel is done; wait so that t3 - t2
  // covers the whole kernel
  check_api_error(cudaDeviceSynchronize());
  double t3 = cur_time();
  
  check_api_error(cudaMemcpy(&s, s_dev, sizeof(double), cudaMemcpyDeviceToHost));
  double t4 = cur_time();
  // release the device accumulator (it used to be leaked); done after
  // t4 so that the reported timings are unaffected
  check_api_error(cudaFree(s_dev));
  
  printf("s = %.9f (err = %e)\n", s, fabs(s - M_PI/6));
  printf(" cudaMalloc  : %f sec\n", t1 - t0);
  printf(" host -> dev : %f sec\n", t2 - t1);
  printf(" kernel      : %f sec\n", t3 - t2);
  printf(" host <- dev : %f sec\n", t4 - t3);
  printf("---------------------------\n");
  printf("total        : %f sec\n", t4 - t0);
  return 0;
}

nvcc --generate-code arch=compute_80,code=sm_80 -o cuda_integral_ans cuda_integral_ans.cu
# nvc++ -Wall -o cuda_integral_ans cuda_integral_ans.cu
# clang++ -Wall -Wno-unknown-cuda-version -o cuda_integral_ans cuda_integral_ans.cu -L/usr/local/cuda/lib64 -lcudart

./cuda_integral_ans

GPU generation	registers	shared memory
Pascal	32 bit x 65536	64KB
Volta	32 bit x 65536	up to 96KB (*)
Ampere	32 bit x 65536	up to 163KB

CUDA Programming¶

1. CUDA¶

2. Compilers¶

2-1. Set up NVIDIA CUDA and HPC SDK¶

2-2. LLVM¶

3. Check host and GPU¶

4. Compiling and running CUDA programs¶

4-1. With nvcc (NVIDIA HPC SDK CUDA compiler)¶

4-2. With nvc++ (NVIDIA HPC SDK C++ compiler)¶

4-3. With clang++ (LLVM)¶

5. CUDA kernel¶

5-1. You'd better always check errors¶

6. The number of CUDA threads launched¶

Problem 1 : Change the number of threads per block¶

7. Thread ID¶

7-1. One-dimensional ID¶

7-2. Two- or three-dimensional ID¶

Problem 2 : Specify 2D thread blocks and grids¶

8. Passing data between host (CPU) and device (GPU)¶

8-1. cudaMalloc¶

8-2. cudaMemcpy¶

8-3. cudaFree¶

8-4. You cannot access malloc-allocated region on the device¶

8-5. You cannot access cudaMalloc-allocated region on the host¶

Problem 3 : Getting the result back from the device¶

9. Unified Memory¶

Problem 4 : Use Unified Memory¶

10. CUDA device memory model¶

11. Race condition and atomic operation¶

Problem 5 : Observe race condition¶

11-1. Atomic add¶

Problem 6 : Use atomicAdd¶

12. Barrier synchronization of threads¶

Problem 7 : Use barrier synchronization¶

13. Visualizing threads executing on the device¶

13-1. one thread ($1 \times 1$)¶

13-2. 1 thread block $\times$ $T$ threads/block ($1 \times T$)¶

13-3. many thread blocks $\times$ many threads/block ($B \times T$)¶

14. Thread blocks¶

15. Shared memory¶

16. Warp¶

Problem 8 : Putting them together: calculating an integral¶

Problem 6 : Use `atomicAdd`¶