OpenMP Programming Tutorial and Hands-on¶
Enter your name and student ID.
- Name:
- Student ID:
2. Compilers¶
- We use NVIDIA HPC SDK ver. 24.9 (nvc and nvc++) and LLVM ver. 18.1.8 (clang and clang++) as C/C++ compilers, as they support OpenMP GPU offloading
2-1. Set up NVIDIA CUDA and HPC SDK¶
Execute this before you use NVIDIA HPC SDK
export PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/24.9/compilers/bin:$PATH
export PATH=/usr/local/cuda/bin:$PATH
- Check if it works (check that the full paths of nvc/nvc++ are shown)
  - make sure which nvc and which nvc++ show /opt/nvidia/... and which nvcc shows /usr/local/...
which nvc
which nvc++
which nvcc
2-2. Set up LLVM¶
Execute this before you use LLVM
export PATH=/home/share/llvm/bin:$PATH
export LD_LIBRARY_PATH=/home/share/llvm/lib:/home/share/llvm/lib/x86_64-unknown-linux-gnu:$LD_LIBRARY_PATH
Check if it works (check if full paths of clang/clang++ are shown)
which clang
which clang++
3. Compiling and running OpenMP programs¶
- Summary
  - clang / clang++ : give the -fopenmp option
  - nvc / nvc++ : give the -mp option
  - Set the OMP_NUM_THREADS environment variable when running the executable
%%writefile omp_hello.c
#include <stdio.h>
int main() {
printf("hello\n");
#pragma omp parallel
printf("world\n");
printf("good bye\n");
return 0;
}
- Compiling with clang
  - Add the -fopenmp option to compile OpenMP programs
  - Other generally useful options
    - -Wall warns about much suspicious code
    - -O3 maximally optimizes code for performance
clang -fopenmp omp_hello.c -o omp_hello_clang
- Compiling with nvc
  - Add the -mp option to compile OpenMP programs
  - Other generally useful options
    - -Wall warns about much suspicious code
    - -O4 maximally optimizes code for performance
nvc -mp omp_hello.c -o omp_hello_nvc
Running
Set the environment variable OMP_NUM_THREADS to the number of threads created by #pragma omp parallel
OMP_NUM_THREADS=3 ./omp_hello_clang
OMP_NUM_THREADS=3 ./omp_hello_nvc
Problem 1 : Change the number of threads¶
- Execute them with various numbers of threads and see what happens
Answer omitted, as the exercise is trivial
OMP_NUM_THREADS=3 ./omp_hello_clang
OMP_NUM_THREADS=3 ./omp_hello_nvc
4. #pragma omp parallel¶
- #pragma omp parallel creates a team of threads, each of which executes the statement below
- Note that only the statement that is right below the pragma is executed by the team of threads
- Of course, the statement can be a compound statement and/or include a function call, so each thread can actually execute an arbitrary number of statements
- See Determining the Number of Threads for a parallel Region for more details on the number of threads created by #pragma omp parallel
Problem 2 : Executing multiple statements by threads¶
- Change the following program so that both "world" and "good bye" are printed as many times as the number of threads
%%writefile omp_hello.c
#include <stdio.h>
int main() {
printf("hello\n");
#pragma omp parallel
printf("world\n");
printf("good bye\n");
return 0;
}
Example answer:
%%writefile omp_hello_ans.c
#include <stdio.h>
int main() {
printf("hello\n");
#pragma omp parallel
{
printf("world\n");
printf("good bye\n");
}
return 0;
}
clang -fopenmp omp_hello_ans.c -o omp_hello_ans
# nvc -mp omp_hello_ans.c -o omp_hello_ans
OMP_NUM_THREADS=3 ./omp_hello_ans
- Below, choose clang or nvc depending on your taste by commenting out the other one
- Below, I chose clang by commenting out nvc
clang -fopenmp omp_hello.c -o omp_hello
# nvc -mp omp_hello.c -o omp_hello
OMP_NUM_THREADS=3 ./omp_hello
5. omp_get_num_threads() and omp_get_thread_num()¶
When threads are executing a statement with #pragma omp parallel,
- they are said to be in a parallel region
- they are called a team of threads
While a thread is executing a parallel region,
- omp_get_num_threads() returns the number of threads in the team
- omp_get_thread_num() returns the unique id of the calling thread within the team (0, 1, ..., the number of threads in the team - 1)
You need #include <omp.h> to use these functions, or any OpenMP API functions for that matter
Problem 3 : Using omp_get_num_threads() and omp_get_thread_num()¶
- Change the following program so that each thread prints its id within the team and the number of threads in the team, like this. The exact order of lines may differ. Strictly speaking, even characters from two lines can be interleaved within a single line.
hello
0/5 world
4/5 world
1/5 world
3/5 world
2/5 world
good bye
%%writefile omp_hello_id.c
#include <stdio.h>
int main() {
printf("hello\n");
#pragma omp parallel
{
printf("world\n");
printf("good bye\n");
}
return 0;
}
clang -fopenmp omp_hello_id.c -o omp_hello_id
# nvc -mp omp_hello_id.c -o omp_hello_id
OMP_NUM_THREADS=3 ./omp_hello_id
Example answer:
%%writefile omp_hello_id_ans.c
#include <stdio.h>
#include <omp.h>
int main() {
printf("hello\n");
#pragma omp parallel
{
int omp_nthreads = omp_get_num_threads();
int omp_rank = omp_get_thread_num();
printf("world %d/%d\n", omp_rank, omp_nthreads);
}
printf("good bye\n");
return 0;
}
clang -fopenmp omp_hello_id_ans.c -o omp_hello_id_ans
# nvc -mp omp_hello_id_ans.c -o omp_hello_id_ans
OMP_NUM_THREADS=3 ./omp_hello_id_ans
6. #pragma omp for¶
- #pragma omp parallel merely creates a team of threads executing the same statement
- In this sense, #pragma omp parallel alone cannot make a program run faster with multiple cores
- A program can be made faster only when you divide the work among threads (work-sharing)
- #pragma omp for lets you divide iterations of a loop among the threads created by #pragma omp parallel
Problem 4 : How does #pragma omp for divide iterations among threads?¶
- Execute the following cell and observe which iteration is executed by which thread
- Based on the observation, change the number of iterations and threads and predict the mapping between iterations and threads
%%writefile omp_for.c
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
int main() {
#pragma omp parallel
{
printf("I am thread %d in a team of %d threads\n",
omp_get_thread_num(), omp_get_num_threads());
#pragma omp for
for (int i = 0; i < 24; i++) {
usleep(100 * 1000 * i);
printf("iteration %d executed by thread %d\n", i, omp_get_thread_num());
fflush(stdout);
}
}
return 0;
}
clang -fopenmp omp_for.c -o omp_for
# nvc -mp omp_for.c -o omp_for
OMP_NUM_THREADS=4 ./omp_for
Example answer:
Observation: the default scheduling policy appears to be static; when executing with $P$ threads, the first $1/P$ of all iterations are executed by thread 0, the next $1/P$ by thread 1, and so on.
6-1. for loops allowed by #pragma omp for¶
- There is a severe syntax restriction on the kind of for loops #pragma omp for can apply to
- See Canonical Loop Form for the spec
- In short, the loop should look like for (var = init; var < limit; var += inc) where init, limit, and inc are all loop-invariant (they do not change throughout the loop); see the sketch below
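To make the restriction concrete, here is a minimal sketch (not part of the original material; the file name omp_canonical.c is just illustrative). The first loop is in canonical form and can be work-shared; the second loop, kept as a comment, is not, because its limit changes inside the body.
%%writefile omp_canonical.c
#include <stdio.h>
#include <omp.h>
int main() {
  int n = 10;
  /* canonical form: init (0), limit (n), and increment (2) are loop-invariant */
  #pragma omp parallel for
  for (int i = 0; i < n; i += 2) {
    printf("iteration %d executed by thread %d\n", i, omp_get_thread_num());
  }
  /* NOT in canonical form: the limit n is modified in the body,
     so #pragma omp for could not be applied to this loop
  for (int i = 0; i < n; i++) {
    if (i == 5) n = 7;
  }
  */
  return 0;
}
clang -fopenmp omp_canonical.c -o omp_canonical
# nvc -mp omp_canonical.c -o omp_canonical
OMP_NUM_THREADS=3 ./omp_canonical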
6-2. Combined pragma (parallel + for)¶
- #pragma omp parallel and #pragma omp for are often used together
- If #pragma omp for immediately follows #pragma omp parallel, they can be combined into the single pragma #pragma omp parallel for
%%writefile omp_parallel_for.c
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
int main() {
double t0 = omp_get_wtime();
#pragma omp parallel for
for (int i = 0; i < 24; i++) {
usleep(100 * 1000 * i); /* sleep 100 x i milliseconds */
printf("iteration %d executed by thread %d\n", i, omp_get_thread_num());
fflush(stdout);
}
double t1 = omp_get_wtime();
printf("%f sec\n", t1 - t0);
return 0;
}
clang -fopenmp omp_parallel_for.c -o omp_parallel_for
# nvc -mp omp_parallel_for.c -o omp_parallel_for
OMP_NUM_THREADS=4 ./omp_parallel_for
7. Scheduling a work-sharing for loop¶
- As you witnessed, the default scheduling policy in our environment (it may be implementation dependent) seems to be static scheduling (assign roughly the same number of contiguous iterations to each thread)
- Is that enough? Clearly, it does not do a good job when iterations take different amounts of time
- You can change the policy with the schedule clause, as sketched below
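As a minimal sketch of the syntax (not from the original material; the file name omp_schedule_demo.c is illustrative), a schedule clause is written directly on the work-sharing pragma; the optional number after the comma is the chunk size.
%%writefile omp_schedule_demo.c
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
int main() {
  /* dynamic scheduling with a chunk size of 2: an idle thread grabs
     the next two unassigned iterations whenever it finishes its current ones */
  #pragma omp parallel for schedule(dynamic, 2)
  for (int i = 0; i < 24; i++) {
    usleep(100 * 1000 * i); /* iterations take different amounts of time */
    printf("iteration %d executed by thread %d\n", i, omp_get_thread_num());
    fflush(stdout);
  }
  return 0;
}
clang -fopenmp omp_schedule_demo.c -o omp_schedule_demo
# nvc -mp omp_schedule_demo.c -o omp_schedule_demo
OMP_NUM_THREADS=4 ./omp_schedule_demo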
7-1. Visualizing scheduling¶
- The program below executes the function iter_fun in a parallel for loop:
#pragma omp parallel for
for (long i = 0; i < L; i++) {
iter_fun(a, b, i, M, N, R, T);
}
- iter_fun(a, b, i, M, N, R, T) repeats x = a x + b many (M * N) times and records the current time every N iterations
%%writefile omp_sched_rec.c
#include <err.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <omp.h>
long cur_time_ns() {
struct timespec ts[1];
if (clock_gettime(CLOCK_REALTIME, ts) == -1) err(1, "clock_gettime");
return ts->tv_sec * 1000000000L + ts->tv_nsec;
}
typedef struct {
double x;
int thread[2];
int cpu[2];
} record_t;
/* the function for an iteration:
   perform x = a x + b (M * N) times,
   record the current time every N iterations to T,
   and record the thread and cpu to R. */
void iter_fun(double a, double b, long i, long M, long N,
record_t * R, long * T) {
// initial value (not important)
double x = i;
// record in T[i * M] ... T[(i+1) * M - 1]
T = &T[i * M];
// record starting thread/cpu
R[i].thread[0] = omp_get_thread_num();
R[i].cpu[0] = sched_getcpu();
// repeat a x + b many times.
// record time every N iterations
for (long j = 0; j < M; j++) {
T[j] = cur_time_ns();
for (long k = 0; k < N; k++) {
x = a * x + b;
}
}
// record ending thread/cpu (the thread must equal thread[0])
R[i].thread[1] = omp_get_thread_num();
R[i].cpu[1] = sched_getcpu();
// record result, just so that the computation is not
// eliminated by the compiler
R[i].x = x;
}
void dump(record_t * R, long * T, long L, long M, long t0) {
long k = 0;
for (long i = 0; i < L; i++) {
printf("i=%ld x=%f thread0=%d cpu0=%d thread1=%d cpu1=%d",
i, R[i].x, R[i].thread[0], R[i].cpu[0], R[i].thread[1], R[i].cpu[1]);
for (long j = 0; j < M; j++) {
printf(" %ld", T[k] - t0);
k++;
}
printf("\n");
}
}
int main(int argc, char ** argv) {
int idx = 1;
long L = (idx < argc ? atol(argv[idx]) : 100); idx++;
long M = (idx < argc ? atol(argv[idx]) : 100); idx++;
long N = (idx < argc ? atol(argv[idx]) : 100); idx++;
double a = (idx < argc ? atof(argv[idx]) : 0.99); idx++;
double b = (idx < argc ? atof(argv[idx]) : 1.00); idx++;
record_t * R = (record_t *)calloc(L, sizeof(record_t));
long * T = (long *)calloc(L * M, sizeof(long));
long t0 = cur_time_ns();
#pragma omp parallel for
for (long i = 0; i < L; i++) {
iter_fun(a, b, i, M, N, R, T);
}
long t1 = cur_time_ns();
printf("%ld nsec\n", t1 - t0);
dump(R, T, L, M, t0);
return 0;
}
clang -fopenmp -D_GNU_SOURCE omp_sched_rec.c -o omp_sched_rec
# nvc -mp omp_sched_rec.c -o omp_sched_rec
OMP_NUM_THREADS=4 ./omp_sched_rec > a.dat
- Execute the following cell to visualize it
- In the graph,
  - the horizontal axis is the time from the start in nanoseconds
  - the vertical axis is the iteration number
  - the color represents the thread that executed the iteration
import sched_vis
sched_vis.sched_plt(["a.dat"])
# sched_vis.sched_plt(["a.dat"], start_t=1.5e7, end_t=2.0e7)
Problem 5 : Understanding scheduling by visualization¶
- Add a schedule clause to the program (schedule(runtime) allows you to set the schedule on the command line); a sketch follows this list
- Change the number of threads and the schedule and observe how iterations are executed
- Set the number of threads very large (higher than the physical number of cores) and see what happens
  - Hint : you can get the number of cores with the nproc command
- In the above program, each iteration performs exactly the same amount of computation (i.e., x = a x + b (M * N) times) and thus takes almost exactly the same time
- See what happens if this is not the case
  - Specifically, make iteration i repeat x = a x + b (M * (i * N)) times (i.e., change the inner loop in iter_fun to for (long k = 0; k < i * N; k++) { ... })
- The sched_plt function below takes optional parameters start_t and end_t specifying the horizontal range to display
- If you zoom very closely into a particular point, you can see individual points and the intervals between them, from which you can deduce how long it takes to perform x = a x + b once
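A minimal sketch of the two modifications suggested above, assuming you edit omp_sched_rec.c shown earlier (these fragments are illustrative, not a complete program):
/* in main(): let the scheduling policy be chosen at run time */
#pragma omp parallel for schedule(runtime)
for (long i = 0; i < L; i++) {
  iter_fun(a, b, i, M, N, R, T);
}
/* in iter_fun(): make the work grow with i, so iterations are non-uniform */
for (long j = 0; j < M; j++) {
  T[j] = cur_time_ns();
  for (long k = 0; k < i * N; k++) { /* was: k < N */
    x = a * x + b;
  }
}
With schedule(runtime), the policy can then be selected per run, e.g. OMP_SCHEDULE="dynamic,1" OMP_NUM_THREADS=4 ./omp_sched_rec > a.dat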
Problem 6 : Specifying the scheduling policy by schedule clause¶
- In the following (artificial) loop, iteration i sleeps for roughly (100 x i) milliseconds, and this is almost exactly the time the iteration takes
- Predict the execution time of the parallel for loop with the default (static) scheduling policy
- Vary the scheduling policy and reason about the resulting execution times
%%writefile omp_schedule.c
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
int main() {
double t0 = omp_get_wtime();
/* ----- add schedule clause below ----- */
#pragma omp parallel for
for (int i = 0; i < 12; i++) {
usleep(100 * 1000 * i); /* sleep 100 x i milliseconds */
printf("iteration %d executed by thread %d\n", i, omp_get_thread_num());
fflush(stdout);
}
double t1 = omp_get_wtime();
printf("%f sec\n", t1 - t0);
return 0;
}
clang -fopenmp omp_schedule.c -o omp_schedule
# nvc -mp omp_schedule.c -o omp_schedule
OMP_NUM_THREADS=4 ./omp_schedule
Predict the execution time with the default (static) policy
Predict the execution time with the dynamic policy
Compare them with what you observed
Explain your reasoning below
YOUR ANSWER HERE
Example answer:
In this loop, later iterations take longer. Therefore,
in the static policy, the execution time of the entire parallel loop will be the execution time of thread 3, which is assigned iterations 9-11; the execution time will therefore be ((9 + 10 + 11) * 100) milliseconds = 3.0 seconds
in the dynamic policy, a crude (and optimistic) estimation, which assumes a perfect load balancing, is simply the serial execution time divided by the number of threads, which is ((0 + 1 + ... + 11) * 100) / 4 milliseconds = 1.65 seconds.
This crude approximation isn't quite accurate in this example, however. To be more precise in this particular case, the likely scenario is the following (each digit represents the last digit of an iteration number).
[0] 444488888888
[1] 155555999999999
[2] 226666660000000000
[3] 333777777711111111111
In this case, the execution time will be (3 + 7 + 11) * 100 milliseconds = 2.1 seconds
8. Collapse clause¶
- #pragma omp for can specify a collapse clause to apply work-sharing to a limited class of nested loops
- With #pragma omp for collapse(2), OpenMP considers the doubly-nested loop that comes after this pragma the subject of work-sharing (i.e., it distributes iterations of the doubly-nested loop to threads); you must have a perfectly-nested, rectangular, doubly-nested loop after this pragma
- A perfectly-nested loop is a nested loop whose outer loops (all loops except for the innermost one) do not have any statement other than the inner loop, e.g.
for (i = 0; i < 100; i++) {
for (j = 0; j < 100; j++) {
S(i,j);
}
}
is perfectly nested whereas
for (i = 0; i < 100; i++) {
S;
for (j = 0; j < 100; j++) {
T;
}
}
is not.
- A perfectly-nested loop is conceptually equivalent to a flat loop, via a mechanical transformation.
for (ij = 0; ij < 100 * 100; ij++) {
i = ij / 100;
j = ij % 100;
S(i,j);
}
- A rectangular loop is a nested loop in which the iteration counts of inner loops never depend on outer loop variables. For example,
for (i = 0; i < 100; i++) {
for (j = 0; j < i; j++) {
S(i,j);
}
}
is not a rectangular loop.
- Generally speaking, OpenMP #pragma omp parallel + #pragma omp for cannot handle nested parallelism very well, but the collapse clause alleviates the problem to some extent
- Consider using tasks (below) for more general forms of nested parallelism
Problem 7 : Apply collapse and schedule¶
- Apply collapse and schedule to the following loop
- Trick: write schedule(runtime) and you can change the scheduling policy at execution time by setting the environment variable OMP_SCHEDULE= on the command line (see the usage sketch below and the OMP_SCHEDULE environment variable for details)
- Reason about the execution time of various schedule policies, with and without collapse
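A minimal usage sketch (assuming the executable omp_collapse built from the cell below, with schedule(runtime) applied): the policy and an optional chunk size are chosen per run via OMP_SCHEDULE.
OMP_SCHEDULE="static" OMP_NUM_THREADS=3 ./omp_collapse
OMP_SCHEDULE="dynamic,1" OMP_NUM_THREADS=3 ./omp_collapse
OMP_SCHEDULE="guided" OMP_NUM_THREADS=3 ./omp_collapse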
%%writefile omp_collapse.c
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
int main() {
double t0 = omp_get_wtime();
/* apply collapse and schedule */
#pragma omp parallel for
for (int i = 0; i < 5; i++) {
for (int j = 0; j < 5; j++) {
usleep(100 * 1000 * (i + j));
printf("iteration (%d, %d) executed by thread %d\n", i, j, omp_get_thread_num());
fflush(stdout);
}
}
double t1 = omp_get_wtime();
printf("%f sec\n", t1 - t0);
return 0;
}
clang -fopenmp omp_collapse.c -o omp_collapse
# nvc -mp omp_collapse.c -o omp_collapse
OMP_NUM_THREADS=3 ./omp_collapse
Example answer:
%%writefile omp_collapse_ans.c
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
int main() {
double t0 = omp_get_wtime();
/* apply collapse and schedule */
#pragma omp parallel for collapse(2) schedule(runtime)
for (int i = 0; i < 5; i++) {
for (int j = 0; j < 5; j++) {
usleep(100 * 1000 * (i + j));
printf("iteration (%d, %d) executed by thread %d\n", i, j, omp_get_thread_num());
fflush(stdout);
}
}
double t1 = omp_get_wtime();
printf("%f sec\n", t1 - t0);
return 0;
}
clang -fopenmp omp_collapse_ans.c -o omp_collapse_ans
# nvc -mp omp_collapse_ans.c -o omp_collapse_ans
OMP_NUM_THREADS=3 ./omp_collapse_ans
9. Task parallelism¶
A task is a more general mechanism to extract parallelism and distribute computation (called a task) dynamically to threads in a team created by #pragma omp parallel
A thread can create a task at any point in the execution of a parallel region, and tasks are dispatched to available threads at runtime
As a thread can create a task at any point, a task can create another task. That is, parallelism can be arbitrarily nested and the number of tasks can be difficult to predict (unlike the number of iterations of a for loop)
A common pattern:
- enter a parallel region with #pragma omp parallel
- ensure the statement is executed by only a single (root) thread with #pragma omp master
- create tasks at any point with #pragma omp task
- a task waits for the tasks it created to finish with #pragma omp taskwait
Let's see the effect of #pragma omp master first (without creating any task)
%%writefile omp_master.c
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
int main() {
double t0 = omp_get_wtime();
#pragma omp parallel
{
#pragma omp master
printf("inside the master pragma: I am thread %d of a team of %d threads\n",
omp_get_thread_num(), omp_get_num_threads());
printf("out of the master pragma: I am thread %d of a team of %d threads\n",
omp_get_thread_num(), omp_get_num_threads());
}
double t1 = omp_get_wtime();
printf("%f sec\n", t1 - t0);
return 0;
}
clang -fopenmp omp_master.c -o omp_master
# nvc -mp omp_master.c -o omp_master
OMP_NUM_THREADS=3 ./omp_master
- Since this is a common idiom, the two can be combined into one pragma (#pragma omp parallel master)
  - This feature is not supported by the NVIDIA compiler, however
- The program below creates a parallel region that is executed in its entirety only by the master thread; it does not serve any useful purpose and is merely a demonstration of the feature
%%writefile omp_parallel_master.c
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
int main() {
double t0 = omp_get_wtime();
#pragma omp parallel master
printf("I am thread %d of a team of %d threads\n",
omp_get_thread_num(), omp_get_num_threads());
double t1 = omp_get_wtime();
printf("%f sec\n", t1 - t0);
return 0;
}
clang -fopenmp omp_parallel_master.c -o omp_parallel_master
# NVIDIA compiler does not support this program
# nvc -mp omp_parallel_master.c -o omp_parallel_master
OMP_NUM_THREADS=3 ./omp_parallel_master
- Let's create a few tasks now
%%writefile omp_task.c
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
int main() {
double t0 = omp_get_wtime();
#pragma omp parallel
#pragma omp master
{
printf("I am thread %d of a team of %d threads\n",
omp_get_thread_num(), omp_get_num_threads());
#pragma omp task
{
printf("task A executed by %d of %d\n", omp_get_thread_num(), omp_get_num_threads());
usleep(500 * 1000);
}
#pragma omp task
{
printf("task B executed by %d of %d\n", omp_get_thread_num(), omp_get_num_threads());
usleep(1000 * 1000);
}
#pragma omp taskwait
printf("two tasks done, executed by %d of %d\n", omp_get_thread_num(), omp_get_num_threads());
}
double t1 = omp_get_wtime();
printf("%f sec\n", t1 - t0);
return 0;
}
clang -fopenmp omp_task.c -o omp_task
# nvc -mp omp_task.c -o omp_task
OMP_NUM_THREADS=3 ./omp_task
- Tasks are particularly good at parallel recursions, as the following program demonstrates
- This is a common pattern that appears in many algorithms, particularly divide-and-conquer algorithms
%%writefile omp_rec_task.c
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
void recursive_tasks(int n, int tid) {
printf("task %d by %d of %d\n",
tid, omp_get_thread_num(), omp_get_num_threads());
fflush(stdout);
if (n == 0) {
usleep(300 * 1000);
} else {
#pragma omp task
recursive_tasks(n - 1, 2 * tid + 1);
#pragma omp task
recursive_tasks(n - 1, 2 * tid + 2);
#pragma omp taskwait
}
}
int main() {
double t0 = omp_get_wtime();
#pragma omp parallel
#pragma omp master
{
recursive_tasks(5, 0);
}
double t1 = omp_get_wtime();
printf("%f sec\n", t1 - t0);
return 0;
}
clang -fopenmp omp_rec_task.c -o omp_rec_task
# nvc -mp omp_rec_task.c -o omp_rec_task
OMP_NUM_THREADS=10 ./omp_rec_task
Problem 8 : A quiz about recursive tasks¶
- Answer the following questions
  - How many tasks are created by recursive_tasks(n, 0)? Include the caller of recursive_tasks(n, 0) as a task, i.e., consider that recursive_tasks(0, 0) creates one task
  - How many of them are leaf tasks?
  - Express them in terms of $n$
YOUR ANSWER HERE
Example answer:
Let $T(n)$ be the number of tasks created (i.e., the number of times #pragma omp task is encountered) during the execution of recursive_tasks(n, 0). Then,
- $T(0) = 0$
- $T(n) = 2 + 2T(n - 1)$
Therefore, $T(n) = 2^{n+1} - 2$. Counting the caller of recursive_tasks(n, 0) as a task, the answer is $2^{n+1} - 1$.
Let $L(n)$ be the number of leaf tasks created by recursive_tasks(n, 0). Then,
- $L(0) = 1$
- $L(n) = 2 L(n - 1)$
Therefore, the number of leaf tasks $= L(n) = 2^n$
- Approximately what is the ideal execution time of recursive_tasks(5, 0) when using 10 threads?
- Compare it with what you observed
YOUR ANSWER HERE
Example answer:
Since a leaf task takes 300 milliseconds, the time spent in non-leaf tasks is negligible. The $2^5 = 32$ leaf tasks will be distributed among 10 threads, so assuming good load balancing, two of the ten threads will execute four leaf tasks and the other eight threads three leaf tasks each. The execution time will therefore be about $300 \times 4$ milliseconds = 1.2 seconds.
- Name an algorithm or two for which recursive tasks would be useful for parallelization, and explain why you think so
YOUR ANSWER HERE
Example answer:
Any algorithm that is naturally expressed with recursive calls that can run in parallel will benefit from task parallelism. To name a few,
- quicksort
- mergesort
- fast Fourier transform
- matrix matrix multiplication (formulated with divide-and-conquer for locality)
- sparse matrix-vector multiplication (formulated with divide-and-conquer for locality)
- $N$-body simulation
- KD-tree construction
10. Taskloop¶
- As you can easily imagine, tasks can handle general nested loops, given that they can handle recursion
- Recent OpenMP actually has a construct just for that: #pragma omp taskloop
  - This feature is not supported by the NVIDIA compiler
- Here is a demonstration showing it can handle non-perfectly-nested loops
%%writefile omp_taskloop.c
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
int main() {
double t0 = omp_get_wtime();
#pragma omp parallel
#pragma omp master
#pragma omp taskloop
for (int i = 0; i < 5; i++) {
printf("i = %d starts\n", i);
fflush(stdout);
#pragma omp taskloop
for (int j = 0; j < 5; j++) {
usleep(100 * 1000 * (i + j));
printf("iteration (%d, %d) executed by thread %d\n", i, j, omp_get_thread_num());
fflush(stdout);
}
}
double t1 = omp_get_wtime();
printf("%f sec\n", t1 - t0);
return 0;
}
clang -fopenmp omp_taskloop.c -o omp_taskloop
# nvc -mp omp_taskloop.c -o omp_taskloop
OMP_NUM_THREADS=3 ./omp_taskloop
NOTE:
- Implementing tasks requires a more general mechanism than the work-sharing for construct; the former must be able to distribute tasks generated in the course of execution, whereas the latter merely needs to distribute iterations that are easily identifiable at the point of entering #pragma omp for, thanks to the "canonical form" restriction
- Task scheduling is always dynamic, whereas work-sharing for (particularly with static scheduling) gives you more control and predictability over which thread executes which iteration
- This is one reason why two seemingly redundant mechanisms exist, besides the historical reason that OpenMP initially had no tasking construct
11. Data sharing¶
- OpenMP is a shared memory programming model, which means threads see each other's updates
- That is, when a thread updates a variable x that is then read by another thread, the reader will see the updated value
- This is the default behavior of OpenMP (see Data Environment in the OpenMP spec)
- It is not always convenient, however
- #pragma omp parallel can thus specify whether local variables in the enclosing scope (i.e., defined outside the statement) are privatized (i.e., made private to each thread)
Problem 9 : Observe the effect of privatization¶
- Execute the following and make sense of the output
- Add the private(x) clause and observe the difference
%%writefile omp_private.c
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
int main() {
int x = 123;
printf("before : x = %d\n", x);
/* add private(x) clause below and see the difference */
#pragma omp parallel
{
int id = omp_get_thread_num();
printf("thread %d : x = %d\n", id, x);
}
printf("after : x = %d\n", x);
return 0;
}
clang -fopenmp omp_private.c -o omp_private
# nvc -mp omp_private.c -o omp_private
OMP_NUM_THREADS=10 ./omp_private
Example answer:
%%writefile omp_private_ans.c
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
int main() {
int x = 123;
printf("before : x = %d\n", x);
/* add private(x) clause below and see the difference */
#pragma omp parallel private(x)
{
int id = omp_get_thread_num();
printf("thread %d : x = %d\n", id, x);
}
printf("after : x = %d\n", x);
return 0;
}
- private(x) essentially ignores the original variable x defined outside the parallel region and behaves as if each thread defined a new variable of the same name
- firstprivate(x) is like private(x), except that each thread's x is initialized with the value of x just before entering the parallel region
Problem 10 : Observe the effect of private and firstprivate¶
- Execute the following and observe the output
- Add the private(x) clause and execute it
- Add the firstprivate(x) clause and execute it
- Make sense of the differences
%%writefile omp_firstprivate.c
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
int main() {
int x = 123;
printf("before : x = %d\n", x);
/* add private(x)/firstprivate(x) clause and see the difference */
#pragma omp parallel
{
int id = omp_get_thread_num();
x++;
printf("thread %d : x = %d\n", id, x);
}
printf("after : x = %d\n", x);
return 0;
}
clang -fopenmp omp_firstprivate.c -o omp_firstprivate
# nvc -mp omp_firstprivate.c -o omp_firstprivate
OMP_NUM_THREADS=3 ./omp_firstprivate
Example answer:
%%writefile omp_firstprivate_ans.c
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
int main() {
int x = 123;
printf("before : x = %d\n", x);
/* add private(x)/firstprivate(x) clause and see the difference */
#pragma omp parallel private(x)
{
int id = omp_get_thread_num();
x++;
printf("thread %d : x = %d\n", id, x);
}
printf("after : x = %d\n", x);
return 0;
}
%%writefile omp_firstprivate_ans.c
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
int main() {
int x = 123;
printf("before : x = %d\n", x);
/* add private(x)/firstprivate(x) clause and see the difference */
#pragma omp parallel firstprivate(x)
{
int id = omp_get_thread_num();
x++;
printf("thread %d : x = %d\n", id, x);
}
printf("after : x = %d\n", x);
return 0;
}
12. Race condition¶
- Execute the above code without private or firstprivate many times
- Observe that the value of x after the parallel region is not always 123 + the number of threads executing the region, and may even differ across runs
- For example, with two threads, the following execution order may cause such a behavior
  - thread A reads 123
  - thread B reads 123
  - thread A writes 124
  - thread B writes 124
- A similar case occurs whenever a thread's read-followed-by-write is intervened by another thread's update
- More generally, the following situation is called a "race condition"; if there is a race condition in your program, it almost always means your program is broken
  - two or more threads concurrently access the same variable, and
  - at least one of them writes to it
- Here, "concurrently access" means the accesses are not guaranteed to be separated in time by a synchronization primitive
- In all but trivial parallel programs, threads need to communicate with each other to accomplish a task
- Threads communicate by having one thread write to a variable and having another read it
- If we simply do this without any mechanism guaranteeing that the accesses are separated in time, it is a race
- Below, we describe three ways to safely communicate among threads without creating race conditions
  - #pragma omp critical
  - #pragma omp atomic
  - reduction
13. #pragma omp critical¶
- #pragma omp critical guarantees that executions of the statement following the pragma do not overlap in time across threads
Problem 11 : Apply #pragma omp critical¶
- Execute the following program a few times and observe that the result is nondeterministic and often not what we want (i.e., 123 + the number of threads)
- Then, add #pragma omp critical to the statement x++ and see the result
%%writefile omp_critical.c
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
int main() {
int x = 123;
printf("before : x = %d\n", x);
#pragma omp parallel
{
int id = omp_get_thread_num();
x++;
}
printf("after : x = %d\n", x);
return 0;
}
clang -fopenmp omp_critical.c -o omp_critical
# nvc -mp omp_critical.c -o omp_critical
OMP_NUM_THREADS=100 ./omp_critical
Example answer:
%%writefile omp_critical_ans.c
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
int main() {
int x = 123;
printf("before : x = %d\n", x);
#pragma omp parallel
{
int id = omp_get_thread_num();
#pragma omp critical
x++;
}
printf("after : x = %d\n", x);
return 0;
}
clang -fopenmp omp_critical_ans.c -o omp_critical_ans
# nvc -mp omp_critical_ans.c -o omp_critical_ans
OMP_NUM_THREADS=100 ./omp_critical_ans
14. #pragma omp atomic¶
- #pragma omp atomic is similar to #pragma omp critical, but its effect is slightly different and its applicability is limited (see below)
Problem 12 : Apply #pragma omp atomic¶
- Add #pragma omp atomic to the statement x++ and see the result
%%writefile omp_atomic.c
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
int main() {
int x = 123;
printf("before : x = %d\n", x);
#pragma omp parallel
{
int id = omp_get_thread_num();
x++;
}
printf("after : x = %d\n", x);
return 0;
}
clang -fopenmp omp_atomic.c -o omp_atomic
# nvc -mp omp_atomic.c -o omp_atomic
OMP_NUM_THREADS=100 ./omp_atomic
Example answer:
%%writefile omp_atomic_ans.c
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
int main() {
int x = 123;
printf("before : x = %d\n", x);
#pragma omp parallel
{
int id = omp_get_thread_num();
#pragma omp atomic
x++;
}
printf("after : x = %d\n", x);
return 0;
}
clang -fopenmp omp_atomic_ans.c -o omp_atomic_ans
# nvc -mp omp_atomic_ans.c -o omp_atomic_ans
OMP_NUM_THREADS=100 ./omp_atomic_ans
- The statement that follows this pragma cannot be an arbitrary statement
- See the atomic Construct for the spec
- Typically, it is an update to a variable, such as x += expr;
- What is guaranteed by #pragma omp atomic is different from what #pragma omp critical guarantees
  - #pragma omp atomic applied to x += expr; guarantees that the read and write of x are never intervened by another update labeled #pragma omp atomic
  - whereas #pragma omp critical applied to x += expr; guarantees that the entire statement x += expr does not overlap with another statement labeled critical
- When applicable, #pragma omp atomic is more efficient than #pragma omp critical because the evaluation of expr can overlap (a sketch contrasting the two follows)
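To illustrate that last point, here is a minimal sketch (not from the original notebook; expensive_term() and the file name are hypothetical stand-ins): with atomic, only the update of x is protected and the expression can be evaluated by all threads concurrently, whereas with critical the whole statement, including the expression evaluation, is serialized.
%%writefile omp_atomic_vs_critical.c
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
/* hypothetical costly expression; the sleep makes its cost visible */
int expensive_term(int i) {
  usleep(100 * 1000);
  return i;
}
int main() {
  int x = 0, y = 0;
  double t0 = omp_get_wtime();
  #pragma omp parallel for
  for (int i = 0; i < 8; i++) {
    /* only the read-modify-write of x is protected;
       expensive_term(i) may run concurrently on all threads */
    #pragma omp atomic
    x += expensive_term(i);
  }
  double t1 = omp_get_wtime();
  #pragma omp parallel for
  for (int i = 0; i < 8; i++) {
    /* the entire statement, including expensive_term(i), is serialized */
    #pragma omp critical
    y += expensive_term(i);
  }
  double t2 = omp_get_wtime();
  printf("x = %d (atomic, %f sec), y = %d (critical, %f sec)\n", x, t1 - t0, y, t2 - t1);
  return 0;
}
clang -fopenmp omp_atomic_vs_critical.c -o omp_atomic_vs_critical
# nvc -mp omp_atomic_vs_critical.c -o omp_atomic_vs_critical
OMP_NUM_THREADS=8 ./omp_atomic_vs_critical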
15. Reduction clause¶
- Reduction is the best way to resolve race conditions where applicable, and it often is applicable
- It is applicable when the threads together calculate $v = v_0 \oplus v_1 \oplus \cdots \oplus v_{n-1}$, where each $v_i$ can be computed independently and $\oplus$ is an associative operator (such as +)
- In a serial loop, this could be written as
v = initial value;
for (i = 0; i < n; i++) {
v_i = ...
v = v + v_i;
}
- If we parallelize the above loop, updating $v$ results in a race condition
- This can be safely parallelized by introducing reduction(+ : v), as sketched below
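A minimal sketch of the parallelized pattern (illustrative only, not the answer to the problem below; here $v_i$ is simply $i^2$ and $\oplus$ is +, and the file name is hypothetical):
%%writefile omp_reduction_sketch.c
#include <stdio.h>
#include <omp.h>
int main() {
  long n = 1000;
  long v = 0; /* initial value */
  /* each thread accumulates into its own private copy of v;
     the partial sums are combined once at the end */
  #pragma omp parallel for reduction(+ : v)
  for (long i = 0; i < n; i++) {
    long v_i = i * i; /* v_i can be computed independently */
    v = v + v_i;
  }
  printf("v = %ld\n", v); /* always 332833500, regardless of the thread count */
  return 0;
}
clang -fopenmp omp_reduction_sketch.c -o omp_reduction_sketch
# nvc -mp omp_reduction_sketch.c -o omp_reduction_sketch
OMP_NUM_THREADS=4 ./omp_reduction_sketch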
Problem 13 : Apply reduction¶
- Add a reduction clause to #pragma omp parallel below and observe the result
%%writefile omp_reduction.c
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
int main() {
int x = 123;
printf("before : x = %d\n", x);
#pragma omp parallel
{
int id = omp_get_thread_num();
x++;
}
printf("after : x = %d\n", x);
return 0;
}
clang -fopenmp omp_reduction.c -o omp_reduction
# nvc -mp omp_reduction.c -o omp_reduction
OMP_NUM_THREADS=100 ./omp_reduction
Example answer:
%%writefile omp_reduction_ans.c
#include <stdio.h>
#include <unistd.h>
#include <omp.h>
int main() {
int x = 123;
printf("before : x = %d\n", x);
#pragma omp parallel reduction(+:x)
{
int id = omp_get_thread_num();
x++;
}
printf("after : x = %d\n", x);
return 0;
}
clang -fopenmp omp_reduction_ans.c -o omp_reduction_ans
# nvc -mp omp_reduction_ans.c -o omp_reduction_ans
OMP_NUM_THREADS=10 ./omp_reduction_ans
16. How reduction clause works and why it is preferable when applicable¶
- Where applicable, reduction is generally much faster than using #pragma omp atomic or #pragma omp critical
- This is because, internally, each thread computes its partial result in a private variable, and the partial results are combined only once at the end
- That is, each update, e.g., x += expr, updates a thread's private version of the variable x instead of updating the shared variable
- Omitting details, you can think of the reduction clause as converting something like
int x = G;
#pragma omp parallel reduction(+ : x)
{
... x += expr; ...
}
into something like
int x = G;
#pragma omp parallel
{
int x_priv = 0; // (I) initialize a private version of x
{
... x_priv += expr; ...
}
#pragma omp atomic
x += x_priv; // (C) combine the partial results in the private variable into the global variable
}
- This is valid because of the associativity of the operation
17. User-defined reduction¶
- Reduction is a general concept for efficiently executing many computations of the $v_i$'s in parallel, when the final outcome we wish to compute is $v = v_0 \oplus v_1 \oplus \cdots \oplus v_{n-1}$
- It is applicable whenever the order of combining partial results via $\oplus$ does not affect the final outcome (e.g., +)
- Yet the builtin reduction clause of OpenMP can only specify a few builtin operations on a few builtin types (e.g., int, float, etc.)
- You sometimes want to apply the efficient execution mechanism of reduction to more general types (perhaps types you defined)
- User-defined reduction exists exactly for that
- You need to define an expression to
- (C) combine two partial results into one (more specifically, combine a partial result assumed to be in a variable omp_in into another variable omp_out)
- (I) initialize a thread-private version of the variable to which reduction is applied, named omp_priv
- For example, in the case of the builtin + operator (the general form is sketched after this list),
- (C) would be omp_out += omp_in
- (I) would be omp_priv = 0
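A minimal sketch of the general form (the reduction identifier my_add and the file name are illustrative, not from the original notebook): the combiner expression is written in terms of omp_out and omp_in, and the initializer clause initializes omp_priv.
%%writefile omp_declare_reduction_sketch.c
#include <stdio.h>
#include <omp.h>
/* declare a reduction named my_add over long:
   (C) combiner:    omp_out += omp_in
   (I) initializer: omp_priv = 0 */
#pragma omp declare reduction \
    (my_add : long : omp_out += omp_in) \
    initializer(omp_priv = 0)
int main() {
  long s = 0;
  #pragma omp parallel for reduction(my_add : s)
  for (long i = 0; i < 1000; i++) {
    s += i;
  }
  printf("s = %ld\n", s); /* always 499500 */
  return 0;
}
clang -fopenmp omp_declare_reduction_sketch.c -o omp_declare_reduction_sketch
# nvc -mp omp_declare_reduction_sketch.c -o omp_declare_reduction_sketch
OMP_NUM_THREADS=4 ./omp_declare_reduction_sketch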
17-1. Apply a user-defined reduction¶
- Here is a simple (broken) parallel for loop that is meant to do a reduction on a 3-element vector
- Define a reduction with #pragma omp declare reduction and apply it to the parallel loop
%%writefile omp_ud_reduction.c
#include <stdio.h>
#include <unistd.h>
#include <math.h>
#include <omp.h>
/* 3-element vector */
typedef struct {
double a[3];
} vec_t;
/* x += y */
void vec_add(vec_t * x, vec_t * y) {
for (int i = 0; i < 3; i++) {
x->a[i] += y->a[i];
}
}
/* x = {0,0,0} */
void vec_init(vec_t * x) {
for (int i = 0; i < 3; i++) {
x->a[i] = 0;
}
}
/* add an appropriate #pragma omp declare reduction ... here */
int main() {
vec_t v;
vec_init(&v);
double t0 = omp_get_wtime();
/* add an appropriate reduction clause, so that
the result is always {10000,10000,10000} */
#pragma omp parallel for
for (int i = 0; i < 30000; i++) {
v.a[i % 3]++;
}
double t1 = omp_get_wtime();
printf("ans = {%.1f, %.1f, %.1f} in %f sec\n", v.a[0], v.a[1], v.a[2], t1 - t0);
return 0;
}
clang -fopenmp omp_ud_reduction.c -o omp_ud_reduction
# nvc -mp omp_ud_reduction.c -o omp_ud_reduction
OMP_NUM_THREADS=10 ./omp_ud_reduction
%%writefile omp_ud_reduction_ans.c
#include <stdio.h>
#include <unistd.h>
#include <math.h>
#include <omp.h>
/* 3-element vector */
typedef struct {
double a[3];
} vec_t;
/* x += y */
void vec_add(vec_t * x, vec_t * y) {
for (int i = 0; i < 3; i++) {
x->a[i] += y->a[i];
}
}
/* x = {0,0,0} */
void vec_init(vec_t * x) {
for (int i = 0; i < 3; i++) {
x->a[i] = 0;
}
}
#pragma omp declare reduction \
(vp : vec_t : vec_add(&omp_out,&omp_in)) \
initializer(vec_init(&omp_priv))
/* add an appropriate #pragma omp declare reduction ... here */
int main() {
vec_t v;
vec_init(&v);
double t0 = omp_get_wtime();
/* add an appropriate reduction clause, so that
the result is always {10000,10000,10000} */
#pragma omp parallel for reduction(vp:v)
for (int i = 0; i < 30000; i++) {
v.a[i % 3]++;
}
double t1 = omp_get_wtime();
printf("ans = {%.1f, %.1f, %.1f} in %f sec\n", v.a[0], v.a[1], v.a[2], t1 - t0);
return 0;
}
clang -fopenmp omp_ud_reduction_ans.c -o omp_ud_reduction_ans
# nvc -mp omp_ud_reduction_ans.c -o omp_ud_reduction_ans
OMP_NUM_THREADS=10 ./omp_ud_reduction_ans
Problem 14 : Putting them together: calculating an integral¶
Write an OpenMP program that calculates
$$ \int \int_D \sqrt{1 - x^2 - y^2}\,dx\,dy $$
where
$$ D = \{\;(x, y)\;|\;0\leq x \leq 1, 0\leq y \leq 1, x^2 + y^2 \leq 1 \}$$
- Note: an alternative way to put it is to calculate
$$ \int_0^1 \int_0^1 f(x, y)\,dx\,dy $$
where
$$ f(x, y) = \left\{\begin{array}{ll}\sqrt{1 - x^2 - y^2} & (x^2 + y^2 \leq 1) \\ 0 & (\mbox{otherwise}) \end{array}\right. $$
Use a nested loop to calculate the double integral
Try work-sharing for, taskloop, and recursive tasks to parallelize it
The result should be close to $\pi/6 = 0.52359...$ (1/8 of the volume of the unit ball)
Play with the number of infinitesimal intervals for integration and the number of threads so that you can observe a speedup
As you are using a shared cloud environment, you do not have to be serious about the speedup (a nearly perfect speedup is unlikely when other students are simultaneously using the same machine and/or the cloud is doing much other work, e.g., serving the page you are looking at right now)
If you want to work with an editor you are accustomed to rather than the web browser, see this page
%%writefile omp_integral.c
Example answer 1: parallelize the outer loop only
%%writefile omp_integral_ans.c
#include <err.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
long cur_time() {
struct timespec ts[1];
clock_gettime(CLOCK_REALTIME, ts);
return ts->tv_sec * 1000000000L + ts->tv_nsec;
}
typedef struct {
long t;
int thread;
} record_t;
enum { A = 32 };
/* parallel for outerloop */
double int_sqrt_one_minus_x2_y2(long n, record_t R[(n+A-1)/A][(n+A-1)/A]) {
(void)R;
double h = 1.0 / n;
double s = 0.0;
#pragma omp parallel for reduction(+:s) schedule(runtime)
for (long i = 0; i < n; i++) {
for (long j = 0; j < n; j++) {
double x = i * h ;
double y = j * h;
double z = 1 - x * x - y * y;
if (z > 0.0) {
s += sqrt(z);
} else {
break;
}
}
}
return s * h * h;
}
int main(int argc, char ** argv) {
int i = 1;
long n = (argc > i ? atof(argv[i]) : 30L * 1000L); i++;
printf("n = %ld (%ld points to evaluate integrand on)\n", n, n * n);
record_t (*R)[] = (record_t (*)[])0;
long t0 = cur_time();
double s = int_sqrt_one_minus_x2_y2(n, R);
long t1 = cur_time();
long dt = t1 - t0;
printf("%.3f sec\n", dt * 1.0e-9);
printf("s = %.9f (err = %e)\n", s, fabs(s - M_PI/6));
return 0;
}
Example answer 2: parallelize the nested loop with collapse
%%writefile omp_integral_ans.c
#include <err.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
long cur_time() {
struct timespec ts[1];
clock_gettime(CLOCK_REALTIME, ts);
return ts->tv_sec * 1000000000L + ts->tv_nsec;
}
typedef struct {
long t;
int thread;
} record_t;
enum { A = 32 };
/* parallel for both loops */
double int_sqrt_one_minus_x2_y2(long n, record_t R[(n+A-1)/A][(n+A-1)/A]) {
(void)R;
double h = 1.0 / n;
double s = 0.0;
#pragma omp parallel for collapse(2) reduction(+:s) schedule(runtime)
for (long i = 0; i < n; i++) {
for (long j = 0; j < n; j++) {
double x = i * h ;
double y = j * h;
double z = 1 - x * x - y * y;
if (z > 0.0) {
s += sqrt(z);
}
}
}
return s * h * h;
}
int main(int argc, char ** argv) {
int i = 1;
long n = (argc > i ? atof(argv[i]) : 30L * 1000L); i++;
printf("n = %ld (%ld points to evaluate integrand on)\n", n, n * n);
record_t (*R)[] = (record_t (*)[])0;
long t0 = cur_time();
double s = int_sqrt_one_minus_x2_y2(n, R);
long t1 = cur_time();
long dt = t1 - t0;
printf("%.3f sec\n", dt * 1.0e-9);
printf("s = %.9f (err = %e)\n", s, fabs(s - M_PI/6));
return 0;
}
Example answer 3: use task loop
%%writefile omp_integral_ans.c
#include <err.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
long cur_time() {
struct timespec ts[1];
clock_gettime(CLOCK_REALTIME, ts);
return ts->tv_sec * 1000000000L + ts->tv_nsec;
}
typedef struct {
long t;
int thread;
} record_t;
enum { A = 32 };
/* taskloops */
double int_sqrt_one_minus_x2_y2(long n, record_t R[(n+A-1)/A][(n+A-1)/A]) {
(void)R;
double h = 1.0 / n;
double s = 0.0;
#pragma omp parallel
#pragma omp master
#pragma omp taskloop collapse(2) reduction(+:s)
for (long i = 0; i < n; i++) {
for (long j = 0; j < n; j++) {
double x = i * h ;
double y = j * h;
double z = 1 - x * x - y * y;
if (z > 0.0) {
s += sqrt(z);
}
}
}
return s * h * h;
}
int main(int argc, char ** argv) {
int i = 1;
long n = (argc > i ? atof(argv[i]) : 30L * 1000L); i++;
printf("n = %ld (%ld points to evaluate integrand on)\n", n, n * n);
record_t (*R)[] = (record_t (*)[])0;
long t0 = cur_time();
double s = int_sqrt_one_minus_x2_y2(n, R);
long t1 = cur_time();
long dt = t1 - t0;
printf("%.3f sec\n", dt * 1.0e-9);
printf("s = %.9f (err = %e)\n", s, fabs(s - M_PI/6));
return 0;
}
Example answer 4: use tasks
%%writefile omp_integral_ans.c
#include <err.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
long cur_time() {
struct timespec ts[1];
clock_gettime(CLOCK_REALTIME, ts);
return ts->tv_sec * 1000000000L + ts->tv_nsec;
}
typedef struct {
long t;
int thread;
} record_t;
enum { A = 32 };
/* task */
typedef struct {
long x0;
long y0;
long dx;
long dy;
} reg_t;
enum { threshold = 10000 };
double int_sqrt_one_minus_x2_y2_rec(long n, reg_t r,
record_t R[(n+A-1)/A][(n+A-1)/A]) {
(void)R;
if (r.dx * r.dy < threshold) {
double h = 1.0 / n;
double s = 0.0;
for (long i = r.x0; i < r.x0 + r.dx; i++) {
for (long j = r.y0; j < r.y0 + r.dy; j++) {
double x = i * h ;
double y = j * h;
double z = 1 - x * x - y * y;
if (z > 0.0) {
s += sqrt(z);
}
}
}
return s * h * h;
} else if (r.dy < r.dx) {
long dx = r.dx;
reg_t r0 = { r.x0, r.y0, dx / 2, r.dy };
reg_t r1 = { r.x0 + dx / 2, r.y0, r.dx - dx / 2, r.dy };
double s0 = 0.0;
double s1 = 0.0;
#pragma omp task shared(s0)
s0 = int_sqrt_one_minus_x2_y2_rec(n, r0, R);
#pragma omp task shared(s1)
s1 = int_sqrt_one_minus_x2_y2_rec(n, r1, R);
#pragma omp taskwait
return s0 + s1;
} else {
long dy = r.dy;
reg_t r0 = { r.x0, r.y0, r.dx, dy / 2 };
reg_t r1 = { r.x0, r.y0 + dy / 2, r.dx, r.dy - dy / 2 };
double s0 = 0.0;
double s1 = 0.0;
#pragma omp task shared(s0)
s0 = int_sqrt_one_minus_x2_y2_rec(n, r0, R);
#pragma omp task shared(s1)
s1 = int_sqrt_one_minus_x2_y2_rec(n, r1, R);
#pragma omp taskwait
return s0 + s1;
}
}
double int_sqrt_one_minus_x2_y2(long n, record_t R[(n+A-1)/A][(n+A-1)/A]) {
reg_t r = { 0, 0, n, n };
double s = 0.0;
#pragma omp parallel
#pragma omp master
s = int_sqrt_one_minus_x2_y2_rec(n, r, R);
return s;
}
int main(int argc, char ** argv) {
int i = 1;
long n = (argc > i ? atof(argv[i]) : 30L * 1000L); i++;
printf("n = %ld (%ld points to evaluate integrand on)\n", n, n * n);
record_t (*R)[] = (record_t (*)[])0;
long t0 = cur_time();
double s = int_sqrt_one_minus_x2_y2(n, R);
long t1 = cur_time();
long dt = t1 - t0;
printf("%.3f sec\n", dt * 1.0e-9);
printf("s = %.9f (err = %e)\n", s, fabs(s - M_PI/6));
return 0;
}
Remarks:
- The first question is whether or not you use collapse to parallelize the inner loop
- If you do, using the break statement in the inner loop becomes invalid, so you cannot break out of the inner loop even when $(1 - x^2 - y^2)$ becomes $< 0$
- If you do not, it is safe to do so, and in fact you want to break as soon as $(1 - x^2 - y^2)$ becomes $< 0$, to avoid wasting time calculating $(1 - x^2 - y^2)$ in later iterations when you know they are all negative
- If the outer loop has enough iterations ($\gg$ the number of threads), then the latter is much more efficient
- In either case, the right scheduling strategy will be dynamic, as the work per iteration is not uniform. If you collapse, the work per (inner) iteration differs depending on whether you perform sqrt or not. If you do not collapse, the work per (outer) iteration differs more significantly depending on where you break the loop.
- If you collapse, the pitfall is that the work per iteration becomes so small that using the default chunk size (1) makes the dynamic load-balancing overhead too large to enjoy any benefit from parallelization. Make the chunk size larger to amortize the overhead (see the run sketch at the end of this section). If you do not collapse, a single iteration executes an entire inner loop, so the overhead of dynamic load balancing is hardly an issue in practice.
- Compile it
clang -O3 -fopenmp omp_integral_ans.c -o omp_integral_ans -lm
# nvc -O4 -mp omp_integral_ans.c -o omp_integral_ans
- and run it
OMP_NUM_THREADS=8 ./omp_integral_ans
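For the variants that use schedule(runtime) (example answers 1 and 2), a minimal run sketch with illustrative values: OMP_SCHEDULE selects the policy and chunk size per run, e.g. a large chunk to amortize the dynamic-scheduling overhead in the collapsed version.
OMP_SCHEDULE="dynamic,1000" OMP_NUM_THREADS=8 ./omp_integral_ans
OMP_SCHEDULE="static" OMP_NUM_THREADS=8 ./omp_integral_ans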