diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index f7f12fb50..12453aa79 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -113,3 +113,4 @@ From oldest to newest contributor, we would like to thank:
 - [Marcus Geelnard](https://github.com/mbitsnbites)
 - [Haneef Mubarak](https://github.com/haneefmubarak)
 - [Jeremy Rifkin](https://github.com/jeremy-rifkin)
+- [Ankur Saini](https://github.com/Arsenic-ATG)
diff --git a/examples/cuda/Add_two_arrays.cu b/examples/cuda/Add_two_arrays.cu
new file mode 100644
index 000000000..d8c9c225c
--- /dev/null
+++ b/examples/cuda/Add_two_arrays.cu
@@ -0,0 +1,19 @@
+// Element-wise addition: result[i] = array1[i] + array2[i] for i in [0, size).
+// Works with any 1D launch configuration; a non-positive size does nothing.
+__global__ void elementwise_add(const int * array1,
+    const int * array2, int * result, int size) {
+    // Reject size <= 0 up front: compared against an unsigned index, a negative
+    // size would convert to a huge value and the loop would run out of bounds.
+    if (size <= 0) {
+        return;
+    }
+
+    // 64-bit index math so neither the start index nor the stride can wrap.
+    const size_t count = static_cast<size_t>(size);
+    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
+    size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+    while (idx < count) {
+        result[idx] = array1[idx] + array2[idx];
+        idx += stride;
+    }
+}
diff --git a/examples/cuda/Max_array.cu b/examples/cuda/Max_array.cu
new file mode 100644
index 000000000..08e353a14
--- /dev/null
+++ b/examples/cuda/Max_array.cu
@@ -0,0 +1,55 @@
+// Larger of x and y. Arguments are parenthesized so compound expressions
+// expand correctly; each argument may be evaluated twice.
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+
+// Raises *max to the largest value in array[0..n).
+//
+// Launch with a 1D grid of 1D blocks of at most 256 threads (the size of the
+// shared scratch buffer). The caller must initialize *max before the launch;
+// when n == 0 it is left untouched. `mutex` is unused and kept only so that
+// existing launch sites keep compiling.
+__global__ void find_max(const int * array, int * max, int * mutex, unsigned int n) {
+    (void)mutex;
+
+    // n is the same for every thread, so the whole grid returns together and no
+    // barrier is left waiting. This keeps the array[0] seed below in bounds.
+    if (n == 0) {
+        return;
+    }
+
+    __shared__ int partial_res[256];
+
+    // 64-bit index math so the index cannot wrap when n is close to UINT_MAX.
+    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
+    size_t input_idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+
+    // array[0] is a valid seed for every thread, including those with no element.
+    int local_max = array[0];
+    while (input_idx < n) {
+        local_max = MAX(local_max, array[input_idx]);
+        input_idx += stride;
+    }
+
+    partial_res[threadIdx.x] = local_max;
+
+    __syncthreads();
+
+    // Block-level tree reduction. Rounding the offset up keeps the middle entry
+    // when the live count is odd, so block sizes need not be a power of two.
+    unsigned int active = blockDim.x;
+    while (active > 1) {
+        const unsigned int offset = (active + 1) / 2;
+        if (threadIdx.x < active / 2) {
+            partial_res[threadIdx.x] = MAX(partial_res[threadIdx.x], partial_res[threadIdx.x + offset]);
+        }
+
+        __syncthreads();
+        active = offset;
+    }
+
+    // One atomic per block. atomicMax performs the read-compare-write on *max
+    // as a single indivisible operation, so no lock is needed.
+    if (threadIdx.x == 0) {
+        atomicMax(max, partial_res[0]);
+    }
+}
diff --git a/examples/cuda/Sum_over_array.cu b/examples/cuda/Sum_over_array.cu
new file mode 100644
index 000000000..0fe802b8e
--- /dev/null
+++ b/examples/cuda/Sum_over_array.cu
@@ -0,0 +1,39 @@
+// Adds the sum of array[0..n) to *total.
+//
+// Launch with a 1D grid of 1D blocks of at most 256 threads (the size of the
+// shared scratch buffer). The caller must initialize *total (typically to 0)
+// before the launch.
+__global__ void sum_array(const int * array, int * total, unsigned int n) {
+    __shared__ int partial_res[256];
+
+    // 64-bit index math so the index cannot wrap when n is close to UINT_MAX.
+    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
+    size_t input_idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+
+    int partial_sum = 0;
+    while (input_idx < n) {
+        partial_sum += array[input_idx];
+        input_idx += stride;
+    }
+
+    partial_res[threadIdx.x] = partial_sum;
+
+    __syncthreads();
+
+    // Block-level tree reduction. Rounding the offset up keeps the middle entry
+    // when the live count is odd, so block sizes need not be a power of two.
+    unsigned int active = blockDim.x;
+    while (active > 1) {
+        const unsigned int offset = (active + 1) / 2;
+        if (threadIdx.x < active / 2) {
+            partial_res[threadIdx.x] += partial_res[threadIdx.x + offset];
+        }
+        __syncthreads();
+        active = offset;
+    }
+
+    // One atomic per block folds the block total into the global result.
+    if (threadIdx.x == 0) {
+        atomicAdd(total, partial_res[0]);
+    }
+}