Add examples for CUDA (#921) (#3439)

- Add examples for CUDA
- Update CONTRIBUTORS.md
This commit is contained in:
Arsenic
2022-03-14 20:51:13 +05:30
committed by GitHub
parent 024788b084
commit 9a75a7a1d6
4 changed files with 77 additions and 0 deletions

View File

@@ -113,3 +113,4 @@ From oldest to newest contributor, we would like to thank:
- [Marcus Geelnard](https://github.com/mbitsnbites)
- [Haneef Mubarak](https://github.com/haneefmubarak)
- [Jeremy Rifkin](https://github.com/jeremy-rifkin)
- [Ankur Saini](https://github.com/Arsenic-ATG)

View File

@@ -0,0 +1,9 @@
/**
 * Element-wise addition: result[i] = array1[i] + array2[i] for i in [0, size).
 *
 * Each thread starts at its flat global index (x dimension only) and advances
 * by the total number of launched threads, so any 1D grid/block configuration
 * covers the whole input.
 *
 * @param array1 first input array, at least `size` elements
 * @param array2 second input array, at least `size` elements
 * @param result output array, at least `size` elements
 * @param size   number of elements to process; values <= 0 make the kernel a no-op
 */
__global__ void elementwise_add(const int * array1,
                                const int * array2, int * result, int size) {
    // A negative size would otherwise be converted to a huge unsigned bound
    // by the loop comparison and drive the loop far out of bounds.
    if (size <= 0) {
        return;
    }

    // Index in size_t so that neither blockIdx.x * blockDim.x nor the
    // repeated "+= stride" can wrap around in 32 bits.
    const size_t count = (size_t) size;
    const size_t stride = (size_t) gridDim.x * blockDim.x;
    for (size_t idx = (size_t) blockIdx.x * blockDim.x + threadIdx.x;
         idx < count;
         idx += stride) {
        result[idx] = array1[idx] + array2[idx];
    }
}

View File

@@ -0,0 +1,36 @@
// Every operand is parenthesized so the macro expands safely inside larger
// expressions. Arguments may still be evaluated twice, so only pass
// side-effect-free operands.
#define MAX(x, y) (((x) > (y)) ? (x) : (y))

/**
 * Finds the maximum of array[0..n) and folds it into *max.
 *
 * Each thread scans the input starting at its flat global index (x dimension
 * only) and advancing by the total thread count, keeping a private maximum.
 * The private maxima of a block are combined in shared memory, and thread 0
 * of each block merges the block result into *max with atomicMax.
 *
 * Launch requirements: blockDim.x must not exceed 1024 (the size of the
 * shared scratch buffer); it does not have to be a power of two.
 *
 * @param array input values, at least n elements
 * @param max   running maximum; must be initialized by the caller before launch
 * @param mutex unused; kept so existing callers keep compiling. It is never
 *              read or written by this kernel.
 * @param n     number of input elements; when 0 the kernel leaves *max untouched
 */
__global__ void find_max(const int * array, int * max, int * mutex, unsigned int n) {
    // The spin lock that used this flag guarded a plain read-modify-write of
    // *max; atomic functions do not act as memory fences, so that update was
    // not guaranteed to be visible to other blocks. atomicMax below performs
    // the merge as one atomic operation and needs no lock.
    (void) mutex;

    // With no input there is no valid seed value (array[0] must not be read).
    // The condition is uniform across the block, so no thread is left
    // waiting at a barrier.
    if (n == 0) {
        return;
    }

    // One slot per thread; sized for the largest supported block.
    __shared__ int partial_res[1024];

    // size_t indices keep the "+= stride" step from wrapping in 32 bits.
    const size_t stride = (size_t) gridDim.x * blockDim.x;

    // array[0] is a safe seed: it is part of the input, so it can never
    // exceed the true maximum.
    int local_max = array[0];
    for (size_t input_idx = (size_t) blockIdx.x * blockDim.x + threadIdx.x;
         input_idx < n;
         input_idx += stride) {
        const int value = array[input_idx];
        local_max = MAX(local_max, value);
    }
    partial_res[threadIdx.x] = local_max;
    __syncthreads();

    // Block reduction that works for any block size: on each step the upper
    // part [half, active) is folded onto the lower part, and for odd `active`
    // the middle element simply survives to the next step.
    unsigned int active = blockDim.x;
    while (active > 1) {
        const unsigned int half = (active + 1) / 2;
        if (threadIdx.x + half < active) {
            const int mine = partial_res[threadIdx.x];
            const int other = partial_res[threadIdx.x + half];
            partial_res[threadIdx.x] = MAX(mine, other);
        }
        __syncthreads();
        active = half;
    }

    // One atomic per block merges the block maximum into the global result.
    if (threadIdx.x == 0) {
        atomicMax(max, partial_res[0]);
    }
}

View File

@@ -0,0 +1,31 @@
/**
 * Adds the sum of array[0..n) to *total.
 *
 * Each thread accumulates a private sum over the elements starting at its
 * flat global index (x dimension only) and advancing by the total thread
 * count. The private sums of a block are combined in shared memory, and
 * thread 0 of each block adds the block sum to *total with atomicAdd.
 *
 * Launch requirements: blockDim.x must not exceed 1024 (the size of the
 * shared scratch buffer); it does not have to be a power of two.
 *
 * @param array input values, at least n elements
 * @param total accumulator; must be initialized by the caller (e.g. to 0)
 *              before launch
 * @param n     number of input elements; when 0 each block adds 0 to *total
 */
__global__ void sum_array(const int * array, int * total, unsigned int n) {
    // One slot per thread; sized for the largest supported block.
    __shared__ int partial_res[1024];

    // size_t indices keep the "+= stride" step from wrapping in 32 bits.
    const size_t stride = (size_t) gridDim.x * blockDim.x;

    int partial_sum = 0;
    for (size_t input_idx = (size_t) blockIdx.x * blockDim.x + threadIdx.x;
         input_idx < n;
         input_idx += stride) {
        partial_sum += array[input_idx];
    }
    partial_res[threadIdx.x] = partial_sum;
    __syncthreads();

    // Block reduction that works for any block size: on each step the upper
    // part [half, active) is folded onto the lower part, and for odd `active`
    // the middle element simply survives to the next step.
    unsigned int active = blockDim.x;
    while (active > 1) {
        const unsigned int half = (active + 1) / 2;
        if (threadIdx.x + half < active) {
            partial_res[threadIdx.x] += partial_res[threadIdx.x + half];
        }
        __syncthreads();
        active = half;
    }

    // One atomic per block merges the block sum into the global total.
    if (threadIdx.x == 0) {
        atomicAdd(total, partial_res[0]);
    }
}