diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index f7f12fb50..12453aa79 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -113,3 +113,4 @@ From oldest to newest contributor, we would like to thank:
 - [Marcus Geelnard](https://github.com/mbitsnbites)
 - [Haneef Mubarak](https://github.com/haneefmubarak)
 - [Jeremy Rifkin](https://github.com/jeremy-rifkin)
+- [Ankur Saini](https://github.com/Arsenic-ATG)
diff --git a/examples/cuda/Add_two_arrays.cu b/examples/cuda/Add_two_arrays.cu
new file mode 100644
index 000000000..d8c9c225c
--- /dev/null
+++ b/examples/cuda/Add_two_arrays.cu
@@ -0,0 +1,23 @@
+// Adds two integer arrays element by element: result[i] = array1[i] + array2[i]
+// for every i in [0, size).
+//
+// Each thread starts at its global 1D index and advances by the total number of
+// launched threads, so any 1D launch configuration covers the whole input.
+// A size of zero or less is treated as an empty input.
+__global__ void elementwise_add(const int * array1,
+    const int * array2, int * result, int size) {
+    if (size <= 0) {
+        // A negative size would otherwise be converted to a huge unsigned
+        // bound by the loop comparison and run off the end of the arrays.
+        return;
+    }
+
+    // 64-bit index math: blockIdx.x * blockDim.x and idx + stride can exceed
+    // 32 bits on large launches.
+    const size_t count = static_cast<size_t>(size);
+    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
+    for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+         idx < count; idx += stride) {
+        result[idx] = array1[idx] + array2[idx];
+    }
+}
diff --git a/examples/cuda/Max_array.cu b/examples/cuda/Max_array.cu
new file mode 100644
index 000000000..08e353a14
--- /dev/null
+++ b/examples/cuda/Max_array.cu
@@ -0,0 +1,61 @@
+// Evaluates to the larger of x and y. Arguments are parenthesised so that
+// expression arguments expand correctly; each argument may be evaluated twice,
+// so avoid arguments with side effects.
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+
+// Folds the maximum of array[0..n) into *max.
+//
+// Every thread scans the elements at its global 1D index plus multiples of the
+// total thread count, the block combines the per-thread maxima in shared
+// memory, and thread 0 of each block merges the block maximum into *max.
+//
+// Preconditions:
+//   - 1D launch with blockDim.x <= 256 (the size of the shared scratch array).
+//   - *max already holds a starting value; it is only ever raised.
+//
+// mutex is accepted for compatibility with existing launches but is neither
+// read nor written. When n is 0 the kernel leaves *max untouched.
+__global__ void find_max(const int * array, int * max, int * mutex, unsigned int n) {
+    // atomicMax performs the read-modify-write on *max as one operation, so no
+    // lock word is needed to merge the per-block results.
+    (void) mutex;
+
+    if (n == 0) {
+        // array[0] does not exist. n is the same for every thread, so the
+        // whole block returns before reaching a barrier.
+        return;
+    }
+
+    __shared__ int partial_res[256];
+
+    // 64-bit index math so that input_idx + stride cannot wrap for large n.
+    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
+
+    // array[0] is a real element (n > 0), so it is a safe seed even for
+    // threads that own no elements.
+    int local_max = array[0];
+    for (size_t input_idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+         input_idx < n; input_idx += stride) {
+        local_max = MAX(local_max, array[input_idx]);
+    }
+
+    partial_res[threadIdx.x] = local_max;
+    __syncthreads();
+
+    // Shared-memory reduction. Each round folds the upper part of the live
+    // range onto the lower part; rounding the step up keeps the middle slot of
+    // an odd-sized range, so blockDim.x does not have to be a power of two.
+    for (unsigned int active = blockDim.x; active > 1;) {
+        const unsigned int step = (active + 1) / 2;
+        if (threadIdx.x < active / 2) {
+            partial_res[threadIdx.x] = MAX(partial_res[threadIdx.x], partial_res[threadIdx.x + step]);
+        }
+        // Outside the branch: every thread in the block must reach the barrier.
+        __syncthreads();
+        active = step;
+    }
+
+    if (threadIdx.x == 0) {
+        atomicMax(max, partial_res[0]);
+    }
+}
diff --git a/examples/cuda/Sum_over_array.cu b/examples/cuda/Sum_over_array.cu
new file mode 100644
index 000000000..0fe802b8e
--- /dev/null
+++ b/examples/cuda/Sum_over_array.cu
@@ -0,0 +1,42 @@
+// Adds the sum of array[0..n) to *total.
+//
+// Every thread accumulates the elements at its global 1D index plus multiples
+// of the total thread count, the block combines the per-thread sums in shared
+// memory, and thread 0 of each block adds the block sum to *total atomically.
+//
+// Preconditions:
+//   - 1D launch with blockDim.x <= 256 (the size of the shared scratch array).
+//   - *total already holds a starting value; the block sums are added to it,
+//     not written over it.
+__global__ void sum_array(const int * array, int * total, unsigned int n) {
+    __shared__ int partial_res[256];
+
+    // 64-bit index math so that input_idx + stride cannot wrap for large n.
+    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
+
+    int partial_sum = 0;
+    for (size_t input_idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+         input_idx < n; input_idx += stride) {
+        partial_sum += array[input_idx];
+    }
+
+    partial_res[threadIdx.x] = partial_sum;
+    __syncthreads();
+
+    // Shared-memory reduction. Each round folds the upper part of the live
+    // range onto the lower part; rounding the step up keeps the middle slot of
+    // an odd-sized range, so blockDim.x does not have to be a power of two.
+    for (unsigned int active = blockDim.x; active > 1;) {
+        const unsigned int step = (active + 1) / 2;
+        if (threadIdx.x < active / 2) {
+            partial_res[threadIdx.x] += partial_res[threadIdx.x + step];
+        }
+        // Outside the branch: every thread in the block must reach the barrier.
+        __syncthreads();
+        active = step;
+    }
+
+    if (threadIdx.x == 0) {
+        atomicAdd(total, partial_res[0]);
+    }
+}