#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable

// Number of elements in the kernel's local scratch array; work-items whose
// local id is at or beyond this value do not contribute to the result.
#define MAX_SIZE 64

// Each work-item with a local id below MAX_SIZE atomically adds its entry of
// oldValues to its work-group's slot in destMemory and records the value
// returned by atomic_add in local memory. Work-item 0 of the group then sums
// the recorded values and stores the total through sum.
//
//   destMemory - accumulation targets, indexed by work-group id.
//   oldValues  - per-work-item addends, indexed by global id.
//   sum        - receives the total of the values returned by atomic_add.
kernel void test_atomic_fn(volatile global int *destMemory,
  global int *oldValues, global int *sum) {
  local int L[MAX_SIZE];
  int lid = get_local_id(0), tid = get_global_id(0), gid = get_group_id(0);

  // Out-of-bound WIs skip the atomic but must not return yet: every WI of
  // the work-group has to reach the barrier below.
  if (lid < MAX_SIZE)
    L[lid] = atomic_add(&destMemory[gid], oldValues[tid]);
  barrier(CLK_LOCAL_MEM_FENCE);

  // Only one WI does the aggregation.
  if (lid)
    return;

  // Accumulate privately and store once through the pointer; assigning to
  // sum itself would only modify this WI's private copy of the pointer.
  int total = 0;
  int NumIters = min(MAX_SIZE, (int)get_local_size(0));
  for (int i=0; i<NumIters; i++)
    total += L[i];

  // NOTE(review): every work-group stores to the same location - confirm the
  // host launches a single work-group or expects only one group's total.
  *sum = total;
}
