#define MAX_LOCAL_SIZE 64
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
/*
 * Atomically increments destMemory[0] once per participating work-item and
 * stores in *Sum the total of the pre-increment values those work-items
 * observed.
 *
 * Only work-groups whose dimension-0 group id is 0 participate; all others
 * return immediately. At most MAX_LOCAL_SIZE observed values are recorded
 * and aggregated.
 *
 * destMemory: counter buffer; element 0 is incremented with a
 *             compare-and-swap retry loop.
 * Sum:        output; written only by the work-item whose dimension-0
 *             local id is 0.
 */
kernel void test_atomic_fn(volatile global uint *destMemory, global uint *Sum) {

  int tid = get_local_id(0), gid = get_group_id(0);
  local uint localValues[MAX_LOCAL_SIZE];

  // The group id is the same for every work-item of a work-group, so either
  // the whole group leaves here or the whole group reaches the barrier below.
  if (gid)
    return;

  // Compare-and-swap increment. The swap succeeded only if the value returned
  // by atomic_cmpxchg equals the value we expected to find; comparing against
  // a fresh re-read of the counter instead could end the loop after a failed
  // swap whenever another work-item updates the counter in between.
  uint expected, observed;
  do {
    expected = destMemory[gid];
    observed = atomic_cmpxchg(&destMemory[gid], expected, expected + 1);
  } while (observed != expected);

  // Record the pre-increment value this work-item claimed.
  if (tid < MAX_LOCAL_SIZE)
    localValues[tid] = observed;

  // Make every work-item's localValues entry visible before aggregation.
  barrier(CLK_LOCAL_MEM_FENCE);

  // Only one WI does the aggregation.
  if (tid)
    return;

  // Aggregate all the recorded values privately, then publish once.
  uint total = 0;
  int numIters = min((int)get_local_size(0), MAX_LOCAL_SIZE);
  for (int i = 0; i < numIters; i++)
    total += localValues[i];
  *Sum = total;
}
