#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable

// Test kernel: every work-item of work-group 0 atomically increments a shared
// local counter with a compare-and-swap loop, records the value it replaced,
// and work-item 0 then sums all recorded values into *S.
//
//   G  global buffer; G[0] seeds the counter.
//   L  local scratch buffer, indexed by local id, so it must hold at least
//      get_local_size(0) elements. L[0] doubles as the shared counter.
//   S  receives the sum (uint arithmetic, wraps on overflow). With seed c and
//      n work-items in the group the sum is n*c + n*(n-1)/2.
//
// Work-groups other than group 0 return immediately without touching memory.
kernel void test_atomic_fn(volatile global uint *G, local uint *L, global uint *S) {
  size_t lid = get_local_id(0);
  size_t gid = get_group_id(0);

  // The whole work-group leaves together here, so the barriers below are
  // still reached by every work-item of the group that keeps running.
  if (gid)
    return;

  // Copy from global to local. gid is 0 at this point, so every slot
  // (including the counter in L[0]) is seeded with G[0].
  L[lid] = G[gid];
  barrier(CLK_LOCAL_MEM_FENCE);

  // Compare-and-swap increment of L[0]. The exchange succeeded only if the
  // value it returns equals the value we expected; otherwise retry using the
  // freshly observed value. Re-reading L[0] to decide whether to retry would
  // race with the other work-items.
  volatile local uint *counter = &L[0];
  uint expected = *counter;
  for (;;) {
    uint observed = atomic_cmpxchg(counter, expected, expected + 1);
    if (observed == expected)
      break;
    expected = observed;
  }

  // L[0] is about to be reused as a result slot by work-item 0; wait until
  // every work-item has finished its increment before overwriting it.
  barrier(CLK_LOCAL_MEM_FENCE);

  // Assign the old value to the local buffer.
  L[lid] = expected;
  barrier(CLK_LOCAL_MEM_FENCE);

  // Only one WI continues.
  if (lid)
    return;

  // Aggregate all the values privately and write the result to global
  // memory once.
  size_t count = get_local_size(0);
  uint sum = 0;
  for (size_t i = 0; i < count; i++)
    sum += L[i];
  *S = sum;
}
