// Exercises atomic_sub on work-group local memory.
//
// Each work-item seeds one local slot from srcMemory, then every work-item of
// the group atomically subtracts its own srcMemory value from the single local
// slot selected by the group id. The resulting value of that slot is stored
// to finalDest at the group id.
//
//   finalDest  - global output; element [group id] receives the group's result.
//   destMemory - local scratch buffer, written at [local id] and read/updated
//                at [group id].
//   srcMemory  - global input, read at [global id].
//
// NOTE(review): destMemory is indexed by both the local id and the group id,
// so it presumably must hold at least max(local size, group id + 1) ints, and
// slot [group id] is only seeded by this group when group id < local size --
// confirm against the host code that sizes the local buffer.
kernel void
test_atomic_fn(global int *finalDest, volatile local int *destMemory,
               global int *srcMemory) {
  const int globalIdx = get_global_id(0);
  const int localIdx = get_local_id(0);
  const int groupIdx = get_group_id(0);

  // Seed this work-item's local slot with its input value.
  destMemory[localIdx] = srcMemory[globalIdx];

  // Every seed store must be visible to the group before the atomics begin.
  barrier(CLK_LOCAL_MEM_FENCE);

  // All work-items of the group hit the same slot; the atomic keeps the
  // concurrent read-modify-write updates from being lost.
  atomic_sub(&destMemory[groupIdx], srcMemory[globalIdx]);

  // Wait until every work-item's subtraction has been applied.
  barrier(CLK_LOCAL_MEM_FENCE);

  // After the barrier each work-item of the group reads the same final value,
  // so the redundant stores to finalDest all write identical data.
  finalDest[groupIdx] = destMemory[groupIdx];
}
