/*
 * Converts one 3-component double vector per work-item to half precision,
 * rounding toward positive infinity (_rtp), stages the result in local
 * memory and then copies the whole work-group's staging buffer to f.
 *
 * p: input; work-item i reads p[3*i .. 3*i+2] via vload3.
 * f: output; each converted vector occupies a 4-half slot (three values
 *    followed by one zeroed padding half), i.e. work-item i's result ends
 *    up at f[4*i .. 4*i+3].
 *
 * The staging buffer holds 64+1 slots and is indexed by the local id, so
 * the work-group size must not exceed that.
 */
__kernel void test( __global double *p, __global half *f )
{
   __local ushort4 data[64+1];
   size_t i = get_global_id(0);
   size_t lid = get_local_id(0);
   size_t lsize = get_local_size(0);
   event_t async_event;

   // Zero this work-item's slot first so the padding (4th) half that the
   // 3-component store leaves untouched is well defined when copied out.
   data[lid] = (ushort4)(0);

   // Use the aligned variant: vstorea_half3 addresses the destination as
   // (p + offset*4), which matches the ushort4 slot layout used by the
   // zeroing above and by the group copy below. The packed vstore_half3
   // addresses (p + offset*3), which would place results in other
   // work-items' slots and race with their zeroing.
   vstorea_half3_rtp( vload3(i,p), lid, (__local half *)data );

   // Every work-item's slot must be complete before the group-wide copy.
   barrier( CLK_LOCAL_MEM_FENCE );

   // i - lid is the global id of this work-group's first work-item, so the
   // group writes lsize slots of 4 halves starting at that slot in f.
   async_event = async_work_group_copy((__global ushort *)(f+4*(i-lid)), (__local ushort*)data, lsize*4, 0);
   wait_group_events(1, &async_event);
}

