A Jump Start to OpenCL
Another Language to Program Parallel Computing Devices
March 15, 2009
CIS 565/665 – GPU Computing
and Architecture
Sources
• OpenCL Tutorial - Introduction to OpenCL
• OpenCL for NVIDIA GPUs – Chris Lamb
• OpenCL – Parallel Computing for
Heterogeneous Devices (SIGGRAPH Asia) –
Khronos Group
• NVIDIA OpenCL Jump Start Guide
• OpenCL – Making Use of What You’ve Got
• OpenCL Basics and Advanced (PPAM
2009) – Dominik Behr
Sources
• OpenCL Tutorial - Introduction to OpenCL
• OpenCL for NVIDIA GPUs – Chris Lamb
• OpenCL – Parallel Computing for
Heterogeneous Devices (SIGGRAPH Asia) –
Khronos Group
• NVIDIA OpenCL Jump Start Guide
• OpenCL – Making Use of What You’ve Got
• OpenCL Basics and Advanced (PPAM
2009) – Dominik Behr
CUDA Working Group
• Because of Nexus and
Visual Studio Integration….
Anatomy of OpenCL
• Language Specification
• C-based cross-platform programming interface
• Subset of ISO C99 with language extensions - familiar
to developers
• Well-defined numerical accuracy (IEEE 754 rounding
with specified max error)
• Online or offline compilation and build of compute
kernel executables
• Includes a rich set of built-in functions
• Platform Layer API
• A hardware abstraction layer over diverse
computational resources
• Query, select and initialize compute devices
• Create compute contexts and work-queues
• Runtime API
• Execute compute kernels
• Manage scheduling, compute, and memory resources
CUDA
Memory Model Comparison
OpenCL CUDA
CUDA vs OpenCL
Architecture – Execution Model
• Kernel – Smallest unit of execution, like a C
function
• Host program – A collection of kernels
• Work item, an instance of kernel at run time
• Work group, a collection of work items
Command Queues
CUDA vs OpenCL API
Differences
• Naming Schemes
• How data gets passed to the API
• C for CUDA programs are compiled
with an external tool (NVCC
compiler)
• OpenCL compiler is typically invoked
at runtime (you can offline compile
too)
CUDA OpenCL
cuInit(0); cl_context hContext;
cuDeviceGet(&hDevice, 0); hContext = clCreateContextFromType(0,
cuCtxCreate(&hContext, 0, hDevice); CL_DEVICE_TYPE_GPU, 0,0,0);
CUdeviceptr pDeviceMemA, pDeviceMemB, cl_mem hDeviceMemA, hDeviceMemB,
pDeviceMemC; hDeviceMemC;
cuMemAlloc(&pDeviceMemA, cnDimension * hDeviceMemA = clCreateBuffer(hContext,
sizeof(float)); CL_MEM_READ_ONLY |
cuMemAlloc(&pDeviceMemB, cnDimension * CL_MEM_COPY_HOST_PTR,
sizeof(float)); cnDimension * sizeof(cl_float), pA, 0);
cuMemAlloc(&pDeviceMemC, cnDimension * hDeviceMemB = clCreateBuffer(hContext,
sizeof(float)); CL_MEM_READ_ONLY |
// copy host vectors to device CL_MEM_COPY_HOST_PTR,
cuMemcpyHtoD(pDeviceMemA, pA, cnDimension cnDimension * sizeof(cl_float), pB, 0);
* sizeof(float)); hDeviceMemC = clCreateBuffer(hContext,
cuMemcpyHtoD(pDeviceMemB, pB, cnDimension CL_MEM_WRITE_ONLY,
* sizeof(float)); cnDimension * sizeof(cl_float), 0, 0);
cuFuncSetBlockShape(cuFunction, cnBlockSize, clEnqueueNDRangeKernel(hCmdQueue,
1, 1); hKernel, 1, 0,
cuLaunchGrid (cuFunction, cnBlocks, 1); &cnDimension, &cnBlockSize, 0, 0, 0);
CUDA Pointer Traversal
struct Node { Node* next; }
n = n->next; // undefined operation in OpenCL,
// since ‘n’ here is a kernel input
OpenCL Pointer Traversal
struct Node { unsigned int next; }
…
n = bufBase + n; // pointer arithmetic is fine, bufBase is
// a kernel input param to the buffer’s beginning
CUDA Kernel code:
// Element-wise vector addition: c[i] = a[i] + b[i].
// One thread computes exactly one output element.
__global__ void
vectorAdd(const float * a, const float * b, float * c)
{
    // Flatten (block, thread) coordinates into a single element index.
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    c[i] = a[i] + b[i];
}
OpenCL Kernel code:
// Element-wise vector addition: c[i] = a[i] + b[i].
// Each work-item handles one element of the output.
__kernel void
vectorAdd(__global const float * a,
          __global const float * b,
          __global float * c)
{
    // Global work-item id along dimension 0 is the element index.
    const int i = get_global_id(0);
    c[i] = a[i] + b[i];
}
CUDA kernel functions are declared using the “__global__”
function modifier
OpenCL kernel functions are declared using “__kernel”.
CUDA Driver API Host code:
// Host-side vectorAdd driver using the CUDA Driver API:
// set up a device/context, load the compiled kernel, copy inputs,
// launch, and copy the result back.
const unsigned int cnBlockSize = 512;
const unsigned int cnBlocks = 3;
const unsigned int cnDimension = cnBlocks * cnBlockSize;
CUdevice hDevice;
CUcontext hContext;
CUmodule hModule;
CUfunction hFunction;
// create CUDA device & context
cuInit(0);
// BUG FIX: the device handle (not the context) receives the device id.
cuDeviceGet(&hDevice, 0); // pick first device
cuCtxCreate(&hContext, 0, hDevice); // (removed stray extra ')')
// Load the precompiled kernel image and look up its entry point.
cuModuleLoad(&hModule, "vectorAdd.cubin");
cuModuleGetFunction(&hFunction, hModule, "vectorAdd");
// allocate host vectors
float * pA = new float[cnDimension];
float * pB = new float[cnDimension];
float * pC = new float[cnDimension];
// initialize host memory
randomInit(pA, cnDimension);
randomInit(pB, cnDimension);
// allocate memory on the device
CUdeviceptr pDeviceMemA, pDeviceMemB, pDeviceMemC;
cuMemAlloc(&pDeviceMemA, cnDimension * sizeof(float));
cuMemAlloc(&pDeviceMemB, cnDimension * sizeof(float));
cuMemAlloc(&pDeviceMemC, cnDimension * sizeof(float));
// copy host vectors to device
cuMemcpyHtoD(pDeviceMemA, pA, cnDimension * sizeof(float));
cuMemcpyHtoD(pDeviceMemB, pB, cnDimension * sizeof(float));
// setup parameter values
// BUG FIX: the function handle declared above is hFunction;
// the original referenced an undeclared 'cuFunction'.
cuFuncSetBlockShape(hFunction, cnBlockSize, 1, 1);
// Parameter offsets assume 32-bit device pointers (4 bytes each)
// as in the original slide — TODO confirm for 64-bit builds.
cuParamSeti(hFunction, 0, pDeviceMemA);
cuParamSeti(hFunction, 4, pDeviceMemB);
cuParamSeti(hFunction, 8, pDeviceMemC);
cuParamSetSize(hFunction, 12);
// execute kernel
cuLaunchGrid(hFunction, cnBlocks, 1);
// copy the result from device back to host
cuMemcpyDtoH((void *) pC, pDeviceMemC, cnDimension * sizeof(float));
// release host and device resources
delete[] pA; delete[] pB; delete[] pC;
cuMemFree(pDeviceMemA); cuMemFree(pDeviceMemB); cuMemFree(pDeviceMemC);
OpenCL Host Code:
// Host-side vectorAdd driver using OpenCL:
// create a context and command queue, compile the kernel from source,
// create device buffers initialized from host memory, and bind arguments.
const unsigned int cnBlockSize = 512;
const unsigned int cnBlocks = 3;
const unsigned int cnDimension = cnBlocks * cnBlockSize;
// create OpenCL device & context
cl_context hContext;
hContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU,
                                   0, 0, 0);
// query all devices available to the context
size_t nContextDescriptorSize;
clGetContextInfo(hContext, CL_CONTEXT_DEVICES,
                 0, 0, &nContextDescriptorSize);
// BUG FIX: cast required in C++ (this snippet also uses new[]).
cl_device_id * aDevices = (cl_device_id *) malloc(nContextDescriptorSize);
clGetContextInfo(hContext, CL_CONTEXT_DEVICES,
                 nContextDescriptorSize, aDevices, 0);
// create a command queue for first device the context reported
cl_command_queue hCmdQueue;
hCmdQueue = clCreateCommandQueue(hContext, aDevices[0], 0, 0);
// create & compile program
cl_program hProgram;
// NOTE(review): clCreateProgramWithSource takes const char** — assumes
// sProgramSource is a single const char* string; verify its declaration.
hProgram = clCreateProgramWithSource(hContext, 1,
                                     &sProgramSource, 0, 0);
clBuildProgram(hProgram, 0, 0, 0, 0, 0);
// create kernel (straight quotes — the slide's curly quotes don't compile)
cl_kernel hKernel;
hKernel = clCreateKernel(hProgram, "vectorAdd", 0);
// allocate host vectors
float * pA = new float[cnDimension];
float * pB = new float[cnDimension];
float * pC = new float[cnDimension];
// initialize host memory
randomInit(pA, cnDimension);
randomInit(pB, cnDimension);
// allocate device memory; CL_MEM_COPY_HOST_PTR seeds each buffer from
// the corresponding host array at creation time.
cl_mem hDeviceMemA, hDeviceMemB, hDeviceMemC;
hDeviceMemA = clCreateBuffer(hContext,
    CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pA, 0);
// BUG FIX: B buffer must be initialized from pB, not pA.
hDeviceMemB = clCreateBuffer(hContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension *
    sizeof(cl_float), pB, 0);
hDeviceMemC = clCreateBuffer(hContext, CL_MEM_WRITE_ONLY, cnDimension * sizeof(cl_float), 0, 0);
// setup parameter values
clSetKernelArg(hKernel, 0, sizeof(cl_mem), (void *)&hDeviceMemA);
Personal Aside…
• I’m a bit skeptical…
• 1) slower Source: Matt Harvey Porting CUDA to OpenCL
• 2) NVIDIA has to fully commit…
More Performance notes…
Source: Matt Harvey Porting CUDA to OpenCL