/* * numerical integration example, as discussed in textbook: * * compute pi by approximating the area under the curve f(x) = 4 / (1 + x*x) * between 0 and 1. * * version that uses OpenCL kernel to do computation, as discussed in draft * Appendix D. */ #include #include #include #include /* copied from not-strictly-standard part of math.h */ #define M_PI 3.14159265358979323846 #include #include "timer.h" #include "cmdline.h" #include "utility.h" #define NUM_STEPS 400000000 #define STEPS_PER_WORKITEM 1000 /* * kernel for computational device: * * compute partial sum for computation of pi */ const char *kernel_source = "\n" \ "__kernel void pi( \n" \ " const int nsteps, \n" \ " const int work_groups, \n" \ " const float step_size, \n" \ " __local float* local_sums, \n" \ " __global float* partial_sums) \n" \ "{ \n" \ " int num_work_items = get_local_size(0); \n" \ " int local_id = get_local_id(0); \n" \ " int group_id = get_group_id(0); \n" \ " int global_id = get_global_id(0); \n" \ " float x, sum, accum = 0.0; \n" \ " int iskip = num_work_items * work_groups; \n" \ " for(int i = global_id; i < nsteps; i += iskip) { \n" \ " x = (i+0.5f)*step_size; \n" \ " accum += 4.0f/(1.0f+x*x); \n" \ " } \n" \ " local_sums[local_id] = accum; \n" \ " barrier(CLK_LOCAL_MEM_FENCE); \n" \ " if (local_id == 0) { \n" \ " sum = 0.0f; \n" \ " for(int i = 0; i < num_work_items; ++i) { \n" \ " sum += local_sums[i]; \n" \ " } \n" \ " partial_sums[group_id] = sum; \n" \ " } \n" \ "} \n" \ "\n"; /* main program */ int main(int argc, char *argv[]) { double start_time, end_time; cl_float pi; cl_int nsteps; cl_int work_groups; cl_float step_size; cl_float *partial_sums_host; /* partial sums from compute device */ cl_int rval; cl_uint compute_units; size_t domain_size[1]; /* global domain size */ size_t workgroup_size[1]; cl_device_id device_id; cl_context context; cl_command_queue commands; cl_program program_obj; cl_kernel kernel; cl_mem partial_sums; /* device memory for partial sums */ /* record start time */ start_time = get_time(); /* get first GPU device (exits if none found) */ getDevice(CL_DEVICE_TYPE_GPU, &device_id); /* create context, command queue, and program object */ const char* kernel_sources[] = { kernel_source }; initialize(&device_id, &context, &commands, &program_obj, 1, kernel_sources); /* create compute kernel from program object */ kernel = clCreateKernel(program_obj, "pi", &rval); if ((rval != CL_SUCCESS) || (kernel == NULL)) error_exit("Could not create compute kernel", rval); /* get info about compute units, work group sizes */ rval = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL); if (rval != CL_SUCCESS) { compute_units = 0; describe_error("Could not retrieve device info", rval, stderr); } rval = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(workgroup_size[0]), workgroup_size, NULL); if (rval != CL_SUCCESS) error_exit("Could not retrieve kernel work group info", rval); /* set number of work groups, number of steps, step size */ cl_int iterations_per_workgroup = workgroup_size[0]*STEPS_PER_WORKITEM; work_groups = (NUM_STEPS+iterations_per_workgroup-1)/iterations_per_workgroup; if (work_groups < 1) work_groups = 1; nsteps = NUM_STEPS; step_size = 1.0f/(cl_float)nsteps; partial_sums_host = (cl_float *) /* needed when compiling as C++ */ malloc(sizeof(*partial_sums_host)*work_groups); if (partial_sums_host == NULL) error_exit("Could not allocate memory for partial sums", CL_SUCCESS); /* create output buffer to hold partial sums */ partial_sums = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * work_groups, NULL, NULL); if (partial_sums == NULL) error_exit("Could not allocate device memory", rval); /* set arguments to compute kernel */ /* note argument 2 is a local array so all we do here is define its size */ rval = 0; rval = clSetKernelArg(kernel, 0, sizeof(nsteps), &nsteps); rval = clSetKernelArg(kernel, 1, sizeof(work_groups), &work_groups); rval |= clSetKernelArg(kernel, 2, sizeof(step_size), &step_size); rval |= clSetKernelArg(kernel, 3, sizeof(cl_float)*workgroup_size[0], NULL); rval |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &partial_sums); if (rval != CL_SUCCESS) error_exit("Could not set kernel arguments", rval); /* execute kernel over entire range of 1d input data set using maximum number of work group items for device */ domain_size[0] = work_groups * workgroup_size[0]; rval = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, domain_size, workgroup_size, 0, NULL, NULL); if (rval != CL_SUCCESS) error_exit("Could not execute kernel", rval); /* wait for commands to complete before reading back results */ clFinish(commands); /* read back results from compute device */ rval = clEnqueueReadBuffer(commands, partial_sums, CL_TRUE, 0, sizeof(cl_float) * work_groups, partial_sums_host, 0, NULL, NULL ); if (rval != CL_SUCCESS) error_exit("Could not read output array", rval); /* complete sum and compute final integral value */ pi = 0.0; for (int i=0; i < (int)work_groups; ++i) pi += partial_sums_host[i]; pi *= step_size; /* record end time */ end_time = get_time(); /* print results */ printf("OpenCL program result with %d steps, %d iterations per work item\n", nsteps, STEPS_PER_WORKITEM); printf(" %d compute units, %d work groups of size %d, %d work items\n", compute_units, work_groups, (int)workgroup_size[0], work_groups * (int)workgroup_size[0]); printf("computed pi = %g (%.15f)\n",pi, pi); printf("difference between computed pi and math.h M_PI = %.15f\n", fabs(pi - M_PI)); printf("time to compute = %g seconds\n", end_time - start_time); /* clean up then shut down */ free(partial_sums_host); clReleaseMemObject(partial_sums); clReleaseKernel(kernel); finalize(context, commands, program_obj); return EXIT_SUCCESS; }