/* * numerical integration example, as discussed in textbook: * * compute pi by approximating the area under the curve f(x) = 4 / (1 + x*x) * between 0 and 1. * * version that uses OpenCL kernel to do computation, as discussed in * Appendix D, but adds command-line arguments to allow varying some things. * * command-line arguments: * * number of steps * number of work items * workgroup size factor (multiply this by "preferred size" to get size) * (optional) CPU to run kernel on CPU (default is GPU) */ #include #include #include #include /* copied from not-strictly-standard part of math.h */ #define M_PI 3.14159265358979323846 #include #include "timer.h" #include "utility.h" const char *kernel_file = "num-int-par-file-kernel.h"; /* main program */ int main(int argc, char *argv[]) { char* usage_fmt = "usage: %s " "number_of_steps num_work_items workgroup_size_factor [CPU]\n"; /* process command-line arguments */ char* end_ptr_for_strtol; if (argc < 4) { fprintf(stderr, usage_fmt, argv[0]); exit(EXIT_FAILURE); } long num_steps = strtol(argv[1], &end_ptr_for_strtol, 10); if (*end_ptr_for_strtol != '\0') { fprintf(stderr, usage_fmt, argv[0]); exit(EXIT_FAILURE); } long num_work_items = strtol(argv[2], &end_ptr_for_strtol, 10); if (*end_ptr_for_strtol != '\0') { fprintf(stderr, usage_fmt, argv[0]); exit(EXIT_FAILURE); } long workgroup_size_factor = strtol(argv[3], &end_ptr_for_strtol, 10); if (*end_ptr_for_strtol != '\0') { fprintf(stderr, usage_fmt, argv[0]); exit(EXIT_FAILURE); } cl_device_type device_type = CL_DEVICE_TYPE_GPU; if ((argc > 4) && (strcmp(argv[4], "CPU") == 0)) { device_type = CL_DEVICE_TYPE_CPU; } double start_time, end_time; cl_float pi; cl_int nsteps; cl_int work_groups; cl_float step_size; cl_float *partial_sums_host; /* partial sums from compute device */ cl_int rval; cl_uint compute_units; size_t preferred_workgroup_size, max_workgroup_size; size_t domain_size; size_t workgroup_size; cl_device_id device_id; cl_context context; cl_command_queue commands; cl_program program_obj; cl_kernel kernel; cl_mem partial_sums; /* device memory for partial sums */ /* record start time */ start_time = get_time(); /* get first GPU device (exits if none found) */ getDevice(device_type, &device_id); /* create context, command queue, and program object */ char *kernel_source = source_from_file(kernel_file); const char* kernel_sources[] = { kernel_source }; initialize( &device_id, &context, &commands, &program_obj, 1, kernel_sources); printf("Compute device:\n"); output_device_info(stdout, device_id); puts(""); /* create compute kernel from program object */ kernel = clCreateKernel(program_obj, "pi", &rval); if ((rval != CL_SUCCESS) || (kernel == NULL)) error_exit("Could not create compute kernel", rval); /* get info about compute units, work group sizes */ rval = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL); if (rval != CL_SUCCESS) { compute_units = 0; describe_error("Could not retrieve device info", rval, stderr); } rval = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(preferred_workgroup_size), &preferred_workgroup_size, NULL); if (rval != CL_SUCCESS) error_exit("Could not retrieve kernel work group info", rval); rval = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_workgroup_size), &max_workgroup_size, NULL); if (rval != CL_SUCCESS) error_exit("Could not retrieve kernel work group info", rval); /* set workgroup size */ workgroup_size = preferred_workgroup_size * workgroup_size_factor; if (workgroup_size > max_workgroup_size) { fprintf(stderr, "Computed workgroup size %zd exceeds maximum %zd\n", workgroup_size, max_workgroup_size); exit(EXIT_FAILURE); } if ((num_work_items % workgroup_size) != 0) { fprintf(stderr, "Number of work items %zd must divide workgroup size %zd\n", num_work_items, workgroup_size); exit(EXIT_FAILURE); } work_groups = num_work_items / workgroup_size; /* set constant arguments for kernel */ nsteps = num_steps; step_size = 1.0f/(cl_float)nsteps; /* allocate space for partial sums on host */ partial_sums_host = malloc(sizeof(*partial_sums_host)*work_groups); if (partial_sums_host == NULL) error_exit("Could not allocate memory for partial sums", CL_SUCCESS); /* create output buffer to hold partial sums */ partial_sums = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * work_groups, NULL, NULL); if (partial_sums == NULL) error_exit("Could not allocate device memory", rval); /* set arguments to compute kernel */ /* note argument 2 is a local array so all we do here is define its size */ rval = 0; rval = clSetKernelArg(kernel, 0, sizeof(nsteps), &nsteps); rval |= clSetKernelArg(kernel, 1, sizeof(step_size), &step_size); rval |= clSetKernelArg(kernel, 2, sizeof(cl_float)*workgroup_size, NULL); rval |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &partial_sums); if (rval != CL_SUCCESS) error_exit("Could not set kernel arguments", rval); /* execute kernel over entire range of 1d input data set */ domain_size = work_groups * workgroup_size; rval = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &domain_size, &workgroup_size, 0, NULL, NULL); if (rval != CL_SUCCESS) error_exit("Could not execute kernel", rval); /* wait for commands to complete before reading back results */ clFinish(commands); /* read back results from compute device */ rval = clEnqueueReadBuffer(commands, partial_sums, CL_TRUE, 0, sizeof(cl_float) * work_groups, partial_sums_host, 0, NULL, NULL ); if (rval != CL_SUCCESS) error_exit("Could not read output array", rval); /* complete sum and compute final integral value */ pi = 0.0; for (int i=0; i < (int)work_groups; ++i) pi += partial_sums_host[i]; pi *= step_size; /* record end time */ end_time = get_time(); /* print results */ printf("OpenCL program result with %ld steps, " "%ld work groups of size %zd\n", num_steps, (long)work_groups, workgroup_size ); printf("computed pi = %g (%17.15f)\n",pi, pi); printf("difference between computed pi and math.h M_PI = %17.15f\n", fabs(pi - M_PI)); printf("time to compute = %g seconds\n", end_time - start_time); /* clean up then shut down */ free(kernel_source); free(partial_sums_host); clReleaseMemObject(partial_sums); clReleaseKernel(kernel); finalize(context, commands, program_obj); return EXIT_SUCCESS; }