/* * heat diffusion example, as discussed in textbook. * * version that uses OpenCL kernel to do computation. * * disclaimer: performance is quite disappointing, but it's beyond the * scope of this course this year to fix that. * * command line arguments: * * number of points * maximum number of iterations * convergence threshold * points per workitem * workgroup size factor (N>0 to use N times the preferred/quantum size, * up to maximum) * (optional) filename to print final values (values are not printed if no * filename is given) * (optional) CPU to run kernel on CPU (default is GPU) */ #include #include #include #include #include #include "timer.h" #include "cmdline.h" #include "utility.h" #define LEFTVAL 1.0f #define RIGHTVAL 10.0f #include "heat-eqn-functions.h" #include "heat-eqn-par-functions.h" void initialize_values(float uk[], int nx); /* * kernel for computational device: * * compute next values, maximum difference */ const char *kernel_source = "\n" \ "__kernel void do_step( \n" \ " const int num_points, \n" \ " const int points_per_workitem, \n" \ " const float dx, \n" \ " const float dt, \n" \ " __global float* uk, \n" \ " __global float* ukp1, \n" \ " __local float* local_maxdiff, \n" \ " __global float* partial_maxdiff) \n" \ "{ \n" \ " int local_work_items = get_local_size(0); \n" \ " int local_id = get_local_id(0); \n" \ " int group_id = get_group_id(0); \n" \ " int global_id = get_global_id(0); \n" \ " /* give each work item a contiguous range of points */ \n" \ " int loop_start = 1+ global_id * points_per_workitem; \n" \ " int loop_limit = loop_start + points_per_workitem; \n" \ " if (loop_limit > (num_points+1)) loop_limit = num_points+1; \n" \ " /* compute new values */ \n" \ " for (int i = loop_start; i < loop_limit; ++i) { \n" \ " ukp1[i]=uk[i]+ (dt/(dx*dx))*(uk[i+1]-2*uk[i]+uk[i-1]); \n" \ " } \n" \ " /* check for convergence */ \n" \ " float accum = 0.0f; \n" \ " for (int i = loop_start; i < loop_limit; ++i) { \n" \ " float diff = fabs(uk[i] - ukp1[i]); \n" \ " accum = (diff > accum) ? diff : accum; \n" \ " } \n" \ " local_maxdiff[local_id] = accum; \n" \ " /* combine convergence results */ \n" \ " barrier(CLK_LOCAL_MEM_FENCE); \n" \ " if (local_id == 0) { \n" \ " float maxdiff = 0.0f; \n" \ " for(int i = 0; i < local_work_items; ++i) { \n" \ " if (local_maxdiff[i] > maxdiff) maxdiff = local_maxdiff[i]; \n" \ " } \n" \ " partial_maxdiff[group_id] = maxdiff; \n" \ " } \n" \ "} \n" \ "\n"; int main(int argc, char *argv[]) { int nx; int maxsteps; float threshold; cl_int num_points; cl_int work_groups; cl_int points_per_workitem; cl_int workgroup_size_factor; FILE* outfile; cl_device_type device_type; cl_float *uk; cl_float dx, dt; cl_float maxdiff; int step; double start_time, end_time; double DBG_time; cl_float *partial_maxdiff_host; cl_int rval; size_t workgroup_size_quantum, max_workgroup_size; size_t domain_size[1]; /* global domain size */ size_t workgroup_size[1]; cl_device_id device_id; cl_context context; cl_command_queue commands; cl_program program_obj; cl_kernel kernel; cl_mem partial_maxdiff; cl_mem uk_buffers[2]; int uk_index, ukp1_index, temp_index; parse_arguments(argc, argv, &nx, &maxsteps, &threshold, &points_per_workitem, &workgroup_size_factor, &outfile, &device_type); start_time = get_time(); DBG_time = start_time; /* initialize host variables */ /* uk has nx interior points plus fixed ends */ /* (no need to keep ukp1 in host) */ uk = (float *) /* needed when compiling as C++ */ malloc(sizeof(*uk) * (nx+2)); if (uk == NULL) { fprintf(stderr, "Unable to allocate host memory\n"); return EXIT_FAILURE; } dx = 1.0f/nx; dt = 0.5f*dx*dx; maxdiff = threshold; initialize_values(uk, nx); /* get first device of requested type (exits if none found) */ getDevice(device_type, &device_id); /* create context, command queue, and program object */ const char* kernel_sources[] = { kernel_source }; initialize(&device_id, &context, &commands, &program_obj, 1, kernel_sources); /* create compute kernel from program object */ kernel = clCreateKernel(program_obj, "do_step", &rval); if ((rval != CL_SUCCESS) || (kernel == NULL)) error_exit("Unable to create compute kernel", rval); /* get info about work group sizes */ rval = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(workgroup_size_quantum), &workgroup_size_quantum, NULL); if (rval != CL_SUCCESS) error_exit("Could not retrieve kernel work group info", rval); rval = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_workgroup_size), &max_workgroup_size, NULL); if (rval != CL_SUCCESS) error_exit("Could not retrieve kernel work group info", rval); /* set workgroup size */ workgroup_size[0] = workgroup_size_quantum * workgroup_size_factor; if (workgroup_size[0] > max_workgroup_size) { fprintf(stderr, "Computed workgroup size %d exceeds maximum %d\n", (int)workgroup_size[0], (int)max_workgroup_size); exit(EXIT_FAILURE); } printf("\nDBG time for initialization %g\n", get_time() - DBG_time); DBG_time = get_time(); /* set number of work groups, number of points (for use by kernel) */ int points_per_workgroup = workgroup_size[0]*points_per_workitem; work_groups = (nx+points_per_workgroup-1)/points_per_workgroup; if (work_groups < 1) work_groups = 1; num_points = nx; /* allocate host memory for partial maxdiff results */ partial_maxdiff_host = (cl_float *) /* needed when compiling as C++ */ malloc(sizeof(*partial_maxdiff_host)*work_groups); if (partial_maxdiff_host == NULL) { fprintf(stderr, "Unable to allocate host memory\n"); return EXIT_FAILURE; } /* allocate device memory for uk (old and new), partial maxdiff results */ uk_buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * (nx+2), NULL, NULL); uk_buffers[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * (nx+2), NULL, NULL); partial_maxdiff = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * work_groups, NULL, NULL); if ((uk_buffers[0] == NULL) || (uk_buffers[1] == NULL) || (partial_maxdiff == NULL)) { describe_error("Unable to allocate device memory", rval, stderr); fprintf(stderr, "Total space needed is %d\n", (int)(sizeof(cl_float) * ((nx+2)*2 + work_groups))); exit(EXIT_FAILURE); } printf("DBG time for initialization %g\n", get_time() - DBG_time); DBG_time = get_time(); /* copy uk, ukp1 data to device */ uk_index = 0; ukp1_index = 1; rval = clEnqueueWriteBuffer(commands, uk_buffers[uk_index], CL_TRUE, 0, sizeof(cl_float) * (nx+2), uk, 0, NULL, NULL); if (rval != CL_SUCCESS) error_exit("Could not copy uk to device memory", rval); rval = clEnqueueWriteBuffer(commands, uk_buffers[ukp1_index], CL_TRUE, 0, sizeof(cl_float) * (nx+2), uk, 0, NULL, NULL); if (rval != CL_SUCCESS) error_exit("Could not copy ukp1 to device memory", rval); printf("DBG time for copy to device %g\n", get_time() - DBG_time); DBG_time = get_time(); /* set up to execute kernel over entire range of indices */ domain_size[0] = work_groups * workgroup_size[0]; /* set arguments to compute kernel */ rval = 0; rval |= clSetKernelArg(kernel, 0, sizeof(num_points), &num_points); rval |= clSetKernelArg(kernel, 1, sizeof(points_per_workitem), &points_per_workitem); rval |= clSetKernelArg(kernel, 2, sizeof(dx), &dx); rval |= clSetKernelArg(kernel, 3, sizeof(dt), &dt); /* fill in args 4 and 5 (uk, ukp1) later -- will change on every step */ rval |= clSetKernelArg(kernel, 6, sizeof(cl_float)*workgroup_size[0], NULL); rval |= clSetKernelArg(kernel, 7, sizeof(cl_mem), &partial_maxdiff); if (rval != CL_SUCCESS) error_exit("Unable to set kernel arguments", rval); for (step = 0; (step < maxsteps) && (maxdiff >= threshold); ++step) { /* set remaining arguments to compute kernel */ rval = 0; rval |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &uk_buffers[uk_index]); rval |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &uk_buffers[ukp1_index]); if (rval != CL_SUCCESS) error_exit("Unable to set kernel arguments", rval); /* run kernel to compute new value and maximum difference */ rval = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, domain_size, workgroup_size, 0, NULL, NULL); if (rval != CL_SUCCESS) error_exit("Could not execute kernel", rval); /* read and combine partial maxdiffs */ rval = clEnqueueReadBuffer(commands, partial_maxdiff, CL_TRUE, 0, sizeof(cl_float) * work_groups, partial_maxdiff_host, 0, NULL, NULL ); if (rval != CL_SUCCESS) error_exit("Could not read diffs from device", rval); maxdiff = 0.0; for (int i=0; i < (int)work_groups; ++i) if (partial_maxdiff_host[i] > maxdiff) maxdiff = partial_maxdiff_host[i]; /* "copy" ukp1 to uk by swapping pointers */ temp_index = uk_index; uk_index = ukp1_index; ukp1_index = temp_index; } printf("DBG time for calculation %g\n", get_time() - DBG_time); DBG_time = get_time(); /* copy uk data from device */ rval = clEnqueueReadBuffer(commands, uk_buffers[uk_index], CL_TRUE, 0, sizeof(cl_float) * (nx+2), uk, 0, NULL, NULL); if (rval != CL_SUCCESS) error_exit("Could not copy uk from device memory", rval); printf("DBG time for copy from device %g\n", get_time() - DBG_time); DBG_time = get_time(); end_time = get_time(); if (outfile != NULL) { print_values(outfile, uk, nx); fclose(outfile); } printf("OpenCL program using device:\n"); output_device_info(stdout, device_id, CL_FALSE); printf("%d points per work item, %d work groups of size %d, %d work items\n", points_per_workitem, work_groups, (int)workgroup_size[0], work_groups * (int)workgroup_size[0]); printf("nx = %d, maxsteps = %d, threshold = %g\n", nx, maxsteps, threshold); if (maxdiff < threshold) { printf("converged in %d iterations\n", step); } else { printf("failed to converge in %d iterations, maxdiff = %g\n", step, maxdiff); } printf("execution time = %g\n", end_time - start_time); /* clean up and end */ free(uk); free(partial_maxdiff_host); clReleaseMemObject(uk_buffers[0]); clReleaseMemObject(uk_buffers[1]); clReleaseMemObject(partial_maxdiff); clReleaseKernel(kernel); finalize(context, commands, program_obj); return EXIT_SUCCESS; }