/* * heat diffusion example, as discussed in textbook. * * version that uses OpenCL kernel to do computation. * * disclaimer: performance is quite disappointing, but it's beyond the * scope of this course this year to fix that. * * command line arguments: * * number of points * maximum number of iterations * convergence threshold * number of work items * workgroup size factor (multiply this by "preferred size" to get size) * (optional) CPU to run kernel on CPU (default is GPU) * (optional) filename to print final values (values are not printed if no * filename is given) */ #include #include #include #include #include #include "timer.h" #include "cmdline.h" #include "utility.h" #define LEFTVAL 1.0f #define RIGHTVAL 10.0f #include "heat-eqn-functions.h" #include "heat-eqn-par-functions.h" void initialize_values(float uk[], int nx); const char *kernel_file = "heat-eqn-par-kernel.h"; int main(int argc, char *argv[]) { int nx; int maxsteps; float threshold; int work_items; int workgroup_size_factor; FILE* outfile; cl_int num_points; cl_int work_groups; cl_int points_per_workitem; cl_device_type device_type; cl_float *uk; cl_float dx, dt; cl_float maxdiff; int step; double start_time, end_time; double DBG_time; cl_float *partial_maxdiff_host; cl_int rval; cl_uint compute_units; size_t preferred_workgroup_size, max_workgroup_size; size_t domain_size; size_t workgroup_size; cl_device_id device_id; cl_context context; cl_command_queue commands; cl_program program_obj; cl_kernel kernel; cl_mem partial_maxdiff; cl_mem uk_buffers[2]; int uk_index, ukp1_index, temp_index; parse_arguments(argc, argv, &nx, &maxsteps, &threshold, &work_items, &workgroup_size_factor, &outfile, &device_type); start_time = get_time(); DBG_time = start_time; /* initialize host variables */ /* uk has nx interior points plus fixed ends */ /* (no need to keep ukp1 in host) */ uk = malloc(sizeof(*uk) * (nx+2)); if (uk == NULL) { fprintf(stderr, "Unable to allocate host memory\n"); return EXIT_FAILURE; } dx = 1.0f/nx; dt = 0.5f*dx*dx; maxdiff = threshold; initialize_values(uk, nx); /* get first device of requested type (exits if none found) */ getDevice(device_type, &device_id); /* create context, command queue, and program object */ char *kernel_source = source_from_file(kernel_file); const char* kernel_sources[] = { kernel_source }; initialize( &device_id, &context, &commands, &program_obj, 1, kernel_sources); printf("Compute device:\n"); output_device_info(stdout, device_id); /* create compute kernel from program object */ kernel = clCreateKernel(program_obj, "do_step", &rval); if ((rval != CL_SUCCESS) || (kernel == NULL)) error_exit("Unable to create compute kernel", rval); /* get info about compute units, work group sizes */ rval = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL); if (rval != CL_SUCCESS) { compute_units = 0; describe_error("Could not retrieve device info", rval, stderr); } rval = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(preferred_workgroup_size), &preferred_workgroup_size, NULL); if (rval != CL_SUCCESS) error_exit("Could not retrieve kernel work group info", rval); rval = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_workgroup_size), &max_workgroup_size, NULL); if (rval != CL_SUCCESS) error_exit("Could not retrieve kernel work group info", rval); /* set workgroup size */ workgroup_size = preferred_workgroup_size * workgroup_size_factor; if (workgroup_size > max_workgroup_size) { fprintf(stderr, "Computed workgroup size %zd exceeds maximum %zd\n", workgroup_size, max_workgroup_size); exit(EXIT_FAILURE); } if ((work_items % workgroup_size) != 0) { fprintf(stderr, "Number of work items %d must divide workgroup size %zd\n", work_items, workgroup_size); exit(EXIT_FAILURE); } work_groups = work_items / workgroup_size; printf("DBG time for initialization %g\n", get_time() - DBG_time); DBG_time = get_time(); /* set constant arguments for kernel */ num_points = nx; points_per_workitem = num_points / work_items; if ((num_points % work_items) != 0) points_per_workitem += 1; /* allocate host memory for partial maxdiff results */ partial_maxdiff_host = malloc(sizeof(*partial_maxdiff_host)*work_groups); if (partial_maxdiff_host == NULL) { fprintf(stderr, "Unable to allocate host memory\n"); return EXIT_FAILURE; } /* allocate device memory for uk (old and new), partial maxdiff results */ uk_buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * (nx+2), NULL, NULL); uk_buffers[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * (nx+2), NULL, NULL); partial_maxdiff = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * work_groups, NULL, NULL); if ((uk_buffers[0] == NULL) || (uk_buffers[1] == NULL) || (partial_maxdiff == NULL)) { error_exit("Could not allocate device memory", rval); } printf("DBG time for initialization %g\n", get_time() - DBG_time); DBG_time = get_time(); /* copy uk, ukp1 data to device */ uk_index = 0; ukp1_index = 1; rval = clEnqueueWriteBuffer(commands, uk_buffers[uk_index], CL_TRUE, 0, sizeof(cl_float) * (nx+2), uk, 0, NULL, NULL); if (rval != CL_SUCCESS) error_exit("Could not copy uk to device memory", rval); rval = clEnqueueWriteBuffer(commands, uk_buffers[ukp1_index], CL_TRUE, 0, sizeof(cl_float) * (nx+2), uk, 0, NULL, NULL); if (rval != CL_SUCCESS) error_exit("Could not copy ukp1 to device memory", rval); printf("DBG time for copy to device %g\n", get_time() - DBG_time); DBG_time = get_time(); /* set up to execute kernel over entire range of indices */ domain_size = work_groups * workgroup_size; /* set arguments to compute kernel */ rval = 0; rval |= clSetKernelArg(kernel, 0, sizeof(num_points), &num_points); rval |= clSetKernelArg(kernel, 1, sizeof(points_per_workitem), &points_per_workitem); rval |= clSetKernelArg(kernel, 2, sizeof(dx), &dx); rval |= clSetKernelArg(kernel, 3, sizeof(dt), &dt); /* fill in args 4 and 5 (uk, ukp1) later -- will change on every step */ rval |= clSetKernelArg(kernel, 6, sizeof(cl_float)*workgroup_size, NULL); rval |= clSetKernelArg(kernel, 7, sizeof(cl_mem), &partial_maxdiff); if (rval != CL_SUCCESS) error_exit("Unable to set kernel arguments", rval); for (step = 0; (step < maxsteps) && (maxdiff >= threshold); ++step) { /* set remaining arguments to compute kernel */ rval = 0; rval |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &uk_buffers[uk_index]); rval |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &uk_buffers[ukp1_index]); if (rval != CL_SUCCESS) error_exit("Unable to set kernel arguments", rval); /* run kernel to compute new value and maximum difference */ rval = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &domain_size, &workgroup_size, 0, NULL, NULL); if (rval != CL_SUCCESS) error_exit("Could not execute kernel", rval); /* read and combine partial maxdiffs */ rval = clEnqueueReadBuffer(commands, partial_maxdiff, CL_TRUE, 0, sizeof(cl_float) * work_groups, partial_maxdiff_host, 0, NULL, NULL ); if (rval != CL_SUCCESS) error_exit("Could not read diffs from device", rval); maxdiff = 0.0; for (int i=0; i < (int)work_groups; ++i) if (partial_maxdiff_host[i] > maxdiff) maxdiff = partial_maxdiff_host[i]; /* "copy" ukp1 to uk by swapping pointers */ temp_index = uk_index; uk_index = ukp1_index; ukp1_index = temp_index; } printf("DBG time for calculation %g\n", get_time() - DBG_time); DBG_time = get_time(); /* copy uk data from device */ rval = clEnqueueReadBuffer(commands, uk_buffers[uk_index], CL_TRUE, 0, sizeof(cl_float) * (nx+2), uk, 0, NULL, NULL); if (rval != CL_SUCCESS) error_exit("Could not copy uk from device memory", rval); printf("DBG time for copy from device %g\n", get_time() - DBG_time); DBG_time = get_time(); end_time = get_time(); if (outfile != NULL) { print_values(outfile, uk, nx); fclose(outfile); } puts(""); printf("OpenCL program with %d work groups of size %zd\n", work_groups, workgroup_size); printf("nx = %d, maxsteps = %d, threshold = %g\n", nx, maxsteps, threshold); if (maxdiff < threshold) { printf("converged in %d iterations\n", step); } else { printf("failed to converge in %d iterations, maxdiff = %g\n", step, maxdiff); } printf("execution time = %g\n", end_time - start_time); /* clean up and end */ free(uk); free(partial_maxdiff_host); clReleaseMemObject(uk_buffers[0]); clReleaseMemObject(uk_buffers[1]); clReleaseMemObject(partial_maxdiff); clReleaseKernel(kernel); finalize(context, commands, program_obj); return EXIT_SUCCESS; }