/*
 * numerical integration example, as discussed in textbook:  
 *
 * compute pi by approximating the area under the curve f(x) = 4 / (1 + x*x)
 * between 0 and 1.
 *
 * version that uses OpenCL kernel to do computation, as discussed in 
 * Appendix D, but adds command-line arguments to allow varying some things.
 * 
 * command-line arguments:
 *
 * number of steps
 * number of work items
 * workgroup size factor (multiply this by "preferred size" to get size)
 * (optional) CPU to run kernel on CPU (default is GPU)
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
/* copied from not-strictly-standard part of math.h */
#define M_PI		3.14159265358979323846

#include <CL/cl.h>
#include "timer.h"
#include "utility.h"

const char *kernel_file = "num-int-par-file-kernel.h";

/* main program */
int main(int argc, char *argv[]) {

    char* usage_fmt = "usage:  %s "
        "number_of_steps num_work_items workgroup_size_factor [CPU]\n";

    /* process command-line arguments */
    char* end_ptr_for_strtol;
    if (argc < 4)  {
        fprintf(stderr, usage_fmt, argv[0]);
        exit(EXIT_FAILURE);
    }
    long num_steps = strtol(argv[1], &end_ptr_for_strtol, 10);
    if (*end_ptr_for_strtol != '\0') {
        fprintf(stderr, usage_fmt, argv[0]);
        exit(EXIT_FAILURE);
    }

    long num_work_items = strtol(argv[2], &end_ptr_for_strtol, 10);
    if (*end_ptr_for_strtol != '\0') {
        fprintf(stderr, usage_fmt, argv[0]);
        exit(EXIT_FAILURE);
    }

    long workgroup_size_factor = strtol(argv[3], &end_ptr_for_strtol, 10);
    if (*end_ptr_for_strtol != '\0') {
        fprintf(stderr, usage_fmt, argv[0]);
        exit(EXIT_FAILURE);
    }

    cl_device_type device_type = CL_DEVICE_TYPE_GPU;
    if ((argc > 4) && (strcmp(argv[4], "CPU") == 0)) {
        device_type = CL_DEVICE_TYPE_CPU;
    }

    double start_time, end_time;
    cl_float pi;
    cl_int nsteps;
    cl_int work_groups;
    cl_float step_size;
    cl_float *partial_sums_host; /* partial sums from compute device */

    cl_int rval;

    cl_uint compute_units;

    size_t preferred_workgroup_size, max_workgroup_size;
    size_t domain_size;
    size_t workgroup_size;

    cl_device_id device_id;
    cl_context context;
    cl_command_queue commands;
    cl_program program_obj;
    cl_kernel kernel;

    cl_mem partial_sums; /* device memory for partial sums */

    /* record start time */
    start_time = get_time();

    /* get first GPU device (exits if none found) */
    getDevice(device_type, &device_id);

    /* create context, command queue, and program object */
    char *kernel_source = source_from_file(kernel_file);
    const char* kernel_sources[] = { kernel_source };
    initialize(
            &device_id, &context, &commands, &program_obj, 1, kernel_sources);
    printf("Compute device:\n");
    output_device_info(stdout, device_id);
    puts("");
 
    /* create compute kernel from program object */
    kernel = clCreateKernel(program_obj, "pi", &rval);
    if ((rval != CL_SUCCESS) || (kernel == NULL))
        error_exit("Could not create compute kernel", rval);

    /* get info about compute units, work group sizes */
    rval = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS,
            sizeof(compute_units), &compute_units, NULL);
    if (rval != CL_SUCCESS) {
        compute_units = 0;
        describe_error("Could not retrieve device info", rval, stderr);
    }
    rval = clGetKernelWorkGroupInfo(kernel, device_id, 
            CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
            sizeof(preferred_workgroup_size), &preferred_workgroup_size, NULL);
    if (rval != CL_SUCCESS) 
        error_exit("Could not retrieve kernel work group info", rval);
    rval = clGetKernelWorkGroupInfo(kernel, device_id, 
            CL_KERNEL_WORK_GROUP_SIZE, 
            sizeof(max_workgroup_size), &max_workgroup_size, NULL);
    if (rval != CL_SUCCESS) 
        error_exit("Could not retrieve kernel work group info", rval); 

    /* set workgroup size */
    workgroup_size = preferred_workgroup_size * workgroup_size_factor;
    if (workgroup_size > max_workgroup_size) {
        fprintf(stderr, "Computed workgroup size %zd exceeds maximum %zd\n",
                workgroup_size, max_workgroup_size);
        exit(EXIT_FAILURE);
    }
    if ((num_work_items % workgroup_size) != 0) {
        fprintf(stderr, 
                "Number of work items %zd must divide workgroup size %zd\n",
                num_work_items, workgroup_size);
        exit(EXIT_FAILURE);
    }
    work_groups = num_work_items / workgroup_size;

    /* set constant arguments for kernel */
    nsteps = num_steps;
    step_size = 1.0f/(cl_float)nsteps;

    /* allocate space for partial sums on host */
    partial_sums_host = malloc(sizeof(*partial_sums_host)*work_groups);
    if (partial_sums_host == NULL) 
        error_exit("Could not allocate memory for partial sums", CL_SUCCESS);

    /* create output buffer to hold partial sums */
    partial_sums = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
            sizeof(cl_float) * work_groups, NULL, NULL);
    if (partial_sums == NULL)
        error_exit("Could not allocate device memory", rval);

    /* set arguments to compute kernel */
    /* note argument 2 is a local array so all we do here is define its size */
    rval = 0;
    rval  = clSetKernelArg(kernel, 0, sizeof(nsteps), &nsteps);
    rval |= clSetKernelArg(kernel, 1, sizeof(step_size), &step_size);
    rval |= clSetKernelArg(kernel, 2, sizeof(cl_float)*workgroup_size, NULL);
    rval |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &partial_sums);
    if (rval != CL_SUCCESS)
        error_exit("Could not set kernel arguments", rval);

    /* execute kernel over entire range of 1d input data set */
    domain_size = work_groups * workgroup_size;
    rval = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, 
            &domain_size, &workgroup_size, 0, NULL, NULL);
    if (rval != CL_SUCCESS)
        error_exit("Could not execute kernel", rval); 

    /* wait for commands to complete before reading back results */
    clFinish(commands);

    /* read back results from compute device */
    rval = clEnqueueReadBuffer(commands, partial_sums, CL_TRUE, 0, 
            sizeof(cl_float) * work_groups, partial_sums_host, 0, NULL, NULL );  
    if (rval != CL_SUCCESS)
        error_exit("Could not read output array", rval);

    /* complete sum and compute final integral value */
    pi = 0.0;
    for (int i=0; i < (int)work_groups; ++i)
        pi += partial_sums_host[i];
    pi *= step_size;

    /* record end time */
    end_time = get_time();

    /* print results */
    printf("OpenCL program result with %ld steps, "
            "%ld work groups of size %zd\n",
            num_steps,
            (long)work_groups, workgroup_size
          );
    printf("computed pi = %g  (%17.15f)\n",pi, pi);
    printf("difference between computed pi and math.h M_PI = %17.15f\n", 
            fabs(pi - M_PI));
    printf("time to compute = %g seconds\n", end_time - start_time);

    /* clean up then shut down */
    free(kernel_source);
    free(partial_sums_host);
    clReleaseMemObject(partial_sums);
    clReleaseKernel(kernel);
    finalize(context, commands, program_obj);
 
    return EXIT_SUCCESS;
}