/*
 * Parallel program to test/time block-based matrix-multiplication routine
 * (distributed-memory version).
 *
 * Version using additional communicators to broadcast.
 *
 * Command-line arguments:
 *   size of matrices to multiply
 *   [ number of blocks not specified -- inferred from number of processes ]
 *   optional base filename for output values (not printed if not
 *     given); filenames combine this value with process number
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>    /* for true */
#include <string.h>     /* for memcpy() */
#include <math.h>       /* for sqrt() */
#include <mpi.h>

/* compute starting address of block (i,j) in matrix M stored with row
   stride "stride" */
#define blockstart(M,i,j,rows_per_blk,cols_per_blk,stride) \
    (M + ((i)*(rows_per_blk))*(stride) + (j)*(cols_per_blk))

#include "timer.h"
#include "cmdline.h"
#include "utility.h"
#include "matrix-mult-par-utility.h"
#include "matrix-mult-par-initfortest.h"
#include "matrix-mult-par-print.h"

int main(int argc, char *argv[]) {

    double start_time, end_time;
    int N = 0;                  /* input size */
    int NB = 0;                 /* number of blocks */
    int dimN, dimP, dimM;       /* matrix dimensions */
    double *A, *B, *C;          /* matrices */
    int dimNb, dimPb, dimMb;    /* block dimensions */
    int myID, nprocs;
    int myID_i, myID_j;         /* block indices */
    double *Abuffer, *Bbuffer;  /* for blocks of A, B from other procs */
    FILE* outfile = NULL;
    MPI_Comm row_comm;
    MPI_Comm col_comm;

    /* MPI initialization */
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myID);

    /* command-line arguments */
    char *usage_msg = "usage is %s size [outfile]\n";
    N = get_integer_arg_extended(argc, argv, 1, 1, "size", usage_msg,
            true, (void (*)(void))MPI_Finalize);
    NB = (int) sqrt(nprocs);
    if ((NB*NB) != nprocs) {
        error_exit("number of processes must be a perfect square\n");
    }
    if ((N % NB) != 0) {
        error_exit("sqrt(nprocs) must evenly divide size\n");
    }
    if (argc > 2) {
        outfile = makeOutfile(argv[2], myID);
        if (outfile == NULL) {
            fprintf(stderr, usage_msg, argv[0]);
            error_exit(NULL);
        }
    }

    dimN = dimP = dimM = N;
    dimNb = dimN / NB;  dimPb = dimP / NB;  dimMb = dimM / NB;
    myID_i = myID / NB;
    myID_j = myID % NB;

    A = malloc(dimNb*dimPb*sizeof(*A));
    B = malloc(dimPb*dimMb*sizeof(*B));
    C = malloc(dimNb*dimMb*sizeof(*C));
    Abuffer = malloc(dimNb*dimPb*sizeof(*Abuffer));
    Bbuffer = malloc(dimPb*dimMb*sizeof(*Bbuffer));
    if ((A == NULL) || (B == NULL) || (C == NULL)
            || (Abuffer == NULL) || (Bbuffer == NULL)) {
        fprintf(stderr, "unable to allocate space for matrices of size %d\n",
                dimN);
        return EXIT_FAILURE;
    }

    /*
     * Set up additional communicators so we can broadcast within rows,
     * columns of process grid.
     */
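    /*
     * MPI_Comm_split partitions MPI_COMM_WORLD by its "color" argument
     * and orders ranks within each new communicator by its "key"
     * argument.  Processes sharing a grid-row index myID_i thus form one
     * row_comm, ranked by column index myID_j, so a process's rank in
     * row_comm equals myID_j; likewise its rank in col_comm equals
     * myID_i.  This is what lets the multiplication loop below use the
     * block index kb directly as the broadcast root.
     */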
    MPI_Comm_split(MPI_COMM_WORLD, myID_i, myID_j, &row_comm);
    MPI_Comm_split(MPI_COMM_WORLD, myID_j, myID_i, &col_comm);

    /* ranks within new communicators (computed for reference; not
       otherwise used) */
    int temp1, temp2;
    MPI_Comm_rank(row_comm, &temp1);
    MPI_Comm_rank(col_comm, &temp2);

    /* Initialize matrices */
    initialize(A, B, dimNb, dimPb, dimMb, NB, myID_i, myID_j);

    /* Do the multiply */
    /* could use MPI_Wtime but get_time is what sequential code uses */
    start_time = get_time();

    matclear(C, dimNb, dimMb, dimMb);

    for (int kb = 0; kb < NB; ++kb) {

        /*
         * broadcast block of A to other processes in same row of
         * process grid
         */
        if (myID_j == kb) {
            /* copy A to Abuffer */
            memcpy(Abuffer, A, dimNb*dimPb*sizeof(*A));
        }
        MPI_Bcast(Abuffer, dimNb*dimPb, MPI_DOUBLE, kb, row_comm);

        /*
         * broadcast block of B to other processes in same column of
         * process grid
         */
        if (myID_i == kb) {
            /* copy B to Bbuffer */
            memcpy(Bbuffer, B, dimPb*dimMb*sizeof(*B));
        }
        MPI_Bcast(Bbuffer, dimPb*dimMb, MPI_DOUBLE, kb, col_comm);

        /*
         * compute product of block[ib][kb] of A and block[kb][jb] of B
         * and add to block[ib][jb] of C
         */
        matmul_add(Abuffer, Bbuffer, C,
                dimNb, dimPb, dimMb, dimPb, dimMb, dimMb);
    }

    end_time = get_time();

    /* Print results */
    if (outfile != NULL) {
        printMatrix(outfile, "A", A, dimNb, dimPb, dimPb, myID_i, myID_j);
        printMatrix(outfile, "B", B, dimPb, dimMb, dimMb, myID_i, myID_j);
        printMatrix(outfile, "A*B", C, dimNb, dimMb, dimMb, myID_i, myID_j);
        fclose(outfile);
    }

    /* Print timing results */
    if (myID == 0) {
        fprintf(stdout,
                "Block-based program, parallel (v2) with %d processes\n",
                nprocs);
        fprintf(stdout,
                "Size = %d, numblocks = %d, time for multiplication = %g\n",
                N, NB, end_time - start_time);
    }

    /* MPI cleanup */
    MPI_Comm_free(&row_comm);
    MPI_Comm_free(&col_comm);
    MPI_Finalize();

    free(A); free(B); free(C);
    free(Abuffer); free(Bbuffer);

    return EXIT_SUCCESS;
}
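
/*
 * Example build and run (a sketch -- the output name "matrix-mult-par2"
 * and source filenames are assumptions; adjust for your MPI installation
 * and add any supporting source files the quoted headers require):
 *
 *   mpicc -o matrix-mult-par2 matrix-mult-par2.c -lm
 *   mpirun -np 4 ./matrix-mult-par2 1024
 *
 * With 4 processes the process grid is 2x2 (NB = 2), so the matrix size
 * must be divisible by 2; each process then holds one 512x512 block of
 * each matrix.
 */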