/** * Parallel program to test/time block-based matrix-multiplication routine * (distributed-memory version). * * Command-line arguments: * size of matrices to multiply * number of blocks (in each dimension -- e.g., 2 means split matrices * into 4 blocks each -- and number of blocks must match number of * processes) * optional base filename for output values (not printed if not * given); filenames combine this value with process number */ #include #include #include #define blockstart(M,i,j,rows_per_blk,cols_per_blk,stride) \ (M + ((i)*(rows_per_blk))*(stride) + (j)*(cols_per_blk)) #include "timer.h" #include "cmdline.h" #include "utility.h" #include "matrix-mult-par-utility.h" #include "matrix-mult-par-initfortest.h" #include "matrix-mult-par-print.h" int main(int argc, char *argv[]) { double start_time, end_time; int N = 0; /* input size */ int NB = 0; /* number of blocks */ int dimN, dimP, dimM; /* matrix dimensions */ double *A, *B, *C; /* matrices */ int dimNb, dimPb, dimMb; /* block dimensions */ int myID, nprocs; int myID_i, myID_j; /* block indices */ double *Abuffer, *Bbuffer; /* for blocks of A, B from other procs */ MPI_Status status; FILE* outfile = NULL; /* MPI initialization */ MPI_Init(&argc, &argv); MPI_Comm_size (MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &myID); /* command-line arguments */ char *usage_msg = "usage is %s size numblocks\n"; N = get_integer_arg_extended(argc, argv, 1, 1, "size", usage_msg, true, (void (*)(void))MPI_Finalize); NB = get_integer_arg_extended(argc, argv, 2, 1, "numblocks", usage_msg, true, (void (*)(void))MPI_Finalize); if ((N % NB) != 0) { error_exit("numblocks must evenly divide size\n"); } if ((NB*NB) != nprocs) { error_exit("numblocks squared must match number of processes\n"); } if (argc > 3) { outfile = makeOutfile(argv[3], myID); if (outfile == NULL) { fprintf(stderr, usage_msg, argv[0]); error_exit(NULL); } } dimN = dimP = dimM = N; dimNb = dimN / NB; dimPb = dimP / NB; dimMb = dimM / NB; myID_i = myID / NB; myID_j = myID % NB; A = malloc(dimNb*dimPb*sizeof(*A)); B = malloc(dimPb*dimMb*sizeof(*B)); C = malloc(dimNb*dimMb*sizeof(*C)); Abuffer = malloc(dimNb*dimPb*sizeof(*Abuffer)); Bbuffer = malloc(dimPb*dimMb*sizeof(*Bbuffer)); if ((A == NULL) || (B == NULL) || (C == NULL) || (Abuffer == NULL) || (Bbuffer == NULL)) { fprintf(stderr, "unable to allocate space for matrices of size %d\n", dimN); return EXIT_FAILURE; } /* Initialize matrices */ initialize(A, B, dimNb, dimPb, dimMb, NB, myID_i, myID_j); /* Do the multiply */ /* could use MPI_Wtime but get_time is what sequential code uses */ start_time = get_time(); matclear(C, dimNb, dimMb, dimMb); for (int kb=0; kb < NB; ++kb) { if (myID_j == kb) { /* send A to other processes in the same "row" */ for (int jb=0; jb < NB; ++jb) { if (jb != myID_j) MPI_Send(A, dimNb*dimPb, MPI_DOUBLE, myID_i*NB + jb, 0, MPI_COMM_WORLD); } /* copy A to Abuffer */ memcpy(Abuffer, A, dimNb*dimPb*sizeof(*A)); } else { /* receive block of A */ MPI_Recv(Abuffer, dimNb*dimPb, MPI_DOUBLE, myID_i*NB + kb, 0, MPI_COMM_WORLD, &status); } if (myID_i == kb) { /* send B to other processes in the same "column" */ for (int ib=0; ib < NB; ++ib) { if (ib != myID_i) MPI_Send(B, dimPb*dimMb, MPI_DOUBLE, ib*NB + myID_j, 0, MPI_COMM_WORLD); } /* copy B to Bbuffer */ memcpy(Bbuffer, B, dimPb*dimMb*sizeof(*A)); } else { /* receive block of A */ MPI_Recv(Bbuffer, dimPb*dimMb, MPI_DOUBLE, kb*NB + myID_j, 0, MPI_COMM_WORLD, &status); } /* compute product of block[ib][kb] of A and block[kb][jb] of B and add to block[ib][jb] of C */ matmul_add( Abuffer, Bbuffer, C, dimNb, dimPb, dimMb, dimPb, dimMb, dimMb ); } end_time = get_time(); /* Print results */ if (outfile != NULL) { printMatrix(outfile, "A", A, dimNb, dimPb, dimPb, myID_i, myID_j); printMatrix(outfile, "B", B, dimPb, dimMb, dimMb, myID_i, myID_j); printMatrix(outfile, "A*B", C, dimNb, dimMb, dimMb, myID_i, myID_j); fclose(outfile); } /* Print timing results */ if (myID == 0) { fprintf(stdout, "Block-based program, parallel with %d processes\n", nprocs); fprintf(stdout, "Size = %d, numblocks = %d, time for multiplication = %g\n", N, NB, end_time - start_time); } /* MPI cleanup */ MPI_Finalize(); return EXIT_SUCCESS; }