In: Computer Science
OpenACC.
Insert OpenACC directives to improve the performance only within the matmul function. Enhance the comments throughout.
Clearly identify, in your report, the execution time of your implementation the algorithm. How large do the matrices need to be before the performance of a P100 exceeds that of 28 cores on Bridges (using square matrices with power of two order)?
///////////////////////////////////////////////////////////////////////////////
// matmul.c
//
// Procedures:
//
// main generates matrices and tests matmul
// matmul basic, brute force matrix multiply
///////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <sys/time.h>
///////////////////////////////////////////////////////////////////////////////
// int main( int argc, char *argv[] )
// Description: Generates two matrices and then calls matmul to
multiply them.
// Finally, it verifies that the results are
correct.
//
// Parameters:
// argc I/P int The
number of arguments on the command line
// argv I/P char
*[] The arguments on the command line
// main O/P int
Status code
///////////////////////////////////////////////////////////////////////////////
#ifndef L
#define L (1*1024/1)
#endif
#ifndef M
#ifdef SQUARE
#define M L
#else
#define M (1*1024/1)
#endif
#endif
#ifndef N
#ifdef SQUARE
#define N L
#else
#define N (1*1024/1)
#endif
#endif
float A[L*M], B[M*N], C[L*N];
int matmul( int l, int m, int n, float *A, float *B, float *C );
int main( int argc, char *argv[] )
{
int i, j, k;
#ifdef OMP
#pragma omp parallel
{
int np = omp_get_num_procs();
fprintf( stderr, "Threads = %d\n", np );
}
#endif
for( i=0; i<L; i++ )
for( j=0; j<M; j++ )
{
if( i <= j )
{
A[i*M+j] = (float) (i*M+j+1);
}
else
{
A[i*M+j] = 0.0;
A[i*M+j] = (float) (i*M+j+1);
}
}
for( j=0; j<M; j++ )
for( k=0; k<N; k++ )
{
if( j <= k )
{
if( k < M )
B[j*N+k] = 1.0;
else
B[j*N+k] = B[j*N+k-1] + 1.0;
}
else
{
B[j*N+k] = 0.0;
}
}
for( i=0; i<L; i++ )
for( k=0; k<N; k++ )
{
C[i*N+k] = - (float) L*M*N;
}
struct timeval start, stop;
gettimeofday( &start, NULL );
matmul( L, M, N, A, B, C );
gettimeofday( &stop, NULL );
float elapsed = ( (stop.tv_sec-start.tv_sec) +
(stop.tv_usec-start.tv_usec)/(float)1000000 );
float flops = ( 2 * (float)L * (float)M * (float)N ) / elapsed;
printf( "L=%d, M=%d, N=%d, elapsed=%g,
flops=%g\n",
L, M, N, elapsed, flops );
#ifdef DEBUG
printf( "A:\n" );
for( i=0; i<L; i++ )
{
printf( "%g", A[i*M] );
for( j=1; j<M; j++ )
{
printf( " %g", A[i*M+j] );
}
printf( "\n" );
}
printf( "B:\n" );
for( j=0; j<M; j++ )
{
printf( "%g", B[j*N] );
for( k=1; k<N; k++ )
{
printf( " %g", B[j*N+k] );
}
printf( "\n" );
}
printf( "C:\n" );
for( i=0; i<L; i++ )
{
printf( "%g", C[i*N] );
for( k=1; k<N; k++ )
{
printf( " %g", C[i*N+k] );
}
printf( "\n" );
}
#endif
}
///////////////////////////////////////////////////////////////////////////////
// int main( int argc, char *argv[] )
// Description: Generates two matrices and then calls matmul to
multiply them.
// Finally, it verifies that the results are
correct.
//
// Parameters:
// l I/P int The
first dimension of A and C
// m I/P int The
second dimension of A and first of B
// n I/P int The
second dimension of B and C
// A I/P float *
The first input matrix
// B I/P float *
The second input matrix
// C O/P float *
The output matrix
// matmul O/P int
Status code
///////////////////////////////////////////////////////////////////////////////
int matmul( int l, int m, int n, float *restrict A, float *restrict
B, float *restrict C )
{
int i, j, k;
for( i=0; i<l; i++ )
// Loop
over the rows of A and C.
for( k=0; k<n; k++ )
// Loop over the columns of B
and C
{
// Initialize the output element for the inner
// product of row i of A with column j of B
C[i*n+k] = 0;
for( j=0; j<m; j++ )
// Loop over the columns of A
and C
{
C[i*n+k] += A[i*m+j] *
B[j*n+k]; // Compute the inner product
}
}
}
matmul.c :
#include <omp.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <stdio.h>
#define random() ((double) rand() / (RAND_MAX))
void printMat(double** mati, int size) {
int i,j;
for (i = 0; i < size; i++) {
for (j = 0; j < size; j++) {
printf("%lf ", mati[i][j]);
}
printf("\n");
}
}
void printVec(double* vec, int size) {
int i;
for (i = 0; i < size; i++) {
printf("%lf ", vec[i]);
}
printf("\n");
}
int size, thread_num;
int main(int argc, char* argv[])
{
size = atoi(argv[2]);
thread_num = atoi(argv[1]);
omp_set_num_threads(thread_num);
srand(time(NULL));
double** a = (double*) malloc(size * sizeof(double));
double** b = (double*) malloc(size * sizeof(double));
double** c = (double*) malloc(size * sizeof(double));
int i, j, k;
for (i = 0; i < size; i++) {
a[i] = (double*) malloc(size * sizeof(double));
b[i] = (double*) malloc(size * sizeof(double));
c[i] = (double*) malloc(size * sizeof(double));
}
for (i = 0; i < size; ++i) {
for (j = 0; j < size; ++j) {
a[i][j] = 1.0; // random();
b[i][j] = 1.0; //random();
c[i][j] = 0.0;
}
}
struct timeval start, end;
gettimeofday(&start, NULL);
#pragma omp parallel for shared(a,b,c) private(i, j, k)
for (i = 0; i < size; ++i) {
for (j = 0; j < size; ++j) {
for (k = 0; k < size; ++k) {
c[i][j] += a[i][k] * b[k][j];
}
}
}
gettimeofday(&end, NULL);
double delta = ((end.tv_sec - start.tv_sec) * 1000000u +
end.tv_usec - start.tv_usec) / 1.e6;
printf("%d %d %lf\n", thread_num, size, delta);
printMat(a, size);
printf("---------------------------------\n");
printMat(b, size);
printf("---------------------------------\n");
printMat(c, size);
printf("---------------------------------\n");
return 0;
}

