Question

In: Computer Science

OpenACC. Insert OpenACC directives to improve the performance only within the matmul function. Enhance the comments...

OpenACC.

Insert OpenACC directives to improve the performance only within the matmul function. Enhance the comments throughout.

Clearly identify, in your report, the execution time of your implementation of the algorithm. How large do the matrices need to be before the performance of a P100 exceeds that of 28 cores on Bridges (using square matrices with power-of-two order)?

///////////////////////////////////////////////////////////////////////////////
// matmul.c
//
// Procedures:
//
// main   generates matrices and tests matmul
// matmul   basic, brute force matrix multiply
///////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <sys/time.h>
#ifdef OMP
#include <omp.h>
#endif

///////////////////////////////////////////////////////////////////////////////
// int main( int argc, char *argv[] )
// Description: Generates two matrices, calls matmul to multiply them, and
//    reports the elapsed time and floating-point operation rate. When DEBUG
//    is defined, it also prints A, B, and C.
//
// Parameters:
//   argc   I/P   int   The number of arguments on the command line
//   argv   I/P   char *[]   The arguments on the command line
//   main   O/P   int   Status code
///////////////////////////////////////////////////////////////////////////////
// Compile-time matrix dimensions: A is L x M, B is M x N, and C is L x N.
// Each macro is only defined here if it has not been defined already (for
// example with a -D compiler option); the default for each is 1024.
#ifndef L
#define L (1*1024/1)
#endif
// If M is not supplied, it follows L when SQUARE is defined.
#ifndef M
#ifdef SQUARE
#define M L
#else
#define M (1*1024/1)
#endif
#endif
// If N is not supplied, it follows L when SQUARE is defined.
#ifndef N
#ifdef SQUARE
#define N L
#else
#define N (1*1024/1)
#endif
#endif
// File-scope storage for the two operands and the result, held as flat
// arrays that main indexes in row-major order.
float A[L*M], B[M*N], C[L*N];

// Forward declaration; the definition appears after main.
int matmul( int l, int m, int n, float *A, float *B, float *C );

int main( int argc, char *argv[] )
{
   int i, j, k;

#ifdef OMP
   // Report the processor count seen by the OpenMP runtime. Every thread of
   // the parallel region runs this block, so the line appears once per thread.
#pragma omp parallel
{
   int np = omp_get_num_procs();
   fprintf( stderr, "Threads = %d\n", np );
}
#endif

   // Fill A (L x M) so that element (i,j) holds its 1-based linear index.
   for( i=0; i<L; i++ )
   {
      for( j=0; j<M; j++ )
      {
         A[i*M+j] = (float) (i*M+j+1);
      }
   }

   // Fill B (M x N): zero below the main diagonal; on and above it, 1.0 for
   // columns k < M and one more than the element to its left otherwise.
   for( j=0; j<M; j++ )
   {
      for( k=0; k<N; k++ )
      {
         if( j <= k )
         {
            if( k < M )
               B[j*N+k] = 1.0;
            else
               B[j*N+k] = B[j*N+k-1] + 1.0;
         }
         else
         {
            B[j*N+k] = 0.0;
         }
      }
   }

   // Pre-fill C with a negative value; matmul overwrites every element.
   for( i=0; i<L; i++ )
   {
      for( k=0; k<N; k++ )
      {
         C[i*N+k] = - (float) L*M*N;
      }
   }

   // Time only the multiplication itself using wall-clock time.
   struct timeval start, stop;
   gettimeofday( &start, NULL );
   matmul( L, M, N, A, B, C );
   gettimeofday( &stop, NULL );
   float elapsed = ( (stop.tv_sec-start.tv_sec) +
           (stop.tv_usec-start.tv_usec)/(float)1000000 );

   // One multiply and one add per inner-loop iteration: 2*L*M*N operations.
   float flops = ( 2 * (float)L * (float)M * (float)N ) / elapsed;

   printf( "L=%d, M=%d, N=%d, elapsed=%g, flops=%g\n",
       L, M, N, elapsed, flops );

#ifdef DEBUG
   // Dump each matrix, one row per line with space-separated elements.
   printf( "A:\n" );
   for( i=0; i<L; i++ )
   {
      printf( "%g", A[i*M] );
      for( j=1; j<M; j++ )
      {
         printf( " %g", A[i*M+j] );
      }
      printf( "\n" );
   }

   printf( "B:\n" );
   for( j=0; j<M; j++ )
   {
      printf( "%g", B[j*N] );
      for( k=1; k<N; k++ )
      {
         printf( " %g", B[j*N+k] );
      }
      printf( "\n" );
   }

   printf( "C:\n" );
   for( i=0; i<L; i++ )
   {
      printf( "%g", C[i*N] );
      for( k=1; k<N; k++ )
      {
         printf( " %g", C[i*N+k] );
      }
      printf( "\n" );
   }
#endif

   return 0;
}

///////////////////////////////////////////////////////////////////////////////
// int matmul( int l, int m, int n, float *A, float *B, float *C )
// Description: Computes the matrix product C = A * B with the basic
//    triple-loop algorithm. The matrices are flat arrays indexed in
//    row-major order.
//
// Parameters:
//   l   I/P   int   The first dimension of A and C
//   m   I/P   int   The second dimension of A and first of B
//   n   I/P   int   The second dimension of B and C
//   A   I/P   float *   The first input matrix (l x m)
//   B   I/P   float *   The second input matrix (m x n)
//   C   O/P   float *   The output matrix (l x n); every element is overwritten
//   matmul   O/P   int   Status code (always 0)
///////////////////////////////////////////////////////////////////////////////
int matmul( int l, int m, int n, float *restrict A, float *restrict B, float *restrict C )
{
   int i, j, k;

   for( i=0; i<l; i++ )               // Loop over the rows of A and C
   {
      for( k=0; k<n; k++ )            // Loop over the columns of B and C
      {
         // Initialize the output element for the inner
         // product of row i of A with column k of B
         C[i*n+k] = 0;
         for( j=0; j<m; j++ )         // Loop over the columns of A and rows of B
         {
            C[i*n+k] += A[i*m+j] * B[j*n+k];   // Accumulate the inner product
         }
      }
   }

   // The function is declared to return a status code, so return one.
   return 0;
}

Solutions

Expert Solution

matmul.c :

#include <omp.h>

#include <stdlib.h>

#include <sys/time.h>

#include <time.h>

#include <stdio.h>

// Pseudo-random double in [0, 1], obtained by dividing rand() by RAND_MAX.
#define random() ((double) rand() / (RAND_MAX))

// Print the leading size x size entries of mati to stdout, one matrix row
// per output line, with a space after every value.
void printMat(double** mati, int size) {
    for (int row = 0; row < size; row++) {
        double* current = mati[row];
        int col = 0;
        while (col < size) {
            printf("%lf ", current[col]);
            col++;
        }
        printf("\n");
    }
}

// Print the first size entries of vec to stdout on a single line, with a
// space after every value, then end the line.
void printVec(double* vec, int size) {
    int idx = 0;
    while (idx < size) {
        printf("%lf ", vec[idx]);
        idx++;
    }
    printf("\n");
}

// Matrix order and OpenMP thread count; main sets both from the command line.
int size, thread_num;

// Parse text as a positive decimal int. Returns the value, or 0 when the text
// is not a number, has trailing characters, is not positive, or does not
// survive a round trip through int (i.e. it is too large).
static int parse_positive_int(const char *text) {
    char *end = NULL;
    long value = strtol(text, &end, 10);
    if (end == text || *end != '\0' || value <= 0 || value != (long) (int) value) {
        return 0;
    }
    return (int) value;
}

// Free the first n rows of a matrix built by alloc_matrix, then the row
// table itself. Accepts NULL.
static void free_matrix(double **rows, int n) {
    if (rows == NULL) {
        return;
    }
    for (int r = 0; r < n; r++) {
        free(rows[r]);
    }
    free(rows);
}

// Allocate an n x n matrix as a table of n row pointers, each pointing to n
// doubles. Returns NULL, with nothing left allocated, if any allocation fails.
static double **alloc_matrix(int n) {
    double **rows = malloc((size_t) n * sizeof *rows);
    if (rows == NULL) {
        return NULL;
    }
    for (int r = 0; r < n; r++) {
        rows[r] = malloc((size_t) n * sizeof *rows[r]);
        if (rows[r] == NULL) {
            free_matrix(rows, r);
            return NULL;
        }
    }
    return rows;
}

// Multiplies two size x size matrices of ones with an OpenMP-parallel triple
// loop, prints "<threads> <size> <seconds>", then prints a, b, and c.
//
// Usage: program <threads> <size>
// Returns 0 on success and EXIT_FAILURE on bad arguments or allocation failure.
int main(int argc, char* argv[])
{
    if (argc < 3) {
        fprintf(stderr, "usage: %s <threads> <size>\n",
                argc > 0 ? argv[0] : "matmul");
        return EXIT_FAILURE;
    }

    thread_num = parse_positive_int(argv[1]);
    size = parse_positive_int(argv[2]);
    if (thread_num == 0 || size == 0) {
        fprintf(stderr, "threads and size must be positive integers\n");
        return EXIT_FAILURE;
    }

    omp_set_num_threads(thread_num);
    srand((unsigned) time(NULL));

    double **a = alloc_matrix(size);
    double **b = alloc_matrix(size);
    double **c = alloc_matrix(size);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "out of memory for %d x %d matrices\n", size, size);
        free_matrix(a, size);
        free_matrix(b, size);
        free_matrix(c, size);
        return EXIT_FAILURE;
    }

    // Inputs are all ones and the result starts at zero, so every entry of
    // the product should equal size.
    for (int i = 0; i < size; ++i) {
        for (int j = 0; j < size; ++j) {
            a[i][j] = 1.0;
            b[i][j] = 1.0;
            c[i][j] = 0.0;
        }
    }

    struct timeval start, end;
    gettimeofday(&start, NULL);

    // Rows of c are independent, so the outer loop is split across threads.
    // The inner indices are declared inside the loop and are therefore
    // private to each thread.
#pragma omp parallel for shared(a, b, c)
    for (int i = 0; i < size; ++i) {
        for (int j = 0; j < size; ++j) {
            for (int k = 0; k < size; ++k) {
                c[i][j] += a[i][k] * b[k][j];
            }
        }
    }

    gettimeofday(&end, NULL);

    // Work in signed 64-bit microseconds so a negative tv_usec difference is
    // handled without mixing signed and unsigned operands.
    double delta = ((long long) (end.tv_sec - start.tv_sec) * 1000000LL +
                    (end.tv_usec - start.tv_usec)) / 1.e6;

    printf("%d %d %lf\n", thread_num, size, delta);

    printMat(a, size);
    printf("---------------------------------\n");
    printMat(b, size);
    printf("---------------------------------\n");
    printMat(c, size);
    printf("---------------------------------\n");

    free_matrix(a, size);
    free_matrix(b, size);
    free_matrix(c, size);

    return 0;
}


Related Solutions

Your task is to take the below code, and insert comments (using the “%” symbol) next...
Your task is to take the below code, and insert comments (using the “%” symbol) next to each line of code to make sure that you know what every line does. clc clear close all NMax = 100; partialSum = 0; exactAnswer = pi^2; for k=1:NMax partialSum = partialSum + 6/k^2; percentDiff(k) = abs(partialSum - exactAnswer)/exactAnswer*100; end NVector = [1:NMax]; plot(NVector,percentDiff); xlabel('{{Noob}}'); ylabel('% Difference');
Your task is to take the above code, and insert comments (using the “%” symbol) next...
Your task is to take the above code, and insert comments (using the “%” symbol) next to each line of code to make sure that you know what every line does. close all; clear all; clc; thetaAB = 0; angleIncrement = 0.1; numOfLoops = 360/angleIncrement; thetaABVector = [angleIncrement:angleIncrement:360]; rAB = 5; rBC = 8; for i=1:numOfLoops bothResults = SliderCrankPosn(rAB,rBC,thetaAB); rAC(i) = bothResults(1); thetaBC(i) = bothResults(2); thetaAB = thetaAB+angleIncrement; end subplot(2,1,1) plot(thetaABVector,thetaBC,'Linewidth',3); xlabel('\theta_{AB} [degrees]'); ylabel('\theta_{BC} [degrees]'); xlim([0 360]); ylim([300 400]); grid...
We can build a heap by repeatedly calling the insert function to insert the elements into...
We can build a heap by repeatedly calling the insert function to insert the elements into the heap. Here is pseudocode: buildHeap(A) h = new empty heap   for each element e in A       h.insert(e)             What is the Big-O runtime of this version of buildHeap? Justify your answer.
Write in C++ and insert comments. Input two values a and b 0 may be used...
Write in C++ and insert comments. Input two values a and b 0 may be used for False 1 may be used for True Write a program that will print the truth table for the logical operator and give the results for the specific input a, b. The following logical operators should be coded. ● conjunction (AND) ● disjunction (OR) ● conditional Statement (If a then b) ● exclusive OR (XOR) ● biconditional operation (p iff q) Example for conjunction...
Please only edit the list.cpp file only, implement the push_front method that will insert a new...
Please only edit the list.cpp file only, implement the push_front method that will insert a new element to the front of the list. //list.h // Doubly linked list #ifndef Q2_H #define Q2_H template<typename T> class List; template<typename T> class Iterator; template <typename T> class Node {    public:        Node(T element);    private:        T data;        Node* previous;        Node* next;    friend class List<T>;    friend class Iterator<T>; }; template <typename T> class List...
Recommend various staffing technologies to enhance the performance and efficiency of the staffing system.
Recommend various staffing technologies to enhance the performance and efficiency of the staffing system.
Apply some of the major approaches to motivation to enhance performance and productivity to a given...
Apply some of the major approaches to motivation to enhance performance and productivity to a given situation in a company. (low motivation and job satisfaction, increased absenteeism and tardiness, poor work performance and service, elevated stress, conflict among employees can be addressed in your answer)
The catch basin insert is a device for retrofitting catch basins to improve pollutant removal properties....
The catch basin insert is a device for retrofitting catch basins to improve pollutant removal properties. Consider the following data for one particular type of insert on x- amount filtered (1000s of liters) and y = % total suspended solids removed. Table 1 x: 23, 45, 68, 91, 114, 205, 228 y: 53, 27, 55, 34, 30, 3, 11 a. Test for a significant relationship using the F test. What is your conclusion? Use α=.05. b. Show the ANOVA table...
The catch basin insert is a device for retrofitting catch basins to improve pollutant removal properties....
The catch basin insert is a device for retrofitting catch basins to improve pollutant removal properties. Consider the following data for one particular type of insert on x- amount filtered (1000s of liters) and y = % total suspended solids removed. Table 1 a. Test for a significant relationship using the F test. What is your conclusion? Use α=.05. b. Show the ANOVA table for these data. c. Compute the coefficient of determination. Comment on the goodness of fit. x...
Identify one element of either the CPOE or CDSS you would improve that could enhance the...
Identify one element of either the CPOE or CDSS you would improve that could enhance the effectiveness of the system for that elderly patient population
ADVERTISEMENT
ADVERTISEMENT
ADVERTISEMENT