In: Computer Science
We see that this program computes the product of two matrices on the GPU. Add a new function, called `sum`, that computes the element-wise sum of the two matrices.
#include <stdio.h>
#include <math.h>
#include <sys/time.h>
#define TILE_WIDTH 2
#define WIDTH 6
// Kernel executed by the device (GPU): each thread computes one element
// of C = A * B for n x n row-major matrices.  Launch with a 2-D
// grid/block covering at least n x n threads; threads outside the
// matrix exit via the bounds check.
__global__ void
product (float *d_a, float *d_b, float *d_c, const int n) {
    const int c = blockIdx.x * blockDim.x + threadIdx.x;
    const int r = blockIdx.y * blockDim.y + threadIdx.y;

    if (r >= n || c >= n)
        return;

    // Dot product of row r of A with column c of B.
    float acc = 0;
    for (int k = 0; k < n; ++k)
        acc += d_a[r * n + k] * d_b[k * n + c];
    d_c[r * n + c] = acc;
}
// Utility: print a WIDTH x WIDTH matrix, one row per line, values
// shown as truncated integers separated by tabs.
void printMatrix (float m[][WIDTH]) {
    for (int row = 0; row < WIDTH; ++row) {
        for (int col = 0; col < WIDTH; ++col)
            printf ("%d\t", (int) m[row][col]);
        printf ("\n");
    }
}
// Main function executed by the host (CPU).
// Multiplies two random WIDTH x WIDTH matrices on the GPU and prints
// the inputs and the product.  Returns 0 on success, 1 on CUDA error.
int main () {
    // Host matrices (row-major and contiguous, so each can be copied
    // to the device as a flat array of WIDTH*WIDTH floats).
    float host_a[WIDTH][WIDTH],
          host_b[WIDTH][WIDTH],
          host_c[WIDTH][WIDTH];
    // Device buffers.
    float *device_a, *device_b, *device_c;
    cudaError_t err;

    // Initialize host matrices with random integer values in [0, 50).
    time_t t;
    srand ((unsigned) time (&t));
    for (int i = 0; i < WIDTH; ++i) {
        for (int j = 0; j < WIDTH; ++j) {
            host_a[i][j] = (float) (rand () % 50);
            host_b[i][j] = (float) (rand () % 50);
        }
    }

    printf ("Matrix A:\n");
    printMatrix (host_a);
    printf ("\n");
    printf ("Matrix B:\n");
    printMatrix (host_b);
    printf ("\n");

    // Allocate device memory for the two inputs and the result.
    // (Fixed: a stray line-wrapped comment here previously left the
    // bare token "result" in the code, which did not compile.)
    size_t deviceSize = WIDTH * WIDTH * sizeof (float);
    if (cudaMalloc ((void **) &device_a, deviceSize) != cudaSuccess ||
        cudaMalloc ((void **) &device_b, deviceSize) != cudaSuccess ||
        cudaMalloc ((void **) &device_c, deviceSize) != cudaSuccess) {
        fprintf (stderr, "cudaMalloc failed\n");
        return 1;
    }

    // Copy the input matrices to the device.
    cudaMemcpy (device_a, host_a, deviceSize, cudaMemcpyHostToDevice);
    cudaMemcpy (device_b, host_b, deviceSize, cudaMemcpyHostToDevice);

    // One thread per output element: WIDTH x WIDTH threads per block,
    // (WIDTH/TILE_WIDTH)^2 blocks.  NOTE(review): this launches more
    // threads than there are matrix elements; the kernel's bounds
    // check makes the extra threads harmless.
    dim3 dimBlock (WIDTH, WIDTH);
    dim3 dimGrid (WIDTH / TILE_WIDTH, WIDTH / TILE_WIDTH);
    product<<<dimGrid, dimBlock>>> (device_a, device_b, device_c, WIDTH);

    // Kernel launches are asynchronous and do not return an error
    // code; check launch errors and then execution errors explicitly.
    err = cudaGetLastError ();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize ();
    if (err != cudaSuccess) {
        fprintf (stderr, "kernel failed: %s\n", cudaGetErrorString (err));
        return 1;
    }

    // Copy the result back to the host (cudaMemcpy blocks until the
    // copy completes, so host_c is valid immediately afterwards).
    cudaMemcpy (host_c, device_c, deviceSize, cudaMemcpyDeviceToHost);

    // Output the computed result matrix.
    printf ("A x B: \n");
    printMatrix (host_c);

    cudaFree (device_a);
    cudaFree (device_b);
    cudaFree (device_c);
    return 0;
}
// Utility: compute the element-wise sum of two WIDTH x WIDTH matrices
// on the host and print the result with printMatrix.
// Fixes to the original, which did not compile:
//   - "width" was undefined (the macro is WIDTH);
//   - "a"/"b" were undefined (the parameters are named m and n);
//   - a missing semicolon after the addition;
//   - printMatrix(sum[WIDTH][WIDTH]) indexed a single float out of
//     bounds instead of passing the whole matrix;
//   - the local array shadowed the function's own name.
void sum (float m[WIDTH][WIDTH], float n[WIDTH][WIDTH]) {
    // Result matrix: result[i][j] = m[i][j] + n[i][j].
    float result[WIDTH][WIDTH];
    for (int i = 0; i < WIDTH; ++i) {
        for (int j = 0; j < WIDTH; ++j) {
            result[i][j] = m[i][j] + n[i][j];
        }
    }
    printMatrix (result);
}
Explanation: the sum of two matrices is computed by adding the elements at corresponding positions. That is, element (i, j) of the first matrix is added to element (i, j) of the second — for example, with 3x3 matrices, the (2, 2) entry of the first matrix is added to the (2, 2) entry of the second. Each pairwise sum is stored at the same position in a new result matrix, and once every element has been added, the result is printed using the printMatrix function already defined in the example.