PDC
Assignment No. 03
Submitted To:
Dr. Qamas Gull
Submitted By:
Aqib Sharafat (21-CS-51)
Section: C
Source Code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <time.h>
#include <chrono>
#include <iostream>
#include <math.h>
// CPU matrix multiplication
void matrixMultCPU(float *A, float *B, float *C, int N) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            float sum = 0.0f;
            for (int k = 0; k < N; k++) {
                sum += A[i * N + k] * B[k * N + j];
            }
            C[i * N + j] = sum;
        }
    }
}
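// Complexity note (added remark, not part of the original listing): the
// serial triple loop above performs 2*N^3 floating-point operations (one
// multiply and one add per term); for N = 1024 that is roughly 2.1 billion
// operations, which is why the CPU baseline dominates total runtime at the
// larger test sizes.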
// CUDA kernel for matrix multiplication (one thread per output element)
__global__ void matrixMultCUDA(float *A, float *B, float *C, int N) {
    // Calculate global row and column indices
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    // Check if thread is within matrix bounds
    if (row < N && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < N; k++) {
            sum += A[row * N + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}
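// A minimal error-checking helper, added here as an illustrative sketch
// (it is not used by the rest of the listing): every CUDA runtime call
// returns an error code that the program otherwise ignores, and wrapping
// calls like CUDA_CHECK(cudaMalloc((void**)&d_A, matrixSize)) makes
// failures visible instead of silently producing wrong results.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                   \
                    cudaGetErrorString(err_), __FILE__, __LINE__);        \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)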
// Optimized CUDA kernel using shared memory
__global__ void matrixMultCUDAOptimized(float *A, float *B, float *C, int N) {
    const int TILE_SIZE = 16; // Size of tile for shared memory
    __shared__ float sharedA[TILE_SIZE][TILE_SIZE];
    __shared__ float sharedB[TILE_SIZE][TILE_SIZE];
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;
    // Iterate over tiles
    for (int t = 0; t < (N + TILE_SIZE - 1) / TILE_SIZE; t++) {
        // Load tiles into shared memory, padding with zeros outside the matrix
        if (row < N && t * TILE_SIZE + threadIdx.x < N)
            sharedA[threadIdx.y][threadIdx.x] = A[row * N + t * TILE_SIZE + threadIdx.x];
        else
            sharedA[threadIdx.y][threadIdx.x] = 0.0f;
        if (col < N && t * TILE_SIZE + threadIdx.y < N)
            sharedB[threadIdx.y][threadIdx.x] = B[(t * TILE_SIZE + threadIdx.y) * N + col];
        else
            sharedB[threadIdx.y][threadIdx.x] = 0.0f;
        __syncthreads(); // Wait for all threads to finish loading
        // Compute partial dot product using current tile
        for (int k = 0; k < TILE_SIZE; k++) {
            sum += sharedA[threadIdx.y][k] * sharedB[k][threadIdx.x];
        }
        __syncthreads(); // Wait for all threads to finish computing
    }
    // Store result
    if (row < N && col < N) {
        C[row * N + col] = sum;
    }
}
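// How the tiling helps (explanatory note added to the listing): each thread
// block stages one 16x16 tile of A and one of B in shared memory, so a
// global-memory element is loaded once per block rather than once per
// thread that needs it. The tile loop above runs ceil(N / TILE_SIZE)
// passes (64 passes for N = 1024), cutting global-memory traffic by
// roughly a factor of TILE_SIZE (16x) compared with the naive kernel.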
// Function to initialize matrices with random values
void initializeMatrix(float *matrix, int N) {
    for (int i = 0; i < N * N; i++) {
        matrix[i] = static_cast<float>(rand()) / RAND_MAX;
    }
}
// Function to verify results
bool verifyResults(float *cpuResults, float *gpuResults, int N) {
    // Tolerance for floating-point comparison. Rounding differences between
    // CPU and GPU accumulate over N-term dot products, so a relative
    // tolerance is used; a tight absolute epsilon would flag false
    // mismatches at the larger sizes.
    const float epsilon = 1e-3f;
    for (int i = 0; i < N * N; i++) {
        float diff = fabsf(cpuResults[i] - gpuResults[i]);
        if (diff > epsilon * fmaxf(fabsf(cpuResults[i]), 1.0f)) {
            printf("Error: Results don't match at index %d. CPU: %f, GPU: %f\n",
                   i, cpuResults[i], gpuResults[i]);
            return false;
        }
    }
    return true;
}
int main(int argc, char *argv[]) {
    // Set random seed
    srand(time(NULL));
    // Matrix sizes to test
    int sizes[] = {256, 512, 1024};
    int numSizes = sizeof(sizes) / sizeof(sizes[0]);
    printf("Matrix Size, CPU Time (ms), GPU Time (ms), Speedup\n");
    for (int s = 0; s < numSizes; s++) {
        int N = sizes[s];
        size_t matrixSize = N * N * sizeof(float);
        // Allocate host memory
        float *h_A = (float*)malloc(matrixSize);
        float *h_B = (float*)malloc(matrixSize);
        float *h_C_CPU = (float*)malloc(matrixSize);
        float *h_C_GPU = (float*)malloc(matrixSize);
        // Initialize matrices
        initializeMatrix(h_A, N);
        initializeMatrix(h_B, N);
        // Allocate device memory
        float *d_A, *d_B, *d_C;
        cudaMalloc((void**)&d_A, matrixSize);
        cudaMalloc((void**)&d_B, matrixSize);
        cudaMalloc((void**)&d_C, matrixSize);
        // Copy data to device
        cudaMemcpy(d_A, h_A, matrixSize, cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, h_B, matrixSize, cudaMemcpyHostToDevice);
        // CPU matrix multiplication, timed with std::chrono
        auto cpu_start = std::chrono::high_resolution_clock::now();
        matrixMultCPU(h_A, h_B, h_C_CPU, N);
        auto cpu_end = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double, std::milli> cpu_time = cpu_end - cpu_start;
        // GPU matrix multiplication: 16x16 blocks, matching TILE_SIZE in the kernel
        dim3 threadsPerBlock(16, 16);
        dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (N + threadsPerBlock.y - 1) / threadsPerBlock.y);
        // Record GPU start time
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start);
        // Launch the shared-memory kernel (the naive matrixMultCUDA above is kept for reference)
        matrixMultCUDAOptimized<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
        // Record GPU end time
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        float gpu_time_ms = 0;
        cudaEventElapsedTime(&gpu_time_ms, start, stop);
        // Copy result back to host
        cudaMemcpy(h_C_GPU, d_C, matrixSize, cudaMemcpyDeviceToHost);
        // Verify results
        bool resultsMatch = verifyResults(h_C_CPU, h_C_GPU, N);
        // Calculate speedup
        float speedup = cpu_time.count() / gpu_time_ms;
        // Print results
        printf("%d, %.2f, %.2f, %.2f\n", N, cpu_time.count(), gpu_time_ms, speedup);
        if (!resultsMatch) {
            printf("ERROR: Results do not match!\n");
        }
        // Free memory
        free(h_A); free(h_B);
        free(h_C_CPU); free(h_C_GPU);
        cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }
    return 0;
}
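Compilation & Execution:
The listing above is a single translation unit. Assuming it is saved as matmul.cu (a filename chosen here for illustration), it can be built and run with the NVIDIA compiler:

nvcc -O2 -o matmul matmul.cu
./matmul

The program prints one CSV row per matrix size in the format declared by its header line ("Matrix Size, CPU Time (ms), GPU Time (ms), Speedup"); the actual timings and speedups depend on the host CPU and GPU used.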