
PDC

Assignment No. 03

Submitted To:

Doctor Qamas Gull

Submitted By:

Aqib Sharafat (21-CS-51)

Section: C

Source Code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <time.h>
#include <chrono>
#include <iostream>
#include <math.h>

// CPU matrix multiplication (reference implementation)
void matrixMultCPU(float *A, float *B, float *C, int N) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            float sum = 0.0f;
            for (int k = 0; k < N; k++) {
                sum += A[i * N + k] * B[k * N + j];
            }
            C[i * N + j] = sum;
        }
    }
}

// CUDA kernel for matrix multiplication (naive version)
__global__ void matrixMultCUDA(float *A, float *B, float *C, int N) {
    // Calculate global row and column indices
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Check if thread is within matrix bounds
    if (row < N && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < N; k++) {
            sum += A[row * N + k] * B[k * N + col]; // Fixed: B[k * N + col] instead of B[k * N + j]
        }
        C[row * N + col] = sum;
    }
}

// Optimized CUDA kernel using shared memory (tiled multiplication)
__global__ void matrixMultCUDAOptimized(float *A, float *B, float *C, int N) {
    const int TILE_SIZE = 16; // Size of tile for shared memory

    __shared__ float sharedA[TILE_SIZE][TILE_SIZE];
    __shared__ float sharedB[TILE_SIZE][TILE_SIZE];

    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    float sum = 0.0f;

    // Iterate over tiles
    for (int t = 0; t < (N + TILE_SIZE - 1) / TILE_SIZE; t++) {
        // Load tiles into shared memory, padding with zeros outside the matrix
        if (row < N && t * TILE_SIZE + threadIdx.x < N)
            sharedA[threadIdx.y][threadIdx.x] = A[row * N + t * TILE_SIZE + threadIdx.x];
        else
            sharedA[threadIdx.y][threadIdx.x] = 0.0f;

        if (col < N && t * TILE_SIZE + threadIdx.y < N)
            sharedB[threadIdx.y][threadIdx.x] = B[(t * TILE_SIZE + threadIdx.y) * N + col];
        else
            sharedB[threadIdx.y][threadIdx.x] = 0.0f;

        __syncthreads(); // Wait for all threads to finish loading

        // Compute partial dot product using the current tile
        for (int k = 0; k < TILE_SIZE; k++) {
            sum += sharedA[threadIdx.y][k] * sharedB[k][threadIdx.x];
        }

        __syncthreads(); // Wait for all threads to finish computing
    }

    // Store result
    if (row < N && col < N) {
        C[row * N + col] = sum;
    }
}

// Function to initialize a matrix with random values in [0, 1]
void initializeMatrix(float *matrix, int N) {
    for (int i = 0; i < N * N; i++) {
        matrix[i] = static_cast<float>(rand()) / RAND_MAX;
    }
}

// Function to verify that CPU and GPU results agree
bool verifyResults(float *cpuResults, float *gpuResults, int N) {
    const float epsilon = 1e-5; // Tolerance for floating-point comparison
    for (int i = 0; i < N * N; i++) {
        if (fabs(cpuResults[i] - gpuResults[i]) > epsilon) {
            printf("Error: Results don't match at index %d. CPU: %f, GPU: %f\n",
                   i, cpuResults[i], gpuResults[i]);
            return false;
        }
    }
    return true;
}

int main(int argc, char *argv[]) {
    // Set random seed
    srand(time(NULL));

    // Matrix sizes to test
    int sizes[] = {256, 512, 1024};
    int numSizes = sizeof(sizes) / sizeof(sizes[0]);

    printf("Matrix Size, CPU Time (ms), GPU Time (ms), Speedup\n");

    for (int s = 0; s < numSizes; s++) {
        int N = sizes[s];
        size_t matrixSize = N * N * sizeof(float);

        // Allocate host memory
        float *h_A = (float*)malloc(matrixSize);
        float *h_B = (float*)malloc(matrixSize);
        float *h_C_CPU = (float*)malloc(matrixSize);
        float *h_C_GPU = (float*)malloc(matrixSize);

        // Initialize input matrices
        initializeMatrix(h_A, N);
        initializeMatrix(h_B, N);

        // Allocate device memory
        float *d_A, *d_B, *d_C;
        cudaMalloc((void**)&d_A, matrixSize);
        cudaMalloc((void**)&d_B, matrixSize);
        cudaMalloc((void**)&d_C, matrixSize);

        // Copy input data to device
        cudaMemcpy(d_A, h_A, matrixSize, cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, h_B, matrixSize, cudaMemcpyHostToDevice);

        // CPU matrix multiplication (timed with std::chrono)
        auto cpu_start = std::chrono::high_resolution_clock::now();
        matrixMultCPU(h_A, h_B, h_C_CPU, N);
        auto cpu_end = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double, std::milli> cpu_time = cpu_end - cpu_start;

        // GPU matrix multiplication: launch configuration
        dim3 threadsPerBlock(16, 16);
        dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (N + threadsPerBlock.y - 1) / threadsPerBlock.y);

        // Record GPU start time
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start);

        // Launch the optimized (shared-memory) kernel
        matrixMultCUDAOptimized<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);

        // Record GPU end time
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        float gpu_time_ms = 0;
        cudaEventElapsedTime(&gpu_time_ms, start, stop);

        // Copy result back to host
        cudaMemcpy(h_C_GPU, d_C, matrixSize, cudaMemcpyDeviceToHost);

        // Verify results
        bool resultsMatch = verifyResults(h_C_CPU, h_C_GPU, N);

        // Calculate speedup
        float speedup = cpu_time.count() / gpu_time_ms;

        // Print results as a CSV row
        printf("%d, %.2f, %.2f, %.2f\n", N, cpu_time.count(), gpu_time_ms, speedup);

        if (!resultsMatch) {
            printf("ERROR: Results do not match!\n");
        }

        // Free host and device memory
        free(h_A);
        free(h_B);
        free(h_C_CPU);
        free(h_C_GPU);
        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    return 0;
}
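
Build and run note (a minimal sketch, not part of the submitted source): assuming the listing above is saved in a hypothetical file named assignment3.cu and the CUDA toolkit is installed, it can be compiled with nvcc and run from the command line; the appropriate -arch flag depends on the target GPU.

    nvcc -O2 -o assignment3 assignment3.cu
    ./assignment3

The program prints the CSV header shown in main(), then one row per matrix size (256, 512, 1024) with the CPU time, the GPU time, and the resulting speedup, and reports an error if the CPU and GPU results differ by more than the tolerance used in verifyResults().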
