PDC
Assignment No. 03
Submitted To:
Dr. Qamas Gull
Submitted By:
Aqib Sharafat (21-CS-51)
Section: C
Source Code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <time.h>
#include <chrono>
#include <iostream>
#include <math.h>
// CPU matrix multiplication
void matrixMultCPU(float *A, float *B, float *C, int N) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            float sum = 0.0f;
            for (int k = 0; k < N; k++) {
                sum += A[i * N + k] * B[k * N + j];
            }
            C[i * N + j] = sum;
        }
    }
}
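// Complexity note (added remark, not part of the original listing): the
// serial triple loop above performs 2*N^3 floating-point operations (one
// multiply and one add per term); for N = 1024 that is roughly 2.1 billion
// operations, which is why the CPU baseline dominates total runtime at the
// larger test sizes.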
// CUDA kernel for matrix multiplication (one thread per output element)
__global__ void matrixMultCUDA(float *A, float *B, float *C, int N) {
    // Calculate global row and column indices
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    // Check if thread is within matrix bounds
    if (row < N && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < N; k++) {
            sum += A[row * N + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}
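// A minimal error-checking helper, added here as an illustrative sketch
// (it is not used by the rest of the listing): every CUDA runtime call
// returns an error code that the program otherwise ignores, and wrapping
// calls like CUDA_CHECK(cudaMalloc((void**)&d_A, matrixSize)) makes
// failures visible instead of silently producing wrong results.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                   \
                    cudaGetErrorString(err_), __FILE__, __LINE__);        \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)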
// Optimized CUDA kernel using shared memory
__global__ void matrixMultCUDAOptimized(float *A, float *B, float *C, int N) {
    const int TILE_SIZE = 16; // Size of tile for shared memory
    __shared__ float sharedA[TILE_SIZE][TILE_SIZE];
    __shared__ float sharedB[TILE_SIZE][TILE_SIZE];
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;
    // Iterate over tiles
    for (int t = 0; t < (N + TILE_SIZE - 1) / TILE_SIZE; t++) {
        // Load tiles into shared memory, padding with zeros outside the matrix
        if (row < N && t * TILE_SIZE + threadIdx.x < N)
            sharedA[threadIdx.y][threadIdx.x] = A[row * N + t * TILE_SIZE + threadIdx.x];
        else
            sharedA[threadIdx.y][threadIdx.x] = 0.0f;
        if (col < N && t * TILE_SIZE + threadIdx.y < N)
            sharedB[threadIdx.y][threadIdx.x] = B[(t * TILE_SIZE + threadIdx.y) * N + col];
        else
            sharedB[threadIdx.y][threadIdx.x] = 0.0f;
        __syncthreads(); // Wait for all threads to finish loading
        // Compute partial dot product using current tile
        for (int k = 0; k < TILE_SIZE; k++) {
            sum += sharedA[threadIdx.y][k] * sharedB[k][threadIdx.x];
        }
        __syncthreads(); // Wait for all threads to finish computing
    }
    // Store result
    if (row < N && col < N) {
        C[row * N + col] = sum;
    }
}
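// How the tiling helps (explanatory note added to the listing): each thread
// block stages one 16x16 tile of A and one of B in shared memory, so a
// global-memory element is loaded once per block rather than once per
// thread that needs it. The tile loop above runs ceil(N / TILE_SIZE)
// passes (64 passes for N = 1024), cutting global-memory traffic by
// roughly a factor of TILE_SIZE (16x) compared with the naive kernel.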
// Function to initialize matrices with random values
void initializeMatrix(float *matrix, int N) {
    for (int i = 0; i < N * N; i++) {
        matrix[i] = static_cast<float>(rand()) / RAND_MAX;
    }
}
// Function to verify results
bool verifyResults(float *cpuResults, float *gpuResults, int N) {
    // Tolerance for floating-point comparison. Rounding differences between
    // CPU and GPU accumulate over N-term dot products, so a relative
    // tolerance is used; a tight absolute epsilon would flag false
    // mismatches at the larger sizes.
    const float epsilon = 1e-3f;
    for (int i = 0; i < N * N; i++) {
        float diff = fabsf(cpuResults[i] - gpuResults[i]);
        if (diff > epsilon * fmaxf(fabsf(cpuResults[i]), 1.0f)) {
            printf("Error: Results don't match at index %d. CPU: %f, GPU: %f\n",
                   i, cpuResults[i], gpuResults[i]);
            return false;
        }
    }
    return true;
}
int main(int argc, char *argv[]) {
    // Set random seed
    srand(time(NULL));
    // Matrix sizes to test
    int sizes[] = {256, 512, 1024};
    int numSizes = sizeof(sizes) / sizeof(sizes[0]);
    printf("Matrix Size, CPU Time (ms), GPU Time (ms), Speedup\n");
    for (int s = 0; s < numSizes; s++) {
        int N = sizes[s];
        size_t matrixSize = N * N * sizeof(float);
        // Allocate host memory
        float *h_A = (float*)malloc(matrixSize);
        float *h_B = (float*)malloc(matrixSize);
        float *h_C_CPU = (float*)malloc(matrixSize);
        float *h_C_GPU = (float*)malloc(matrixSize);
        // Initialize matrices
        initializeMatrix(h_A, N);
        initializeMatrix(h_B, N);
        // Allocate device memory
        float *d_A, *d_B, *d_C;
        cudaMalloc((void**)&d_A, matrixSize);
        cudaMalloc((void**)&d_B, matrixSize);
        cudaMalloc((void**)&d_C, matrixSize);
        // Copy data to device
        cudaMemcpy(d_A, h_A, matrixSize, cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, h_B, matrixSize, cudaMemcpyHostToDevice);
        // CPU matrix multiplication, timed with std::chrono
        auto cpu_start = std::chrono::high_resolution_clock::now();
        matrixMultCPU(h_A, h_B, h_C_CPU, N);
        auto cpu_end = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double, std::milli> cpu_time = cpu_end - cpu_start;
        // GPU matrix multiplication: 16x16 blocks, matching TILE_SIZE in the kernel
        dim3 threadsPerBlock(16, 16);
        dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (N + threadsPerBlock.y - 1) / threadsPerBlock.y);
        // Record GPU start time
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start);
        // Launch the shared-memory kernel (the naive matrixMultCUDA above is kept for reference)
        matrixMultCUDAOptimized<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
        // Record GPU end time
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        float gpu_time_ms = 0;
        cudaEventElapsedTime(&gpu_time_ms, start, stop);
        // Copy result back to host
        cudaMemcpy(h_C_GPU, d_C, matrixSize, cudaMemcpyDeviceToHost);
        // Verify results
        bool resultsMatch = verifyResults(h_C_CPU, h_C_GPU, N);
        // Calculate speedup
        float speedup = cpu_time.count() / gpu_time_ms;
        // Print results
        printf("%d, %.2f, %.2f, %.2f\n", N, cpu_time.count(), gpu_time_ms, speedup);
        if (!resultsMatch) {
            printf("ERROR: Results do not match!\n");
        }
        // Free memory
        free(h_A); free(h_B);
        free(h_C_CPU); free(h_C_GPU);
        cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }
    return 0;
}
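Compilation & Execution:
The listing above is a single translation unit. Assuming it is saved as matmul.cu (a filename chosen here for illustration), it can be built and run with the NVIDIA compiler:

nvcc -O2 -o matmul matmul.cu
./matmul

The program prints one CSV row per matrix size in the format declared by its header line ("Matrix Size, CPU Time (ms), GPU Time (ms), Speedup"); the actual timings and speedups depend on the host CPU and GPU used.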