HPC Codes

The document contains multiple code implementations using OpenMP and CUDA for various algorithms including BFS, DFS, bubble sort, merge sort, and vector addition. Each section provides code snippets along with example outputs and instructions for compiling and running the programs. Additionally, it includes installation commands for the necessary CUDA toolkit.


Code to implement BFS using OpenMP:

#include <iostream>
#include <stdlib.h>
#include <queue>
#include <omp.h>
using namespace std;

class node
{
public:
    node *left, *right;
    int data;
};

class Breadthfs
{
public:
    node *insert(node *, int);
    void bfs(node *);
};

node *insert(node *root, int data)
// inserts a node into the first empty child slot found in level order
{
    if(!root)
    {
        root = new node;
        root->left = NULL;
        root->right = NULL;
        root->data = data;
        return root;
    }

    queue<node *> q;
    q.push(root);

    while(!q.empty())
    {
        node *temp = q.front();
        q.pop();

        if(temp->left == NULL)
        {
            temp->left = new node;
            temp->left->left = NULL;
            temp->left->right = NULL;
            temp->left->data = data;
            return root;
        }
        else
        {
            q.push(temp->left);
        }

        if(temp->right == NULL)
        {
            temp->right = new node;
            temp->right->left = NULL;
            temp->right->right = NULL;
            temp->right->data = data;
            return root;
        }
        else
        {
            q.push(temp->right);
        }
    }
    return root;
}

void bfs(node *head)
// level-order traversal; each level is processed by a parallel for loop
{
    queue<node *> q;
    q.push(head);

    int qSize;

    while(!q.empty())
    {
        qSize = q.size();
        #pragma omp parallel for   // creates parallel threads for the current level
        for(int i = 0; i < qSize; i++)
        {
            node *currNode;
            #pragma omp critical   // only one thread at a time may pop the shared queue
            {
                currNode = q.front();
                q.pop();
                cout << "\t" << currNode->data;   // print the current node
            }
            #pragma omp critical   // push the node's children for the next level
            {
                if(currNode->left)
                    q.push(currNode->left);
                if(currNode->right)
                    q.push(currNode->right);
            }
        }
    }
}

int main()
{
    node *root = NULL;
    int data;
    char ans;

    do
    {
        cout << "\n enter data=>";
        cin >> data;

        root = insert(root, data);

        cout << "do you want to insert one more node?";
        cin >> ans;

    } while(ans == 'y' || ans == 'Y');

    bfs(root);

    return 0;
}

Run Commands:
1) g++ -fopenmp bfs.cpp -o bfs
2) ./bfs

Output:

Enter data => 5
Do you want to insert one more node? (y/n) y
Enter data => 3
Do you want to insert one more node? (y/n) y
Enter data => 2
Do you want to insert one more node? (y/n) y
Enter data => 1
Do you want to insert one more node? (y/n) y
Enter data => 7
Do you want to insert one more node? (y/n) y
Enter data => 8
Do you want to insert one more node? (y/n) n
5   3   7   2   1   8
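
Because both critical sections above serialize access to the shared queue, a level-synchronous variant is a common alternative. The following is a minimal sketch (not part of the original program; bfs_levels is a hypothetical name) that could be dropped into the same file, with #include <vector> added, so that each level sits in a vector the parallel loop can index directly:

void bfs_levels(node *head)
{
    vector<node *> level;
    if(head) level.push_back(head);

    while(!level.empty())
    {
        vector<node *> next;
        #pragma omp parallel for
        for(int i = 0; i < (int)level.size(); i++)
        {
            node *curr = level[i];
            #pragma omp critical   // cout and the shared next-level vector still need protection
            {
                cout << "\t" << curr->data;
                if(curr->left)  next.push_back(curr->left);
                if(curr->right) next.push_back(curr->right);
            }
        }
        level = next;   // move on to the next level
    }
}
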
Code to implement DFS using OpenMP:

#include <iostream>
#include <vector>
#include <stack>
#include <omp.h>

using namespace std;

const int MAX = 100000;

vector<int> graph[MAX];
bool visited[MAX];

void dfs(int node)
{
    stack<int> s;
    s.push(node);

    while (!s.empty())
    {
        int curr_node = s.top();
        s.pop();

        if (!visited[curr_node])
        {
            visited[curr_node] = true;
            cout << curr_node << " ";

            #pragma omp parallel for
            for (int i = 0; i < (int)graph[curr_node].size(); i++)
            {
                int adj_node = graph[curr_node][i];
                if (!visited[adj_node])
                {
                    #pragma omp critical   // only one thread at a time may push the shared stack
                    s.push(adj_node);
                }
            }
        }
    }
}

int main()
{
    int n, m, start_node;
    cout << "Enter No of Nodes, Edges, and start node: ";
    cin >> n >> m >> start_node;
    // n: nodes, m: edges

    cout << "Enter Pairs of edges: ";
    for (int i = 0; i < m; i++)
    {
        int u, v;
        cin >> u >> v;
        // u and v: endpoints of an edge
        graph[u].push_back(v);
        graph[v].push_back(u);
    }

    #pragma omp parallel for
    for (int i = 0; i < n; i++)
    {
        visited[i] = false;
    }

    dfs(start_node);

    /* for (int i = 0; i < n; i++)
    {
        if (visited[i])
        {
            cout << i << " ";
        }
    } */
    return 0;
}
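
Run Commands (assuming the file is saved as dfs.cpp):
1) g++ -fopenmp dfs.cpp -o dfs
2) ./dfs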

Output:
Code to Implement parallel bubble sort using OpenMP:

#include <iostream>
#include <stdlib.h>
#include <omp.h>
using namespace std;

void bubble(int *, int);    // pointer and size as arguments
void swap(int &, int &);    // references as parameters

void bubble(int *a, int n)  // a: array of integers, n: size of the array
{
    // odd-even transposition sort: alternate passes over even and odd pairs
    for(int i = 0; i < n; i++)
    {
        int first = i % 2;

        #pragma omp parallel for shared(a, first)
        for(int j = first; j < n - 1; j += 2)
        {
            if(a[j] > a[j + 1])
            {
                swap(a[j], a[j + 1]);
            }
        }
    }
}

void swap(int &a, int &b)
{
    int test;
    test = a;
    a = b;
    b = test;
}

int main()
{
    int *a, n;
    cout << "\n Enter total no of elements=>";
    cin >> n;
    a = new int[n];
    cout << "\n Enter elements=>";
    for(int i = 0; i < n; i++)
    {
        cin >> a[i];
    }

    bubble(a, n);

    cout << "\n Sorted array is=>";
    for(int i = 0; i < n; i++)
    {
        cout << a[i] << endl;
    }

    return 0;
}
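
Run Commands (assuming the file is saved as bubble.cpp):
1) g++ -fopenmp bubble.cpp -o bubble
2) ./bubble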

Output:

Enter total no of elements=> 6

Enter elements=> 4
6
3
1
2
5
Sorted array is=> 1
2
3
4
5
6
Code to Implement parallel Merge sort using OpenMP:

#include <iostream>
#include <stdlib.h>
#include <omp.h>
using namespace std;

void mergesort(int a[], int i, int j);
void merge(int a[], int i1, int j1, int i2, int j2);

void mergesort(int a[], int i, int j)
{
    int mid;
    if(i < j)
    {
        mid = (i + j) / 2;

        #pragma omp parallel sections   // sort the two halves in parallel
        {
            #pragma omp section
            {
                mergesort(a, i, mid);
            }

            #pragma omp section
            {
                mergesort(a, mid + 1, j);
            }
        }

        merge(a, i, mid, mid + 1, j);
    }
}

void merge(int a[], int i1, int j1, int i2, int j2)
{
    int temp[1000];
    int i, j, k;
    i = i1;
    j = i2;
    k = 0;

    while(i <= j1 && j <= j2)   // copy the smaller of the two front elements
    {
        if(a[i] < a[j])
        {
            temp[k++] = a[i++];
        }
        else
        {
            temp[k++] = a[j++];
        }
    }

    while(i <= j1)   // copy any remaining elements of the first half
    {
        temp[k++] = a[i++];
    }

    while(j <= j2)   // copy any remaining elements of the second half
    {
        temp[k++] = a[j++];
    }

    for(i = i1, j = 0; i <= j2; i++, j++)
    {
        a[i] = temp[j];
    }
}

int main()
{
    int *a, n, i;
    cout << "\n enter total no of elements=>";
    cin >> n;
    a = new int[n];
    cout << "\n enter elements=>";
    for(i = 0; i < n; i++)
    {
        cin >> a[i];
    }
//  start = .......
//  #pragma omp .....
    mergesort(a, 0, n - 1);
//  stop = .......
    cout << "\n sorted array is=>";
    for(i = 0; i < n; i++)
    {
        cout << "\n" << a[i];
    }
    // cout << stop - start
    return 0;
}
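
One way to fill in the commented timing placeholders above is with omp_get_wtime(), which returns wall-clock time in seconds. A minimal sketch (the names start and stop are illustrative, not from the original):

    double start = omp_get_wtime();   // record wall-clock time before sorting
    mergesort(a, 0, n - 1);
    double stop = omp_get_wtime();    // record wall-clock time after sorting
    cout << "\n time taken: " << (stop - start) << " seconds";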

Output:

enter total no of elements=>5

enter elements=>3
6
5
2
4

sorted array is=>


2
3
4
5
6
Code to Implement Min, Max, Sum and Average operations using Parallel Reduction:

#include <iostream>
//#include <vector>
#include <omp.h>
#include <climits>
using namespace std;

void min_reduction(int arr[], int n) {
    int min_value = INT_MAX;
    #pragma omp parallel for reduction(min: min_value)
    for (int i = 0; i < n; i++) {
        if (arr[i] < min_value) {
            min_value = arr[i];
        }
    }
    cout << "Minimum value: " << min_value << endl;
}

void max_reduction(int arr[], int n) {
    int max_value = INT_MIN;
    #pragma omp parallel for reduction(max: max_value)
    for (int i = 0; i < n; i++) {
        if (arr[i] > max_value) {
            max_value = arr[i];
        }
    }
    cout << "Maximum value: " << max_value << endl;
}

void sum_reduction(int arr[], int n) {
    int sum = 0;
    #pragma omp parallel for reduction(+: sum)
    for (int i = 0; i < n; i++) {
        sum += arr[i];
    }
    cout << "Sum: " << sum << endl;
}

void average_reduction(int arr[], int n) {
    int sum = 0;
    #pragma omp parallel for reduction(+: sum)
    for (int i = 0; i < n; i++) {
        sum += arr[i];
    }
    cout << "Average: " << (double)sum / n << endl;
}

int main() {
    int *arr, n;
    cout << "\n Enter total no of elements=>";
    cin >> n;
    arr = new int[n];
    cout << "\n Enter elements=>";
    for (int i = 0; i < n; i++) {
        cin >> arr[i];
    }

    // int arr[] = {5, 2, 9, 1, 7, 6, 8, 3, 4};
    // int n = size(arr);

    min_reduction(arr, n);
    max_reduction(arr, n);
    sum_reduction(arr, n);
    average_reduction(arr, n);
    return 0;
}

Output:
Compile: g++ -fopenmp min.cpp -o min
Run: ./min

Enter total no of elements=>6

Enter elements=>2
8
6
9
5
4
Minimum value: 2
Maximum value: 9
Sum: 34
Average: 5.66667
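
For reference, the reduction(+: sum) clause is roughly equivalent to giving every thread its own private partial sum and combining the partials at the end, as in this hand-written sketch (illustrative only, not part of the program above; it uses the same arr and n):

    int sum = 0;
    #pragma omp parallel
    {
        int local_sum = 0;          // private partial sum for each thread
        #pragma omp for
        for (int i = 0; i < n; i++) {
            local_sum += arr[i];
        }
        #pragma omp critical        // combine the partial sums one thread at a time
        sum += local_sum;
    }
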
CUDA Program for Addition of Two Large Vectors:

#include <stdio.h>
#include <stdlib.h>

// CUDA kernel for vector addition
__global__ void vectorAdd(int *a, int *b, int *c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}

int main() {
    int n = 1000000;            // Vector size
    int *a, *b, *c;             // Host vectors
    int *d_a, *d_b, *d_c;       // Device vectors
    int size = n * sizeof(int); // Size in bytes

    // Allocate memory for host vectors
    a = (int*) malloc(size);
    b = (int*) malloc(size);
    c = (int*) malloc(size);

    // Initialize host vectors
    for (int i = 0; i < n; i++) {
        a[i] = i;
        b[i] = i;
    }

    // Allocate memory for device vectors
    cudaMalloc((void**) &d_a, size);
    cudaMalloc((void**) &d_b, size);
    cudaMalloc((void**) &d_c, size);

    // Copy host vectors to device vectors
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

    // Define block size and grid size
    int blockSize = 256;
    int gridSize = (n + blockSize - 1) / blockSize;

    // Launch kernel
    vectorAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);

    // Copy device result vector to host result vector
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);

    // Verify the result
    for (int i = 0; i < n; i++) {
        if (c[i] != 2*i) {
            printf("Error: c[%d] = %d\n", i, c[i]);
            break;
        }
    }

    // Free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // Free host memory
    free(a);
    free(b);
    free(c);
    return 0;
}
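
The program can be built and run with nvcc, for example nvcc vector_add.cu -o vector_add followed by ./vector_add (the file name is an assumption). The code above does not check whether the kernel launch succeeded; a minimal sketch of such a check, placed right after the kernel launch, could look like this:

    // Report any error from the preceding kernel launch (sketch, not in the original)
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(err));
    }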

To Install nvcc Program:

sudo apt-get install nvidia-cuda-toolkit

sudo apt-get install libglade2-0

Again run the below command -

sudo apt-get install nvidia-cuda-toolkit

CUDA Program for Multiplication of Two Large Matrices:

#include <stdio.h>

#define BLOCK_SIZE 16

__global__ void matrix_multiply(float *a, float *b, float *c, int n)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0;

    if (row < n && col < n)
    {
        for (int i = 0; i < n; ++i)
        {
            sum += a[row * n + i] * b[i * n + col];
        }
        c[row * n + col] = sum;
    }
}

int main()
{
    int n = 1024;
    size_t size = n * n * sizeof(float);
    float *a, *b, *c;
    float *d_a, *d_b, *d_c;
    cudaEvent_t start, stop;
    float elapsed_time;

    // Allocate host memory
    a = (float*)malloc(size);
    b = (float*)malloc(size);
    c = (float*)malloc(size);

    // Initialize matrices
    for (int i = 0; i < n * n; ++i)
    {
        a[i] = i % n;
        b[i] = i % n;
    }

    // Allocate device memory
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);

    // Copy input data to device
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

    // Set kernel launch configuration
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    dim3 blocks((n + threads.x - 1) / threads.x, (n + threads.y - 1) / threads.y);

    // Launch kernel and time it with CUDA events
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    matrix_multiply<<<blocks, threads>>>(d_a, d_b, d_c, n);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsed_time, start, stop);

    // Copy output data to host
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);

    // Print elapsed time
    printf("Elapsed time: %f ms\n", elapsed_time);

    // Free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // Free host memory
    free(a);
    free(b);
    free(c);
    return 0;
}
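
The program reports only the elapsed time. A small host-side spot check (a sketch, not part of the original; the chosen indices are arbitrary) can recompute one element on the CPU and compare it with the GPU result, for example just before the host memory is freed:

    // Recompute a single element on the CPU and compare with the GPU result
    int row = 1, col = 2;
    float expected = 0;
    for (int i = 0; i < n; ++i) {
        expected += a[row * n + i] * b[i * n + col];
    }
    printf("c[%d][%d] = %f (expected %f)\n", row, col, c[row * n + col], expected);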
