The document contains multiple MPI (Message Passing Interface), OpenMP (Open Multi-Processing), and CUDA code examples demonstrating parallel programming techniques. Key examples include calculating the value of pi using numerical integration and Monte Carlo simulation, scattering and gathering matrix data, finding the maximum value in an array, matrix multiplication with and without OpenMP, prime number generation, and introductory CUDA kernels (hello world, prefix sum, matrix transpose, and element-wise vector multiplication).
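
Unless noted otherwise, the MPI examples are assumed to be built with an MPI compiler wrapper (for example, mpic++ or mpicxx) and launched with mpirun -np <processes>, the OpenMP examples with g++ -fopenmp, the hybrid MPI+OpenMP examples with mpic++ -fopenmp, and the CUDA examples with nvcc.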


#include <mpi.h>
#include <iostream>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv); // Initialize the MPI environment

    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size); // Get the total number of processes

    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); // Get the rank of the current process

    char processor_name[MPI_MAX_PROCESSOR_NAME];
    int name_len;
    MPI_Get_processor_name(processor_name, &name_len); // Get the processor name

    // Print a hello message from each process
    std::cout << "Hello from processor " << processor_name
              << ", rank " << world_rank
              << " out of " << world_size << " processes.\n";

    MPI_Finalize(); // Finalize the MPI environment
    return 0;
}

=====================================================================
#include <mpi.h>
#include <iostream>
#include <cmath>

int main(int argc, char* argv[]) {
    int rank, size, n;
    double pi = 0.0, local_sum = 0.0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Get process rank
    MPI_Comm_size(MPI_COMM_WORLD, &size); // Get number of processes

    if (size != 4) {
        if (rank == 0) {
            std::cerr << "This program requires exactly 4 processes.\n";
        }
        MPI_Finalize();
        return 1;
    }

    if (rank == 0) {
        std::cout << "Enter the number of intervals (n): ";
        std::cin >> n;
    }

    // Broadcast n to all processes
    MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);

    // Start timer
    double start_time = MPI_Wtime();

    double h = 1.0 / n;
    double start = rank * 0.25; // Each process handles a 0.25 chunk of [0, 1]
    double end = (rank + 1) * 0.25;

    for (double x = start; x < end; x += h) {
        double mid = x + h / 2.0;
        local_sum += 4.0 / (1.0 + mid * mid); // Function being integrated
    }

    local_sum *= h; // Multiply sum by step size h

    // Combine all local sums into pi on rank 0
    MPI_Reduce(&local_sum, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    // Stop timer
    double end_time = MPI_Wtime();

    if (rank == 0) {
        std::cout << "Estimated value of pi: " << pi << std::endl;
        std::cout << "Time taken: " << end_time - start_time << " seconds\n";
    }

    MPI_Finalize();
    return 0;
}


=====================================================================



#include <iostream>
#include <mpi.h>
#define MATRIX_SIZE 4

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv); // Initialize MPI

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Get current process rank
    MPI_Comm_size(MPI_COMM_WORLD, &size); // Get total number of processes
                                          // (expects MATRIX_SIZE processes, one per row)

    char matrix[MATRIX_SIZE][MATRIX_SIZE]; // 4x4 matrix
    char recv_data[MATRIX_SIZE];           // Each process receives 1 row

    double start_time = MPI_Wtime(); // Start timing

    if (rank == 0) {
        // Initialize matrix with example characters
        char values[16] = {
            'o', 'd', 'v', 'g',
            'e', 'x', 't', 'q',
            'a', 's', 'y', 'u',
            'z', 'n', 'b', 'c'
        };

        int idx = 0;
        for (int i = 0; i < MATRIX_SIZE; ++i) {
            for (int j = 0; j < MATRIX_SIZE; ++j) {
                matrix[i][j] = values[idx++];
            }
        }
    }

    // Scatter one row to each process
    MPI_Scatter(matrix, MATRIX_SIZE, MPI_CHAR,
                recv_data, MATRIX_SIZE, MPI_CHAR,
                0, MPI_COMM_WORLD);

    // Each process prints its received row
    std::cout << "Rank " << rank << " received: ";
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        std::cout << recv_data[i] << " ";
    }
    std::cout << std::endl;

    double end_time = MPI_Wtime(); // End timing

    if (rank == 0) {
        std::cout << "Time taken for scattering the matrix: "
                  << (end_time - start_time) << " seconds\n";
    }

    MPI_Finalize();
    return 0;
}
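
The document's description also mentions gathering, but the program above only scatters. A minimal sketch of the reverse step, placed before MPI_Finalize() in the program above (the gathered buffer is an illustrative addition, not part of the original code):

    // Collect one row back from each process into a full matrix on rank 0.
    char gathered[MATRIX_SIZE][MATRIX_SIZE];
    MPI_Gather(recv_data, MATRIX_SIZE, MPI_CHAR,
               gathered, MATRIX_SIZE, MPI_CHAR,
               0, MPI_COMM_WORLD);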



=====================================================================​

#include <mpi.h>
#include <iostream>
using namespace std;

int main(int argc, char* argv[]) {
    int rank, size, send_data, recv_data;

    MPI_Init(&argc, &argv);               // Initialize MPI
    MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Get process rank
    MPI_Comm_size(MPI_COMM_WORLD, &size); // Get total number of processes

    send_data = rank; // Each process sends its rank as data

    int next = (rank + 1) % size;        // Next process in the ring
    int prev = (rank - 1 + size) % size; // Previous process (handles wraparound)

    // Send to next, receive from previous
    MPI_Sendrecv(&send_data, 1, MPI_INT, next, 0,
                 &recv_data, 1, MPI_INT, prev, 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);

    cout << "Process " << rank << " received data " << recv_data
         << " from process " << prev << endl;

    MPI_Finalize(); // Finalize MPI
    return 0;
}



====================================================================​

#include <mpi.h>
#include <iostream>
#include <vector>
using namespace std;

int main(int argc, char* argv[]) {
    int rank, size, local_max, global_max;
    const int n = 8;                               // Size of the array
    int arr[n] = {12, 45, 67, 89, 23, 78, 56, 99}; // Array elements

    MPI_Init(&argc, &argv);               // Initialize MPI
    MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Get rank
    MPI_Comm_size(MPI_COMM_WORLD, &size); // Get number of processes

    int chunk_size = n / size;         // Divide array among processes
                                       // (assumes n is divisible by the number of processes)
    vector<int> local_arr(chunk_size); // Each process receives part of the array

    // Scatter the array to all processes
    MPI_Scatter(arr, chunk_size, MPI_INT,
                local_arr.data(), chunk_size, MPI_INT,
                0, MPI_COMM_WORLD);

    // Find local maximum in each process's chunk
    local_max = local_arr[0];
    for (int i = 1; i < chunk_size; i++) {
        if (local_arr[i] > local_max)
            local_max = local_arr[i];
    }

    // Reduce local max values to find global max at root process
    MPI_Reduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX,
               0, MPI_COMM_WORLD);

    // Only root process prints the result
    if (rank == 0) {
        cout << "Maximum value in the array: " << global_max << endl;
    }

    MPI_Finalize(); // Finalize MPI
    return 0;
}
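
If every rank needs the result rather than only rank 0, the reduce-then-print pattern above can be replaced by a single collective; a one-line variant of the reduction step:

    // Every process receives the global maximum directly.
    MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);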


======================================================================​


#include <iostream>
#include <vector>
#include <chrono>
#include <omp.h>
using namespace std;

void matrixMultiplyTraditional(const vector<vector<int>>& A,
                               const vector<vector<int>>& B,
                               vector<vector<int>>& C) {
    int N = A.size();
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            C[i][j] = 0;
            for (int k = 0; k < N; ++k) {
                C[i][j] += A[i][k] * B[k][j];
            }
        }
    }
}

void matrixMultiplyOpenMP(const vector<vector<int>>& A,
                          const vector<vector<int>>& B,
                          vector<vector<int>>& C) {
    int N = A.size();
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            C[i][j] = 0;
            for (int k = 0; k < N; ++k) {
                C[i][j] += A[i][k] * B[k][j];
            }
        }
    }
}

int main() {
    const int N = 1000;                          // Size of the square matrix
    vector<vector<int>> A(N, vector<int>(N, 1)); // Matrix A filled with 1s
    vector<vector<int>> B(N, vector<int>(N, 2)); // Matrix B filled with 2s
    vector<vector<int>> C(N, vector<int>(N, 0)); // Result matrix

    // Traditional Multiplication
    auto start = chrono::high_resolution_clock::now();
    matrixMultiplyTraditional(A, B, C);
    auto end = chrono::high_resolution_clock::now();
    auto durationTraditional = chrono::duration_cast<chrono::milliseconds>(end - start);
    cout << "Traditional Matrix Multiplication Time: "
         << durationTraditional.count() << " milliseconds\n";

    // Reset result matrix
    C = vector<vector<int>>(N, vector<int>(N, 0));

    // OpenMP Multiplication
    start = chrono::high_resolution_clock::now();
    matrixMultiplyOpenMP(A, B, C);
    end = chrono::high_resolution_clock::now();
    auto durationOpenMP = chrono::duration_cast<chrono::milliseconds>(end - start);
    cout << "OpenMP Matrix Multiplication Time: "
         << durationOpenMP.count() << " milliseconds\n";

    return 0;
}
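
The OpenMP version parallelizes only the outer i loop, which is usually enough at N = 1000. If more parallelism is wanted, the two outer loops (which are perfectly nested) can be combined; a sketch of the alternative pragma, with everything else in matrixMultiplyOpenMP unchanged:

    // Distribute all (i, j) pairs across threads instead of only the rows.
    #pragma omp parallel for collapse(2) schedule(static)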

======================================================================​
#include <stdio.h>
#include <omp.h>

#define SIZE 10

int main() {
    int arr[SIZE] = {12, 45, 67, 89, 23, 78, 90, 34, 99, 56};
    int max_value = arr[0];

    #pragma omp parallel for
    for (int i = 1; i < SIZE; i++) {
        #pragma omp critical
        {
            if (arr[i] > max_value) {
                max_value = arr[i];
            }
        }
    }

    printf("Maximum value in the array: %d\n", max_value);
    return 0;
}
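
Because every iteration enters the critical section, the loop above runs essentially serially. A minimal alternative sketch, assuming a compiler supporting OpenMP 3.1 or later (which added the max reduction):

#include <stdio.h>
#include <omp.h>

#define SIZE 10

int main() {
    int arr[SIZE] = {12, 45, 67, 89, 23, 78, 90, 34, 99, 56};
    int max_value = arr[0];

    // Each thread keeps a private maximum; OpenMP combines them at the end.
    #pragma omp parallel for reduction(max:max_value)
    for (int i = 1; i < SIZE; i++) {
        if (arr[i] > max_value) {
            max_value = arr[i];
        }
    }

    printf("Maximum value in the array: %d\n", max_value);
    return 0;
}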


========================================================================​


#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <omp.h>
#define NUM_POINTS 100000000

using namespace std;

int main() {
    int count = 0;

    #pragma omp parallel
    {
        int local_count = 0;
        unsigned int seed = 42 + omp_get_thread_num(); // Unique seed for each thread

        #pragma omp for
        for (int i = 0; i < NUM_POINTS; i++) {
            // rand_r is POSIX; on non-POSIX platforms substitute another thread-safe RNG
            double x = (double)rand_r(&seed) / RAND_MAX;
            double y = (double)rand_r(&seed) / RAND_MAX;

            if (x * x + y * y <= 1.0)
                local_count++;
        }

        #pragma omp atomic
        count += local_count;
    }

    double pi = 4.0 * count / NUM_POINTS;
    printf("Estimated Pi value: %f\n", pi);

    return 0;
}
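
The per-thread counter plus atomic update can also be expressed with a reduction clause on the parallel region; a sketch of just that region, replacing the block above (the rest of the program is unchanged):

    #pragma omp parallel reduction(+:count)
    {
        unsigned int seed = 42 + omp_get_thread_num(); // Unique seed for each thread

        #pragma omp for
        for (int i = 0; i < NUM_POINTS; i++) {
            double x = (double)rand_r(&seed) / RAND_MAX;
            double y = (double)rand_r(&seed) / RAND_MAX;

            if (x * x + y * y <= 1.0)
                count++; // Each thread increments its private copy of count
        }
    }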

=======================================================================​


#include <iostream>
#include <vector>
#include <algorithm>
#include <omp.h>

// Check if a number is prime
bool is_prime(int num) {
    if (num < 2)
        return false;
    for (int i = 2; i * i <= num; ++i) {
        if (num % i == 0)
            return false;
    }
    return true;
}

int main() {
    int N;
    std::cout << "Enter the value of N: ";
    std::cin >> N;

    std::vector<int> prime_array; // Shared result vector
    omp_lock_t lock;
    omp_init_lock(&lock);

    // Parallel region
    #pragma omp parallel
    {
        std::vector<int> local_primes;

        #pragma omp for schedule(dynamic)
        for (int i = 2; i <= N; ++i) {
            if (is_prime(i)) {
                local_primes.push_back(i);
            }
        }

        // Lock-protected merge into the shared vector
        omp_set_lock(&lock);
        prime_array.insert(prime_array.end(), local_primes.begin(), local_primes.end());
        omp_unset_lock(&lock);
    }

    omp_destroy_lock(&lock);

    // Threads merge their chunks in arbitrary order, so sort before printing
    std::sort(prime_array.begin(), prime_array.end());

    // Output results
    std::cout << "Prime numbers between 2 and " << N << ":\n";
    for (int prime : prime_array) {
        std::cout << prime << " ";
    }
    std::cout << "\nTotal primes found: " << prime_array.size() << std::endl;

    return 0;
}


======================================================================​


#include <iostream>
#include <cstdlib>
#include <ctime>
#include <omp.h>
#include <mpi.h>

#define TOTAL_POINTS 100000000 // Total points to be generated

using namespace std;

int main(int argc, char* argv[]) {
    int rank, size;

    MPI_Init(&argc, &argv);               // Initialize MPI
    MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Get current process rank
    MPI_Comm_size(MPI_COMM_WORLD, &size); // Get total number of processes

    int points_per_process = TOTAL_POINTS / size;
    int local_count = 0;

    // Seed random generator uniquely per process; each thread adds its own offset
    unsigned int seed_base = time(NULL) + rank;

    #pragma omp parallel
    {
        unsigned int seed = seed_base + omp_get_thread_num();

        #pragma omp for reduction(+:local_count) schedule(static)
        for (int i = 0; i < points_per_process; ++i) {
            double x = static_cast<double>(rand_r(&seed)) / RAND_MAX;
            double y = static_cast<double>(rand_r(&seed)) / RAND_MAX;

            if (x * x + y * y <= 1.0) {
                local_count++;
            }
        }
    }

    int global_count = 0;

    // Reduce results from all processes
    MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        // Divide by the number of points actually generated
        // (TOTAL_POINTS / size rounds down when size does not divide it evenly)
        long long total_generated = static_cast<long long>(points_per_process) * size;
        double pi = 4.0 * global_count / total_generated;
        cout << "Estimated value of Pi = " << pi << endl;
    }

    MPI_Finalize();
    return 0;
}

=======================================================================​

#include <iostream>
#include <omp.h>
#include <mpi.h>

using namespace std;

int main(int argc, char* argv[]) {
    int rank, size;
    const long num_intervals = 1000000000; // Total intervals for integration
    double step = 1.0 / num_intervals;
    double local_sum = 0.0, global_sum = 0.0;

    MPI_Init(&argc, &argv);               // Initialize MPI
    MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Get current process rank
    MPI_Comm_size(MPI_COMM_WORLD, &size); // Get total number of processes

    long local_start = rank * (num_intervals / size);
    long local_end = (rank + 1) * (num_intervals / size);
    if (rank == size - 1)
        local_end = num_intervals; // Last rank picks up any leftover intervals

    // Parallel region using OpenMP
    #pragma omp parallel for reduction(+:local_sum) schedule(dynamic)
    for (long i = local_start; i < local_end; ++i) {
        double x = (i + 0.5) * step;
        local_sum += 1.0 / (1.0 + x * x);
    }

    // Combine results from all MPI processes
    MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        double pi = 4.0 * step * global_sum;
        cout << "Estimated value of Pi using numerical integration: " << pi << endl;
    }

    MPI_Finalize();
    return 0;
}
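
Hybrid programs like the two above are typically compiled with both toolchains enabled (for example, mpic++ -fopenmp file.cpp) and launched with mpirun -np <processes>, with the OMP_NUM_THREADS environment variable controlling how many threads each MPI process spawns.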


========================================================================​

#include <cstdio> // Device-side printf

__global__ void helloFromGPU() {
    printf("Hello World from GPU thread %d!\n", threadIdx.x);
}

int main() {
    helloFromGPU<<<1, 5>>>(); // 1 block, 5 threads
    cudaDeviceSynchronize();  // Wait for the kernel (and its output) to finish
    return 0;
}



========================================================================​
#include <iostream>
#include <cuda_runtime.h>

__global__ void prefix_sum(int *in, int *out, int n) {
    int tid = threadIdx.x;

    if (tid < n) {
        // Naive inclusive scan: each thread sums every element up to its index.
        // O(n^2) total work; fine for a demo with a single small block.
        int sum = 0;
        for (int i = 0; i <= tid; i++) {
            sum += in[i];
        }
        out[tid] = sum;
    }
}

int main() {
    const int N = 8;
    int h_in[N] = {1, 2, 3, 4, 5, 6, 7, 8};
    int h_out[N];

    int *d_in, *d_out;
    cudaMalloc(&d_in, N * sizeof(int));
    cudaMalloc(&d_out, N * sizeof(int));

    cudaMemcpy(d_in, h_in, N * sizeof(int), cudaMemcpyHostToDevice);

    prefix_sum<<<1, N>>>(d_in, d_out, N);
    cudaDeviceSynchronize();

    cudaMemcpy(h_out, d_out, N * sizeof(int), cudaMemcpyDeviceToHost);

    std::cout << "Prefix Sum Output:\n";
    for (int i = 0; i < N; i++) {
        std::cout << h_out[i] << " ";
    }
    std::cout << std::endl;

    cudaFree(d_in);
    cudaFree(d_out);

    return 0;
}
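
The kernel above does O(n^2) work in total. A Hillis-Steele scan keeps the same single-block setup but shares partial sums between threads, reducing the work to O(n log n); a minimal sketch (the kernel name and the dynamic shared-memory launch are illustrative additions, not part of the original program):

__global__ void prefix_sum_scan(int *in, int *out, int n) {
    extern __shared__ int temp[]; // One int per thread, sized at launch time
    int tid = threadIdx.x;

    if (tid < n) temp[tid] = in[tid];
    __syncthreads();

    // Each pass adds the value 'offset' positions to the left.
    for (int offset = 1; offset < n; offset *= 2) {
        int val = 0;
        if (tid >= offset && tid < n) val = temp[tid - offset];
        __syncthreads(); // All threads read before any thread writes
        if (tid < n) temp[tid] += val;
        __syncthreads();
    }

    if (tid < n) out[tid] = temp[tid];
}

// Launch with dynamic shared memory, e.g.:
//   prefix_sum_scan<<<1, N, N * sizeof(int)>>>(d_in, d_out, N);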

========================================================================
#include <iostream>
#include <cuda_runtime.h>
#define N 512
#define TILE 16 // Threads per block dimension (16 x 16 = 256 threads per block)

__global__ void transposeKernel(int *in, int *out) {
    // A 2D grid of 2D blocks is required: a single block cannot cover
    // 512 x 512 threads (CUDA limits a block to 1024 threads).
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int idx = row * N + col;
    int transposed_idx = col * N + row;
    if (row < N && col < N)
        out[transposed_idx] = in[idx];
}

void cpuTranspose(int* in, int* out) {
    for (int i = 0; i < N; ++i)
        for (int j = 0; j < N; ++j)
            out[j * N + i] = in[i * N + j];
}

int main() {
    int size = N * N * sizeof(int);
    int *h_in = new int[N * N];
    int *h_out_cpu = new int[N * N];
    int *h_out_gpu = new int[N * N];

    for (int i = 0; i < N * N; i++) h_in[i] = i;

    int *d_in, *d_out;
    cudaMalloc(&d_in, size);
    cudaMalloc(&d_out, size);
    cudaMemcpy(d_in, h_in, size, cudaMemcpyHostToDevice);

    dim3 threadsPerBlock(TILE, TILE);
    dim3 numBlocks(N / TILE, N / TILE);
    transposeKernel<<<numBlocks, threadsPerBlock>>>(d_in, d_out);
    cudaMemcpy(h_out_gpu, d_out, size, cudaMemcpyDeviceToHost);

    cpuTranspose(h_in, h_out_cpu);

    // Optional: Compare results

    cudaFree(d_in); cudaFree(d_out);
    delete[] h_in; delete[] h_out_cpu; delete[] h_out_gpu;
    return 0;
}

========================================================================

#include <iostream>
#include <cuda_runtime.h>
#define N 1024

__global__ void vectorMulKernel(int *a, int *b, int *c) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < N)
        c[i] = a[i] * b[i];
}

void cpuVectorMul(int* a, int* b, int* c) {
    for (int i = 0; i < N; ++i)
        c[i] = a[i] * b[i];
}

int main() {
    int size = N * sizeof(int);
    int *h_a = new int[N];
    int *h_b = new int[N];
    int *h_c_cpu = new int[N];
    int *h_c_gpu = new int[N];

    for (int i = 0; i < N; i++) {
        h_a[i] = i;
        h_b[i] = i * 2;
    }

    int *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);

    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    vectorMulKernel<<<(N + 255) / 256, 256>>>(d_a, d_b, d_c);
    cudaMemcpy(h_c_gpu, d_c, size, cudaMemcpyDeviceToHost);

    cpuVectorMul(h_a, h_b, h_c_cpu);

    // Optional: Compare results

    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    delete[] h_a; delete[] h_b; delete[] h_c_cpu; delete[] h_c_gpu;
    return 0;
}
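
None of the CUDA examples check whether a kernel launch succeeded, so configuration errors fail silently. A minimal check that can be dropped in after any of the launches above (an illustrative addition, not part of the original programs):

    // Report launch/configuration errors and errors from the preceding kernel.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;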
