#include <mpi.h>
#include <iostream>
int main(int argc, char** argv) {
MPI_Init(&argc, &argv); // Initialize the MPI environment
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size); // Get the total number of processes
int world_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); // Get the rank of the current process
char processor_name[MPI_MAX_PROCESSOR_NAME];
int name_len;
MPI_Get_processor_name(processor_name, &name_len); // Get the processor name
// Print a hello message from each process
std::cout << "Hello from processor " << processor_name
<< ", rank " << world_rank
<< " out of " << world_size << " processes.\n";
MPI_Finalize(); // Finalize the MPI environment
return 0;
}
=====================================================================
#include <mpi.h>
#include <iostream>
#include <cmath>
int main(int argc, char* argv[]) {
int rank, size, n;
double pi = 0.0, local_sum = 0.0;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Get process rank
MPI_Comm_size(MPI_COMM_WORLD, &size); // Get number of processes
if (size != 4) {
if (rank == 0) {
std::cerr << "This program requires exactly 4 processes.\n";
}
MPI_Finalize();
return 1;
}
if (rank == 0) {
std::cout << "Enter the number of intervals (n): ";
std::cin >> n;
}
// Broadcast n to all processes
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
// Start timer
double start_time = MPI_Wtime();
double h = 1.0 / n;
double start = rank * 0.25; // Each process handles a 0.25 chunk
double end = (rank + 1) * 0.25;
for (double x = start; x < end; x += h) {
double mid = x + h / 2.0;
local_sum += 4.0 / (1.0 + mid * mid); // Function being integrated
}
local_sum *= h; // Multiply sum by step size h
// Combine all local sums into pi on rank 0
MPI_Reduce(&local_sum, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
// Stop timer
double end_time = MPI_Wtime();
if (rank == 0) {
std::cout << "Estimated value of pi: " << pi << std::endl;
std::cout << "Time taken: " << end_time - start_time << "
seconds\n";
}
MPI_Finalize();
return 0;
}
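The listing above hard-codes four ranks, each integrating a fixed quarter of [0, 1]. Below is a minimal sketch of the same midpoint rule for an arbitrary number of processes, replacing the chunked loop with a cyclic loop over interval indices; it reuses rank, size, n, local_sum and pi from the listing.
// Sketch: rank r sums intervals r, r + size, r + 2*size, ..., so the work
// divides evenly for any process count and no chunk boundaries are needed.
double h = 1.0 / n;
local_sum = 0.0;
for (int i = rank; i < n; i += size) {
    double mid = (i + 0.5) * h;
    local_sum += 4.0 / (1.0 + mid * mid);
}
local_sum *= h;
MPI_Reduce(&local_sum, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);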
=====================================================================
#include <iostream>
#include <mpi.h>
#define MATRIX_SIZE 4
int main(int argc, char** argv) {
MPI_Init(&argc, &argv); // Initialize MPI
int rank, size;
MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Get current process rank
MPI_Comm_size(MPI_COMM_WORLD, &size); // Get total number of processes
char matrix[MATRIX_SIZE][MATRIX_SIZE]; // 4x4 matrix (assumes the program is run with exactly 4 processes)
char recv_data[MATRIX_SIZE]; // Each process receives 1 row
double start_time = MPI_Wtime(); // Start timing
if (rank == 0) {
// Initialize matrix with example characters
char values[16] = {
'o', 'd', 'v', 'g',
'e', 'x', 't', 'q',
'a', 's', 'y', 'u',
'z', 'n', 'b', 'c'
};
int idx = 0;
for (int i = 0; i < MATRIX_SIZE; ++i) {
for (int j = 0; j < MATRIX_SIZE; ++j) {
matrix[i][j] = values[idx++];
}
}
}
// Scatter one row to each process
MPI_Scatter(matrix, MATRIX_SIZE, MPI_CHAR,
recv_data, MATRIX_SIZE, MPI_CHAR,
0, MPI_COMM_WORLD);
// Each process prints its received row
std::cout << "Rank " << rank << " received: ";
for (int i = 0; i < MATRIX_SIZE; ++i) {
std::cout << recv_data[i] << " ";
}
std::cout << std::endl;
double end_time = MPI_Wtime(); // End timing
if (rank == 0) {
std::cout << "Time taken for scattering the matrix: "
<< (end_time - start_time) << " seconds\n";
}
MPI_Finalize();
return 0;
}
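If each rank later needs to send its row back to the root (for example after processing it), MPI_Gather is the inverse collective of the scatter above. A short sketch reusing recv_data; the gathered buffer is an illustrative name and is only significant on rank 0.
char gathered[MATRIX_SIZE][MATRIX_SIZE]; // filled on rank 0 only
MPI_Gather(recv_data, MATRIX_SIZE, MPI_CHAR,
           gathered, MATRIX_SIZE, MPI_CHAR,
           0, MPI_COMM_WORLD);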
=====================================================================
#include <mpi.h>
#include <iostream>
using namespace std;
int main(int argc, char* argv[]) {
int rank, size, send_data, recv_data;
MPI_Init(&argc, &argv); // Initialize MPI
MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Get process rank
MPI_Comm_size(MPI_COMM_WORLD, &size); // Get total number of processes
send_data = rank; // Each process sends its rank as data
int next = (rank + 1) % size; // Next process in the ring
int prev = (rank - 1 + size) % size; // Previous process (handles wraparound)
// Send to next, receive from previous
MPI_Sendrecv(&send_data, 1, MPI_INT, next, 0,
&recv_data, 1, MPI_INT, prev, 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
cout << "Process " << rank << " received data " << recv_data
<< " from process " << prev << endl;
MPI_Finalize(); // Finalize MPI
return 0;
}
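MPI_Sendrecv is used above because a plain blocking MPI_Send/MPI_Recv pair can deadlock in a ring when every process tries to send first. The same exchange can also be written with non-blocking calls; a sketch reusing the variables from the listing:
MPI_Request reqs[2];
MPI_Irecv(&recv_data, 1, MPI_INT, prev, 0, MPI_COMM_WORLD, &reqs[0]);
MPI_Isend(&send_data, 1, MPI_INT, next, 0, MPI_COMM_WORLD, &reqs[1]);
MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);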
====================================================================
#include <mpi.h>
#include <iostream>
#include <vector>
using namespace std;
int main(int argc, char* argv[]) {
int rank, size, local_max, global_max;
const int n = 8; // Size of the array
int arr[n] = {12, 45, 67, 89, 23, 78, 56, 99}; // Array elements
MPI_Init(&argc, &argv); // Initialize MPI
MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Get rank
MPI_Comm_size(MPI_COMM_WORLD, &size); // Get number of processes
int chunk_size = n / size; // Divide array among processes
vector<int> local_arr(chunk_size); // Each process receives part of the array
// Scatter the array to all processes
MPI_Scatter(arr, chunk_size, MPI_INT,
local_arr.data(), chunk_size, MPI_INT,
0, MPI_COMM_WORLD);
// Find local maximum in each process's chunk
local_max = local_arr[0];
for (int i = 1; i < chunk_size; i++) {
if (local_arr[i] > local_max)
local_max = local_arr[i];
}
// Reduce local max values to find global max at root process
MPI_Reduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX,
0, MPI_COMM_WORLD);
// Only root process prints the result
if (rank == 0) {
cout << "Maximum value in the array: " << global_max << endl;
}
MPI_Finalize(); // Finalize MPI
return 0;
}
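The scatter above assumes n is divisible by the number of processes. When it is not, MPI_Scatterv with per-rank counts and displacements distributes the remainder; a sketch that reuses arr, n, rank and size from the listing and would additionally need <vector>:
vector<int> counts(size), displs(size);
for (int r = 0; r < size; ++r) {
    counts[r] = n / size + (r < n % size ? 1 : 0); // first n % size ranks get one extra element
    displs[r] = (r == 0) ? 0 : displs[r - 1] + counts[r - 1];
}
vector<int> local(counts[rank]);
MPI_Scatterv(arr, counts.data(), displs.data(), MPI_INT,
             local.data(), counts[rank], MPI_INT, 0, MPI_COMM_WORLD);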
======================================================================
#include <iostream>
#include <vector>
#include <chrono>
#include <omp.h>
using namespace std;
void matrixMultiplyTraditional(const vector<vector<int>>& A,
const vector<vector<int>>& B,
vector<vector<int>>& C) {
int N = A.size();
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
C[i][j] = 0;
for (int k = 0; k < N; ++k) {
C[i][j] += A[i][k] * B[k][j];
}
}
}
}
void matrixMultiplyOpenMP(const vector<vector<int>>& A,
const vector<vector<int>>& B,
vector<vector<int>>& C) {
int N = A.size();
#pragma omp parallel for schedule(static)
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
C[i][j] = 0;
for (int k = 0; k < N; ++k) {
C[i][j] += A[i][k] * B[k][j];
}
}
}
}
int main() {
const int N = 1000; // Size of the square matrix
vector<vector<int>> A(N, vector<int>(N, 1)); // Matrix A filled with 1s
vector<vector<int>> B(N, vector<int>(N, 2)); // Matrix B filled with 2s
vector<vector<int>> C(N, vector<int>(N, 0)); // Result matrix
// Traditional Multiplication
auto start = chrono::high_resolution_clock::now();
matrixMultiplyTraditional(A, B, C);
auto end = chrono::high_resolution_clock::now();
auto durationTraditional =
chrono::duration_cast<chrono::milliseconds>(end - start);
cout << "Traditional Matrix Multiplication Time: " <<
durationTraditional.count() << " milliseconds\n";
// Reset result matrix
C = vector<vector<int>>(N, vector<int>(N, 0));
// OpenMP Multiplication
start = chrono::high_resolution_clock::now();
matrixMultiplyOpenMP(A, B, C);
end = chrono::high_resolution_clock::now();
auto durationOpenMP = chrono::duration_cast<chrono::milliseconds>(end -
start);
cout << "OpenMP Matrix Multiplication Time: " << durationOpenMP.count()
<< " milliseconds\n";
return 0;
}
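In matrixMultiplyOpenMP only the outer i loop is divided among threads. When N is small relative to the thread count, collapse(2) exposes the combined i, j iteration space instead; a sketch of the alternative pragma on the same loop nest:
#pragma omp parallel for collapse(2) schedule(static)
for (int i = 0; i < N; ++i) {
    for (int j = 0; j < N; ++j) {
        C[i][j] = 0;
        for (int k = 0; k < N; ++k) {
            C[i][j] += A[i][k] * B[k][j]; // same kernel, finer-grained work distribution
        }
    }
}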
======================================================================
#include <stdio.h>
#include <omp.h>
#define SIZE 10
int main() {
int arr[SIZE] = {12, 45, 67, 89, 23, 78, 90, 34, 99, 56};
int max_value = arr[0];
#pragma omp parallel for
for (int i = 1; i < SIZE; i++) {
#pragma omp critical
{
if (arr[i] > max_value) {
max_value = arr[i];
}
}
}
printf("Maximum value in the array: %d\n", max_value);
return 0;
}
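The critical section above keeps the update race-free but serializes every comparison. Since OpenMP 3.1 the same result can be expressed as a max reduction, which lets each thread keep a private maximum and combines them at the end; a sketch of the loop only, reusing arr and SIZE:
int max_value = arr[0];
#pragma omp parallel for reduction(max:max_value)
for (int i = 1; i < SIZE; i++) {
    if (arr[i] > max_value) {
        max_value = arr[i]; // each thread updates its private copy; OpenMP merges them
    }
}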
========================================================================
#include <iostream>
#include <cstdlib>
#include <cstdio>
#include <omp.h>
#define NUM_POINTS 100000000
using namespace std;
int main() {
int count = 0;
#pragma omp parallel
{
int local_count = 0;
unsigned int seed = 42 + omp_get_thread_num(); // Unique seed for each thread
#pragma omp for
for (int i = 0; i < NUM_POINTS; i++) {
double x = (double)rand_r(&seed) / RAND_MAX;
double y = (double)rand_r(&seed) / RAND_MAX;
if (x * x + y * y <= 1.0)
local_count++;
}
#pragma omp atomic
count += local_count;
}
double pi = 4.0 * count / NUM_POINTS;
printf("Estimated Pi value: %f\n", pi);
return 0;
}
=======================================================================
#include <iostream>
#include <vector>
#include <omp.h>
// Check if a number is prime
bool is_prime(int num) {
if (num < 2)
return false;
for (int i = 2; i * i <= num; ++i) {
if (num % i == 0)
return false;
}
return true;
}
int main() {
int N;
std::cout << "Enter the value of N: ";
std::cin >> N;
std::vector<int> prime_array; // Shared result array
omp_lock_t lock;
omp_init_lock(&lock);
// Parallel region
#pragma omp parallel
{
std::vector<int> local_primes;
#pragma omp for schedule(dynamic)
for (int i = 2; i <= N; ++i) {
if (is_prime(i)) {
local_primes.push_back(i);
}
}
// Lock-protected merge into shared vector
omp_set_lock(&lock);
prime_array.insert(prime_array.end(), local_primes.begin(),
local_primes.end());
omp_unset_lock(&lock);
}
omp_destroy_lock(&lock);
// Output results
std::cout << "Prime numbers between 2 and " << N << ":\n";
for (int prime : prime_array) {
std::cout << prime << " ";
}
std::cout << "\nTotal primes found: " << prime_array.size() <<
std::endl;
return 0;
}
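Because the threads append their local vectors in whatever order they happen to reach the lock, prime_array is generally not in ascending order. A one-line sketch that restores ordered output before printing (it would also require <algorithm>):
std::sort(prime_array.begin(), prime_array.end()); // sort after the parallel merge, before printing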
======================================================================
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <omp.h>
#include <mpi.h>
#define TOTAL_POINTS 100000000 // Total points to be generated
using namespace std;
int main(int argc, char* argv[]) {
int rank, size;
MPI_Init(&argc, &argv); // Initialize MPI
MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Get current process rank
MPI_Comm_size(MPI_COMM_WORLD, &size); // Get total number of processes
int points_per_process = TOTAL_POINTS / size;
int local_count = 0;
// Per-process seed base; each thread adds its thread number below
unsigned int seed_base = time(NULL) + rank;
#pragma omp parallel
{
unsigned int seed = seed_base + omp_get_thread_num();
#pragma omp for reduction(+:local_count) schedule(static)
for (int i = 0; i < points_per_process; ++i) {
double x = static_cast<double>(rand_r(&seed)) / RAND_MAX;
double y = static_cast<double>(rand_r(&seed)) / RAND_MAX;
if (x * x + y * y <= 1.0) {
local_count++;
}
}
}
int global_count = 0;
// Reduce results from all processes
MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0,
MPI_COMM_WORLD);
if (rank == 0) {
// Divide by the number of points actually generated (TOTAL_POINTS may not be divisible by size)
double pi = 4.0 * global_count / (static_cast<double>(points_per_process) * size);
cout << "Estimated value of Pi = " << pi << endl;
}
MPI_Finalize();
return 0;
}
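Because this program mixes MPI with OpenMP threads, it is safer to request a threading support level explicitly rather than call plain MPI_Init. All MPI calls here are made outside the parallel region by the initial thread, so MPI_THREAD_FUNNELED is enough; a sketch of the alternative initialization:
int provided;
MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
if (provided < MPI_THREAD_FUNNELED) {
    MPI_Abort(MPI_COMM_WORLD, 1); // the library cannot guarantee the requested threading level
}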
=======================================================================
#include <iostream>
#include <omp.h>
#include <mpi.h>
using namespace std;
int main(int argc, char* argv[]) {
int rank, size;
const long num_intervals = 1000000000; // Total intervals for integration
double step = 1.0 / num_intervals;
double local_sum = 0.0, global_sum = 0.0;
MPI_Init(&argc, &argv); // Initialize MPI
MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Get current process rank
MPI_Comm_size(MPI_COMM_WORLD, &size); // Get total number of processes
long chunk = num_intervals / size;
long local_start = rank * chunk;
long local_end = (rank == size - 1) ? num_intervals : local_start + chunk; // last rank picks up any remainder
// Parallel region using OpenMP
#pragma omp parallel for reduction(+:local_sum) schedule(dynamic)
for (long i = local_start; i < local_end; ++i) {
double x = (i + 0.5) * step;
local_sum += 1.0 / (1.0 + x * x);
}
// Combine results from all MPI processes
MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0,
MPI_COMM_WORLD);
if (rank == 0) {
double pi = 4.0 * step * global_sum;
cout << "Estimated value of Pi using numerical integration: " << pi
<< endl;
}
MPI_Finalize();
return 0;
}
========================================================================
#include <cstdio> // printf is used both in the kernel and on the host
__global__ void helloFromGPU() {
printf("Hello World from GPU thread %d!\n", threadIdx.x);
}
int main() {
helloFromGPU<<<1, 5>>>(); // 1 block, 5 threads
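// Optional sketch: a bad launch configuration fails silently unless checked;
// cudaGetLastError() reports the most recent launch error.
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("Kernel launch failed: %s\n", cudaGetErrorString(err));
}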
cudaDeviceSynchronize();
return 0;
}
========================================================================
#include <iostream>
#include <cuda_runtime.h>
__global__ void prefix_sum(int *in, int *out, int n) {
int tid = threadIdx.x;
if (tid < n) {
int sum = 0;
for (int i = 0; i <= tid; i++) {
sum += in[i]; // accumulate in[0..tid] for an inclusive prefix sum
}
out[tid] = sum;
}
}
int main() {
const int N = 8;
int h_in[N] = {1, 2, 3, 4, 5, 6, 7, 8};
int h_out[N];
int *d_in, *d_out;
cudaMalloc(&d_in, N * sizeof(int));
cudaMalloc(&d_out, N * sizeof(int));
cudaMemcpy(d_in, h_in, N * sizeof(int), cudaMemcpyHostToDevice);
prefix_sum<<<1, N>>>(d_in, d_out, N);
cudaDeviceSynchronize();
cudaMemcpy(h_out, d_out, N * sizeof(int), cudaMemcpyDeviceToHost);
std::cout << "Prefix Sum Output:\n";
for (int i = 0; i < N; i++) {
std::cout << h_out[i] << " ";
}
std::cout << std::endl;
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
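The kernel above recomputes every prefix from scratch, which is O(n^2) additions in total. For a single block, a shared-memory Hillis-Steele scan produces the same inclusive prefix sums in O(n log n) additions; a sketch of an alternative kernel, launched as prefix_sum_scan<<<1, N, N * sizeof(int)>>>(d_in, d_out, N):
__global__ void prefix_sum_scan(int *in, int *out, int n) {
    extern __shared__ int temp[]; // dynamic shared memory: n ints
    int tid = threadIdx.x;
    if (tid < n) temp[tid] = in[tid];
    __syncthreads();
    for (int offset = 1; offset < n; offset *= 2) {
        int addend = 0;
        if (tid >= offset && tid < n) addend = temp[tid - offset]; // read before it is overwritten
        __syncthreads();
        if (tid >= offset && tid < n) temp[tid] += addend;
        __syncthreads();
    }
    if (tid < n) out[tid] = temp[tid];
}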
========================================================================
#include <iostream>
#include <cuda_runtime.h>
#define N 512
__global__ void transposeKernel(int *in, int *out) {
int row = blockIdx.y * blockDim.y + threadIdx.y; // global row index
int col = blockIdx.x * blockDim.x + threadIdx.x; // global column index
if (row < N && col < N)
out[col * N + row] = in[row * N + col];
}
void cpuTranspose(int* in, int* out) {
for (int i = 0; i < N; ++i)
for (int j = 0; j < N; ++j)
out[j * N + i] = in[i * N + j];
}
int main() {
int size = N * N * sizeof(int);
int *h_in = new int[N * N];
int *h_out_cpu = new int[N * N];
int *h_out_gpu = new int[N * N];
for (int i = 0; i < N * N; i++) h_in[i] = i;
int *d_in, *d_out;
cudaMalloc(&d_in, size);
cudaMalloc(&d_out, size);
cudaMemcpy(d_in, h_in, size, cudaMemcpyHostToDevice);
dim3 threadsPerBlock(16, 16); // N x N threads exceed the 1024-per-block limit, so use a 2D grid
dim3 numBlocks((N + 15) / 16, (N + 15) / 16);
transposeKernel<<<numBlocks, threadsPerBlock>>>(d_in, d_out);
cudaMemcpy(h_out_gpu, d_out, size, cudaMemcpyDeviceToHost);
cpuTranspose(h_in, h_out_cpu);
// Optional: Compare results
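// Sketch of the optional comparison: check the GPU transpose against the CPU reference.
bool match = true;
for (int i = 0; i < N * N; ++i) {
if (h_out_cpu[i] != h_out_gpu[i]) { match = false; break; }
}
std::cout << (match ? "GPU and CPU transposes match\n" : "Transpose mismatch detected\n");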
cudaFree(d_in); cudaFree(d_out);
delete[] h_in; delete[] h_out_cpu; delete[] h_out_gpu;
return 0;
}
========================================================================
#include <iostream>
#include <cuda_runtime.h>
#define N 1024
__global__ void vectorMulKernel(int *a, int *b, int *c) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < N)
c[i] = a[i] * b[i];
}
void cpuVectorMul(int* a, int* b, int* c) {
for (int i = 0; i < N; ++i)
c[i] = a[i] * b[i];
}
int main() {
int size = N * sizeof(int);
int *h_a = new int[N];
int *h_b = new int[N];
int *h_c_cpu = new int[N];
int *h_c_gpu = new int[N];
for (int i = 0; i < N; i++) {
h_a[i] = i;
h_b[i] = i * 2;
}
int *d_a, *d_b, *d_c;
cudaMalloc(&d_a, size);
cudaMalloc(&d_b, size);
cudaMalloc(&d_c, size);
cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);
vectorMulKernel<<<(N+255)/256, 256>>>(d_a, d_b, d_c);
cudaMemcpy(h_c_gpu, d_c, size, cudaMemcpyDeviceToHost);
cpuVectorMul(h_a, h_b, h_c_cpu);
// Optional: Compare results
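// Sketch of the optional comparison: check the GPU product against the CPU reference.
bool match = true;
for (int i = 0; i < N; ++i) {
if (h_c_cpu[i] != h_c_gpu[i]) { match = false; break; }
}
std::cout << (match ? "GPU and CPU results match\n" : "Result mismatch detected\n");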
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
delete[] h_a; delete[] h_b; delete[] h_c_cpu; delete[] h_c_gpu;
return 0;
}