I am trying to run the same parallel C++ code, which calculates pi with a Monte Carlo algorithm, on Windows and on Linux with the same number of threads (4 threads on 4 CPUs). While the parallel code is faster than the serial implementation on Windows, it is much slower than the serial implementation on Linux.
Here is the program:
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <cmath>
#include <pthread.h>
#include <chrono>

using namespace std;
using ns = chrono::nanoseconds;
using get_time = chrono::steady_clock;

// Number of worker threads used by compute_pi_parallel(); may be overridden
// by the second command-line argument.
static int thread_count = 4;

// Protects the shared accumulator that every worker adds its local count to.
pthread_mutex_t myMutex;

// Per-thread input handed to threadFunc().
struct args{
    int id;            // index of the thread (set by the launcher, not read by the worker)
    int random_count;  // number of random points this thread has to sample
    double *pi;        // shared accumulator of points that fell inside the unit circle
};

// Serial Monte Carlo estimate of pi.
//
// Draws n points uniformly from the square [-1, 1] x [-1, 1] using rand(),
// counts how many lie inside the unit circle and returns 4 * hits / n.
double compute_pi(long n)
{
    double pi = 0;
    double x, y;
    for(long i=0; i<n; i++){
        x = -1 + 2 * double(rand())/RAND_MAX;
        y = -1 + 2 * double(rand())/RAND_MAX;
        if (sqrt(x*x + y*y) <= 1.0)
            pi++;
    }
    return 4*pi/n;
}

// Thread entry point: samples inputs->random_count points, counts the hits in
// a thread-local variable and adds that count to *inputs->pi under myMutex.
//
// argin must point to an `args` object that outlives the thread.
// Always returns nullptr.
//
// NOTE: every thread draws its numbers from rand(), which keeps one hidden
// state for the whole process. Whether concurrent calls are safe, and how
// expensive they are, is implementation-defined.
void* threadFunc(void *argin){
    args *inputs = (args*) argin;
    double my_sum = 0;
    double x, y;
    for(int i=0; i<inputs->random_count; i++){
        x = -1 + 2 * double(rand())/RAND_MAX;
        y = -1 + 2 * double(rand())/RAND_MAX;
        if (sqrt(x*x + y*y) <= 1.0)
            my_sum++;
    }
    // Only the final reduction touches shared data, so the lock is taken
    // once per thread rather than once per sample.
    pthread_mutex_lock(&myMutex);
    *(inputs->pi) += my_sum;
    pthread_mutex_unlock(&myMutex);
    return nullptr;
}

// Parallel Monte Carlo estimate of pi using thread_count pthreads.
//
// The n samples are split evenly between the threads; the first
// n % thread_count threads take one extra sample so that exactly n points
// are drawn in total. Returns 4 * hits / n.
double compute_pi_parallel(long n)
{
    double pi = 0;
    int count_per_thread = n/thread_count;
    pthread_t *threads = new pthread_t[thread_count];
    args *funcInputs = new args[thread_count];
    pthread_mutex_init(&myMutex, nullptr);

    for(int i=0; i<thread_count; i++){
        funcInputs[i].id = i;
        funcInputs[i].random_count = i<n%thread_count ? count_per_thread+1 : count_per_thread;
        // All workers accumulate into the same local `pi`.
        funcInputs[i].pi = &pi;
        int rc = pthread_create(&threads[i], nullptr, threadFunc, (void *) &funcInputs[i]);
        if(rc)
            cerr << "error in thread creation!\n";
    }

    for(int i=0; i<thread_count; i++){
        int rc = pthread_join(threads[i], nullptr);
        if(rc)
            cerr << "Error in thread join!\n";
    }

    pthread_mutex_destroy(&myMutex);
    delete [] funcInputs;
    delete [] threads;
    return 4*pi/n;
}

// Usage:
//   mc               serial and parallel runs with n = 100000000 samples
//   mc N             serial and parallel runs with N samples
//   mc N THREADS     parallel run only, with N samples on THREADS threads
int main(int argc, char* argv[])
{
    srand(time(nullptr));
    long n = 100000000;
    auto start = get_time::now();

    if (argc > 1){
        n = atol(argv[1]);
        if (argc == 3){
            // Parallel-only mode: the elapsed time is measured from `start`,
            // which was taken before the arguments were parsed.
            thread_count = atoi(argv[2]);
            cout << "pi(parallel) = " << compute_pi_parallel(n) << endl;
            auto stop = get_time::now();
            auto diff = stop - start;
            cout<<"Elapsed time is : "<< chrono::duration_cast<ns> (diff).count()/1e9<<" s "<<endl;
            return 0;
        }
    }

    cout << "pi = " << compute_pi(n) << endl;
    auto stop_s = get_time::now();
    auto diff_s = stop_s - start;

    cout << "pi(parallel) = " << compute_pi_parallel(n) << endl;
    auto stop_p = get_time::now();
    auto diff = stop_p - stop_s;

    cout<<"Elapsed time for serial is : "<< chrono::duration_cast<ns> (diff_s).count()/1e9<<" s "<<endl;
    cout<<"Number of threads: "<< thread_count<< endl;
    cout<<"Elapsed time for parallel is : "<< chrono::duration_cast<ns> (diff).count()/1e9<<" s "<<endl;
    return 0;
}
output on Windows:
pi = 3.14146 pi(parallel) = 3.14087 Elapsed time for serial is : 6.16426 s Number of threads: 4 Elapsed time for parallel is : 1.0659 s
on Linux:$g++ -std=c++11 -g -Wall -o mc mc.cpp -lpthread
output:
pi = 3.14138 pi(parallel) = 3.14166 Elapsed time for serial is : 3.10837 s Number of threads: 4 Elapsed time for parallel is : 19.8226 s
I checked the number of CPUs on Linux with $lscpu and monitored the CPU usage with $top; it seemed that Linux was using all of the available cores, but the parallel code was still slower than the serial code. I am running the program on Ubuntu 16.04 LTS in a virtual machine hosted on Windows.
I am wondering if there is something that I am doing wrong on Linux.
Advertisement
Answer
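You are using rand. Whether rand is thread-safe is implementation-defined: it keeps a single global state, and an implementation may simply guard that state with a mutex, in which case the threads spend their time waiting on each other instead of running in parallel. Use a modern C++ random number generator (from <random>) that does not have global state, and give each thread its own engine.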