Why does adding a single line of code via a preprocessor macro cause such a large speed change?
#include <stdio.h>
#include <time.h>
#include <set>
#include <vector>
#include <list>
#include <iostream>
#include <algorithm>
#include <functional>

#ifndef LINUX
#include <windows.h>
#else
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <sys/time.h>
#endif

using namespace std;

const int SIX_MILLION = 6000000;
const int ONE_MILLION = 1000000;

unsigned long long randint()
{
    return (unsigned long long)(rand() & 0xFFFF) << 16 | (unsigned long long)(rand() & 0xFFFF);
}

#ifdef LINUX
int time_substract(struct timeval *result, struct timeval *begin, struct timeval *end)
{
    if (begin->tv_sec > end->tv_sec)
        return -1;
    if ((begin->tv_sec == end->tv_sec) && (begin->tv_usec > end->tv_usec))
        return -2;
    result->tv_sec = (end->tv_sec - begin->tv_sec);
    result->tv_usec = (end->tv_usec - begin->tv_usec);
    if (result->tv_usec < 0)
    {
        result->tv_sec--;
        result->tv_usec += ONE_MILLION;
    }
    return 0;
}
#endif

double time_it(function<void()> func, int times = 3)
{
#ifndef LINUX
    LARGE_INTEGER lpStart[1], lpEnd[1], lpFreq[1];
    vector<double> result(times);
    ::QueryPerformanceFrequency(lpFreq);
    for (int i = 0; i < times; ++i)
    {
        ::QueryPerformanceCounter(lpStart);
        func();
        ::QueryPerformanceCounter(lpEnd);
        result[i] = (lpEnd[0].QuadPart - lpStart[0].QuadPart) * ONE_MILLION / double(lpFreq[0].QuadPart);
    }
    nth_element(result.begin(), result.begin() + (times / 2), result.end());
    return result[times / 2];
#else
    struct timeval start, stop, diff;
    vector<double> result(times);
    memset(&start, 0, sizeof(struct timeval));
    memset(&stop, 0, sizeof(struct timeval));
    memset(&diff, 0, sizeof(struct timeval));
    for (int i = 0; i < times; ++i)
    {
        gettimeofday(&start, 0);
        func();
        gettimeofday(&stop, 0);
        time_substract(&diff, &start, &stop);
        result[i] = (diff.tv_sec * 1000000 + double(diff.tv_usec));
    }
    nth_element(result.begin(), result.begin() + (times / 2), result.end());
    return result[times / 2];
#endif
}

size_t prepare_data(set<unsigned long long> &data, unsigned int size, bool strict = false)
{
    data.clear();
    for (unsigned int i = 0; i < size; i++)
    {
        data.insert(strict ? i * 3 : randint());
    }
    return data.size();
}

int main()
{
    srand((unsigned int)time(NULL));
    set<unsigned long long> a;
    set<unsigned long long> b;
    vector<unsigned long long> result(SIX_MILLION);
    double res;

#ifdef TEST
    prepare_data(a, SIX_MILLION);
#endif
    prepare_data(a, SIX_MILLION / 2, true);
    prepare_data(b, SIX_MILLION / 2);
    res = time_it([&a, &b, &result]() {
        auto iter = set_intersection(a.begin(), a.end(), b.begin(), b.end(), result.begin());
        result.resize(iter - result.begin());
    });
    cout << "duration: " << res << " microseconds,set a size: " << a.size()
         << " set b size: " << b.size()
         << " set result size: " << result.size() << endl;
    return 0;
}
ubuntu@host:~/test_intersection$ g++ -std=c++11 -O3 -DLINUX main1.cpp -o main1
ubuntu@host:~/test_intersection$ ./main1
duration: 62080 microseconds,set a size: 2998917 set b size: 3000000 set result size: 2087
ubuntu@host:~/test_intersection$ g++ -std=c++11 -O3 -DLINUX -DTEST main1.cpp -o main1
ubuntu@host:~/test_intersection$ ./main1
duration: 362546 microseconds,set a size: 2998985 set b size: 3000000 set result size: 2149
Answer
I get the same result as you on my Ubuntu droplet hosted at DigitalOcean. It has fairly limited RAM. Prior to running the test, it had about 220MB free (output of /usr/bin/free -tm):
# free -tm
              total        used        free      shared  buff/cache   available
Mem:            493         153         220          14         119         298
Swap:             0           0           0
Total:          493         153         220
When I run the slow test, I can watch the available memory get completely soaked up.
# free -tm
              total        used        free      shared  buff/cache   available
Mem:            493         383          10          14          99          69
Swap:             0           0           0
Total:          493         383          10
Just in case the clear() method kept all that memory reserved internally, I tried instead:
data = std::move( std::set<unsigned long long>() );
But this made little difference.
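For reference, an equivalent idiom (my sketch, not something the original answer tried) is swapping the set with an empty temporary, which likewise guarantees the old tree is destroyed and its nodes handed back to the allocator:

// Swap-with-a-temporary idiom: the temporary steals the old tree and
// destroys it (returning every node to the allocator) at the end of
// the full expression.
std::set<unsigned long long>().swap(data);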
So one of my original suspicions is that you have fragmented your memory by exhausting it with a data structure like std::set, which performs lots of allocations to build a tree and then frees them in an unspecified order (due to the arrangement of nodes in the tree).
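To make that allocation pattern concrete, here is a minimal sketch of my own (not part of the original answer): plugging a C++11 minimal allocator into std::set shows roughly one heap allocation per inserted element, which is exactly the kind of churn that can shred a small heap.

#include <cstddef>
#include <iostream>
#include <set>

static std::size_t g_allocs = 0; // illustration only: counts heap requests

template <typename T>
struct CountingAlloc {
    using value_type = T;
    CountingAlloc() = default;
    template <typename U> CountingAlloc(const CountingAlloc<U>&) {}
    T* allocate(std::size_t n) {
        ++g_allocs;                                   // one call per tree node
        return static_cast<T*>(::operator new(n * sizeof(T)));
    }
    void deallocate(T* p, std::size_t) { ::operator delete(p); }
};
template <typename T, typename U>
bool operator==(const CountingAlloc<T>&, const CountingAlloc<U>&) { return true; }
template <typename T, typename U>
bool operator!=(const CountingAlloc<T>&, const CountingAlloc<U>&) { return false; }

int main() {
    std::set<unsigned long long, std::less<unsigned long long>,
             CountingAlloc<unsigned long long>> s;
    for (unsigned long long i = 0; i < 100000; ++i)
        s.insert(i * 3);
    // Expect allocations to be roughly equal to the element count: every
    // node is a separate heap request, so building and tearing down
    // millions of them stresses the allocator.
    std::cout << "elements: " << s.size()
              << ", allocations: " << g_allocs << std::endl;
    return 0;
}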
To simulate this, I replaced the TEST section with code that performed a lot of allocations and then released them in a different order (by stepping over the list using a prime-number stride).
#ifdef TEST
    //prepare_data(a, SIX_MILLION);
    {
        std::vector<void*> mem(SIX_MILLION);
        for (auto &val : mem)
            val = malloc(24);
        for (int i = 0, p = 0, step = 499739; i < SIX_MILLION; i++)
        {
            p = (p + step) % SIX_MILLION;
            free(mem[p]);
        }
    }
#endif
Allocations of 24 bytes were sufficient to stress the memory allocator on my system, leading to similar results to those you have described. I found that if I free the values in a more predictable order (i.e. walking through from first to last), this did not have the same effect on performance.
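A sketch of that "predictable order" variant (my reconstruction; the answer describes it but does not show the code): the strided free loop is simply replaced with a first-to-last walk.

#ifdef TEST
    {
        std::vector<void*> mem(SIX_MILLION);
        for (auto &val : mem)
            val = malloc(24);
        // Release the blocks in the same first-to-last order they were
        // allocated; this ordering did not reproduce the slowdown.
        for (int i = 0; i < SIX_MILLION; i++)
            free(mem[i]);
    }
#endif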
So I would say the final explanation for this is that you are a victim of memory fragmentation. You filled up your memory with lots of small allocations, and then freed them in a random order. You then built new data sets whose nodes ended up scattered across the fragmented heap, so they suffered from poor cache locality. This had a measurably severe impact on performance when it came to computing an expensive intersection of these two data sets.