Why does a single line of code added via the preprocessor cause such a large speed difference?
#include <stdio.h>
#include <time.h>
#include <set>
#include <vector>
#include <list>
#include <iostream>
#include <algorithm>
#include <functional>
#ifndef LINUX
#include <windows.h>
#else
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <sys/time.h>
#endif
using namespace std;
const int SIX_MILLION = 6000000;
const int ONE_MILLION = 1000000;
unsigned long long randint()
{
    return (unsigned long long)(rand() & 0xFFFF) << 16 | (unsigned long long)(rand() & 0xFFFF);
}
#ifdef LINUX
int time_substract(struct timeval *result, struct timeval *begin, struct timeval *end)
{
    if (begin->tv_sec > end->tv_sec) return -1;
    if ((begin->tv_sec == end->tv_sec) && (begin->tv_usec > end->tv_usec)) return -2;
    result->tv_sec = (end->tv_sec - begin->tv_sec);
    result->tv_usec = (end->tv_usec - begin->tv_usec);
    if (result->tv_usec < 0)
    {
        result->tv_sec--;
        result->tv_usec += ONE_MILLION;
    }
    return 0;
}
#endif
double time_it(function<void()> func, int times = 3)
{
#ifndef LINUX
    LARGE_INTEGER lpStart[1], lpEnd[1], lpFreq[1];
    vector<double> result(times);
    ::QueryPerformanceFrequency(lpFreq);
    for (int i = 0; i < times; ++i) {
        ::QueryPerformanceCounter(lpStart);
        func();
        ::QueryPerformanceCounter(lpEnd);
        result[i] = (lpEnd[0].QuadPart - lpStart[0].QuadPart) * ONE_MILLION / double(lpFreq[0].QuadPart);
    }
    nth_element(result.begin(), result.begin() + (times / 2), result.end());
    return result[times / 2];
#else
    struct timeval start, stop, diff;
    vector<double> result(times);
    memset(&start, 0, sizeof(struct timeval));
    memset(&stop, 0, sizeof(struct timeval));
    memset(&diff, 0, sizeof(struct timeval));
    for (int i = 0; i < times; ++i) {
        gettimeofday(&start, 0);
        func();
        gettimeofday(&stop, 0);
        time_substract(&diff, &start, &stop);
        result[i] = (diff.tv_sec * 1000000 + double(diff.tv_usec));
    }
    nth_element(result.begin(), result.begin() + (times / 2), result.end());
    return result[times / 2];
#endif
}
size_t prepare_data(set<unsigned long long> &data, unsigned int size, bool strict = false) {
    data.clear();
    for (unsigned int i = 0; i < size; i++) {
        data.insert(strict ? i * 3 : randint());
    }
    return data.size();
}
int main()
{
    srand((unsigned int)time(NULL));
    set<unsigned long long> a;
    set<unsigned long long> b;
    vector<unsigned long long> result(SIX_MILLION);
    double res;
#ifdef TEST
    prepare_data(a, SIX_MILLION);
#endif
    prepare_data(a, SIX_MILLION / 2, true);
    prepare_data(b, SIX_MILLION / 2);
    res = time_it([&a, &b, &result]() {
        auto iter = set_intersection(a.begin(), a.end(), b.begin(), b.end(), result.begin());
        result.resize(iter - result.begin());
    });
    cout << "duration: " << res << " microseconds,set a size: " <<
        a.size() << " set b size: " << b.size() << " set result size: " << result.size() << endl;
    return 0;
}
ubuntu@host:~/test_intersection$ g++ -std=c++11 -O3 -DLINUX main1.cpp -o main1
ubuntu@host:~/test_intersection$ ./main1
duration: 62080 microseconds,set a size: 2998917 set b size: 3000000 set result size: 2087
ubuntu@host:~/test_intersection$ g++ -std=c++11 -O3 -DLINUX -DTEST main1.cpp -o main1
ubuntu@host:~/test_intersection$ ./main1
duration: 362546 microseconds,set a size: 2998985 set b size: 3000000 set result size: 2149
Answer
I get the same result as you on my Ubuntu droplet hosted at DigitalOcean. It has fairly limited RAM. Prior to running the test, it had about 220MB free (output of /usr/bin/free -tm):
# free -tm
              total        used        free      shared  buff/cache   available
Mem:            493         153         220          14         119         298
Swap:             0           0           0
Total:          493         153         220
When I run the slow test, I can watch the available memory get completely soaked up.
# free -tm
              total        used        free      shared  buff/cache   available
Mem:            493         383          10          14          99          69
Swap:             0           0           0
Total:          493         383          10
Just in case the clear() method kept all that memory reserved internally, I tried instead:
data = std::move( std::set<unsigned long long>() );
But this made little difference.
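For reference, here is a minimal sketch of what I mean, wrapped in a helper (the release_set name is mine, purely for illustration); the swap form shown in the comment is the classic pre-C++11 equivalent of assigning an empty temporary:
#include <set>

// Illustration only: drop the set's entire node tree in one go instead of
// relying on clear() alone.
static void release_set(std::set<unsigned long long> &data)
{
    data = std::set<unsigned long long>();          // assign an empty temporary (the line I tried above)
    // std::set<unsigned long long>().swap(data);   // classic pre-C++11 equivalent
}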
So one of my original suspicions is that you have fragmented your memory by exhausting it with a data structure like std::set, which performs lots of allocations to build a tree and then frees them in an unspecified order (due to the arrangement of nodes in the tree).
To simulate this, I replaced the TEST section with code that performed a lot of allocations and then released them in a different order (by stepping over the list using a prime number stride).
#ifdef TEST
    //prepare_data(a, SIX_MILLION);
    {
        std::vector<void*> mem(SIX_MILLION);
        for (auto & val : mem) val = malloc(24);
        for (int i = 0, p = 0, step = 499739; i < SIX_MILLION; i++) {
            p = (p + step) % SIX_MILLION;
            free(mem[p]);
        }
    }
#endif
Allocations of 24 bytes were sufficient to stress the memory allocator on my system, producing results similar to those you describe. If I instead freed the values in a predictable order (walking through from first to last), performance was not affected in the same way.
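For comparison, a minimal sketch of that sequential-free control case, written as another drop-in replacement for the TEST block (my illustration, not the exact code I ran):
#ifdef TEST
    {
        // Same six million 24-byte allocations, but freed in allocation order;
        // this did not degrade the later set_intersection in the same way.
        std::vector<void*> mem(SIX_MILLION);
        for (auto & val : mem) val = malloc(24);
        for (int i = 0; i < SIX_MILLION; i++) free(mem[i]);
    }
#endif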
So I would say the final explanation is that you are a victim of memory fragmentation. You filled your memory with lots of small allocations and then freed them in an essentially random order. When you then built the new data sets, their tree nodes were scattered across the fragmented heap, giving poor cache locality. That had a measurably severe impact on performance when it came to calculating an expensive intersection of the two sets.
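If you want to make the measurement insensitive to this, one option (a sketch of an alternative approach, not something I benchmarked here) is to keep the sorted data in std::vector instead of std::set: each vector is a single contiguous allocation, so neither heap fragmentation nor node scatter affects the traversal.
#include <algorithm>
#include <iterator>
#include <vector>

// Sketch only: intersect two sorted, deduplicated vectors. Each std::vector is
// one contiguous buffer, so iteration stays cache-friendly no matter how
// fragmented the heap has become.
std::vector<unsigned long long> intersect_sorted(const std::vector<unsigned long long> &a,
                                                 const std::vector<unsigned long long> &b)
{
    std::vector<unsigned long long> out;
    out.reserve(std::min(a.size(), b.size()));
    std::set_intersection(a.begin(), a.end(), b.begin(), b.end(), std::back_inserter(out));
    return out;
}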