I wrote three different programs to copy data from one 4 GB buffer to another, and measured their bandwidth and cache misses with `perf stat`. The code is shown below:
```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <assert.h>

#define GB (1UL << 30)
#define MB (1UL << 20)
#define BUF (4 * GB)
#define TIMES (10)
#define CACHELINE 64

int main(int argc, char **argv)
{
    int flag = atoi(argv[1]);
    int memcpy_sz = 0;
    if (argc > 2)
        memcpy_sz = atoi(argv[2]);
    char *a = (char *)aligned_alloc(64, BUF);
    char *b = (char *)aligned_alloc(64, BUF);
    memset(a, 1, BUF);
    memset(b, 20, BUF);
    unsigned long i = 0, j;
    struct timespec before, after;
    clock_gettime(CLOCK_MONOTONIC, &before);
    if (flag == 1) { // memcpy
        for (j = 0; j < TIMES; j++) {
            size_t *ap = (size_t *)a;
            size_t *bp = (size_t *)b;
            for (i = 0; i < BUF; i += memcpy_sz) {
                memcpy(a + (i % BUF), b + (i % BUF), memcpy_sz);
            }
        }
    } else if (flag == 2) { // 8-byte loop
        for (j = 0; j < TIMES; j++) {
            size_t *ap = (size_t *)a;
            size_t *bp = (size_t *)b;
            for (i = 0; i < BUF / CACHELINE; i++) {
                ap[i] = bp[i];
            }
        }
    } else { // 8x8-byte loop
        size_t xlen = BUF / CACHELINE;
        for (j = 0; j < TIMES; j++) {
            size_t *ap = (size_t *)a;
            size_t *bp = (size_t *)b;
            while (xlen > 0) {
                ap[0] = bp[0];
                ap[1] = bp[1];
                ap[2] = bp[2];
                ap[3] = bp[3];
                ap[4] = bp[4];
                ap[5] = bp[5];
                ap[6] = bp[6];
                ap[7] = bp[7];
                ap += 8;
                bp += 8;
                xlen -= 1;
            }
        }
    }
    clock_gettime(CLOCK_MONOTONIC, &after);
    double elapse = (after.tv_sec - before.tv_sec) + (after.tv_nsec - before.tv_nsec) / 1e9;
    printf("time = %f s , bw = %.2f GB/s\n", elapse, (1.0) / (elapse / TIMES));
    return 0;
}
```
I compiled it with `gcc memcpy-test.c -o memcpy-test`.
The first one uses `memcpy` to copy `memcpy_sz` bytes at a time. I tested it with `memcpy_sz` values of 8 B, 64 B, 4 KB, 512 KB, 1 MB, 2 MB, and 1 GB.
The results are shown below:
```
> perf stat -e cache-misses ./memcpy-test 1 8
time = 14.375883 s , bw = 0.70 GB/s
 1,406,999,198 cache-misses

> perf stat -e cache-misses ./memcpy-test 1 64
time = 8.589959 s , bw = 1.16 GB/s
 1,387,928,255 cache-misses

> perf stat -e cache-misses ./memcpy-test 1 4096
time = 8.663083 s , bw = 1.15 GB/s
 1,386,287,189 cache-misses

> perf stat -e cache-misses ./memcpy-test 1 524288      (512 KB)
time = 7.808415 s , bw = 1.28 GB/s
 793,006,333 cache-misses

> perf stat -e cache-misses ./memcpy-test 1 1048576     (1 MB)
time = 7.787532 s , bw = 1.28 GB/s
 804,767,063 cache-misses

> perf stat -e cache-misses ./memcpy-test 1 2097152     (2 MB)
time = 7.830964 s , bw = 1.28 GB/s
 795,495,638 cache-misses

> perf stat -e cache-misses ./memcpy-test 1 1073741824  (1 GB)
time = 8.310286 s , bw = 1.20 GB/s
 714,601,617 cache-misses
```
The second one copies the data 8 bytes at a time. It is much faster than using `memcpy`:
```
> perf stat -e cache-misses ./memcpy-test 2
time = 1.548499 s , bw = 6.46 GB/s
 191,120,848 cache-misses
```
The third one copies 64 bytes per iteration, a technique I learned from the glibc memset implementation, and it is the fastest:
```
> perf stat -e cache-misses ./memcpy-test 3
time = 0.898856 s , bw = 11.13 GB/s
 155,513,939 cache-misses
```
I am very confused by this. I have two main questions:

- As the results show, copying 64 bytes at a time gives the best performance, which is not what I expected. I thought 8-byte copies would perform the same as 64-byte copies, because the CPU loads data at 64-byte (cache-line) granularity even for an 8-byte access.
- Why is `memcpy` so slow, and why does it incur so many more cache misses? I read the glibc `memcpy` source and found that it calls `__vm_copy` for large copies, but I can't find any details about `__vm_copy`.
PS: My CPU is an Intel(R) Xeon(R) Gold 5220 @ 2.20 GHz, and the memory is 32 GB of Samsung DDR4 at 2933 MT/s.
Answer
It is likely slower to use memcpy because the size of the chunk you're copying isn't known at compile time. GCC will usually optimize calls to memcpy with known sizes into an appropriate type of copy: it can pick whatever size and algorithm it thinks is optimal, and it will adjust that choice based on the optimization flags you specify.
For example, when copying object IDs, Git knows that they must be either 20 bytes (SHA-1) or 32 bytes (SHA-256) in size, and it calls memcpy with one of those two constant sizes.
If I slightly modify your code (and reformat it), option 3 is the fastest on my system, since it uses 128-byte chunks with memcpy. Options 2 and 4 perform similarly.
Note that in this case, I've made the exit code depend on the value of the copied data, because otherwise the compiler would optimize out my memcpy calls after determining they're useless. I've also added an option that copies the full buffer on each iteration, which is slower. The other options copy the same portion of the buffer over and over, so much of the data stays in the CPU's cache and their performance looks much better.
In general, unless you really know you need substantial memcpy performance, you should just use memcpy itself. If your data has a known size, do indeed use that, since that can help performance. However, in most cases, memcpy isn’t the bottleneck in your code, and therefore you shouldn’t optimize it until you’ve measured your code and determined that it’s the performance problem.
```c
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define GB (1UL << 30)
#define MB (1UL << 20)
#define BUF (1 * GB)
#define TIMES (2)
#define CACHELINE 64

int main(int argc, char **argv)
{
    assert(argc > 1);
    int flag = atoi(argv[1]);
    int memcpy_sz = 0;
    if (argc > 2)
        memcpy_sz = atoi(argv[2]);
    char *a = (char *)aligned_alloc(64, BUF);
    char *b = (char *)aligned_alloc(64, BUF);
    memset(a, 1, BUF);
    memset(b, 20, BUF);
    unsigned long i = 0, j;
    struct timespec before, after;
    clock_gettime(CLOCK_MONOTONIC, &before);
    if (flag == 1) { // memcpy
        for (j = 0; j < TIMES; j++) {
            size_t *ap = (size_t *)a;
            size_t *bp = (size_t *)b;
            for (i = 0; i < BUF; i += memcpy_sz) {
                memcpy(a + i, b + i, memcpy_sz);
            }
        }
    } else if (flag == 2) {
        for (j = 0; j < TIMES; j++) {
            size_t *ap = (size_t *)a;
            size_t *bp = (size_t *)b;
            for (i = 0; i < BUF / sizeof(size_t); i++) {
                ap[i] = bp[i];
            }
        }
    } else if (flag == 3) {
        for (j = 0; j < TIMES; j++) {
            for (i = 0; i < BUF / 128; i++) {
                memcpy(a + (i % BUF), b + (i % BUF), 128);
            }
        }
    } else if (flag == 4) {
        for (j = 0; j < TIMES; j++) {
            for (i = 0; i < BUF / 64; i++) {
                memcpy(a + (i % BUF), b + (i % BUF), 64);
            }
        }
    } else if (flag == 5) {
        for (j = 0; j < TIMES; j++) {
            memcpy(a, b, BUF);
        }
    } else {
        size_t xlen = BUF / CACHELINE;
        for (j = 0; j < TIMES; j++) {
            size_t *ap = (size_t *)a;
            size_t *bp = (size_t *)b;
            while (xlen > 0) {
                ap[0] = bp[0];
                ap[1] = bp[1];
                ap[2] = bp[2];
                ap[3] = bp[3];
                ap[4] = bp[4];
                ap[5] = bp[5];
                ap[6] = bp[6];
                ap[7] = bp[7];
                ap += 8;
                bp += 8;
                xlen -= 1;
            }
        }
    }
    clock_gettime(CLOCK_MONOTONIC, &after);
    double elapse = (after.tv_sec - before.tv_sec) + (after.tv_nsec - before.tv_nsec) / 1e9;
    printf("time = %f s , bw = %.2f GB/s\n", elapse, (1.0) / (elapse / TIMES));
    return a[10] == 20;
}
```