(TL;DR) On NVME SSDs (Intel p3600 as well as Avant), I am seeing decrease in the IOPS if I issue random reads over a small subset of the disk instead of the entire disk.
While reading the same offset over and over, the IOPS are about 36-40K for 4k blocksize. The IOPS gradually increase as I grow the region over which random reads are being issued. The program (seen below) uses asynchronous IO on Linux to submit the read requests.
Disk range (in 4k blocks) vs. IOPS:
  0 -> 38833, 1 -> 68596, 10 -> 76100, 30 -> 80381, 40 -> 113647,
  50 -> 148205, 100 -> 170374, 200 -> 239798, 400 -> 270197, 800 -> 334767
OS : Linux 4.2.0-35-generic
SSD : Intel P3600 NVME Flash
What could be causing this behavior?
The program can be run as follows
$ for i in 0 1 10 30 40 50 100 200 400 800; do <program_name> /dev/nvme0n1 10 $i; done
and verify whether you also observe the increasing pattern of IOPS shown above.
/** * $ g++ <progname.cpp> -o progname -std=c++11 -lpthread -laio -O3 * $ progname /dev/nvme0n1 10 100 */ #include <random> #include <libaio.h> #include <stdlib.h>//malloc, exit #include <future> //async #include <unistd.h> //usleep #include <iostream> #include <sys/time.h> // gettimeofday #include <vector> #include <fcntl.h> // open #include <errno.h> #include <sys/types.h> // open #include <sys/stat.h> // open #include <cassert> #include <semaphore.h> io_context_t ioctx; std::vector<char*> buffers; int fd = -1; sem_t sem; constexpr int numPerRound = 20; constexpr int numRounds = 100000; constexpr int MAXEVENT = 10; constexpr size_t BLKSIZE = 4096; constexpr int QDEPTH = 200; off_t startBlock = 0; off_t numBlocks = 100; const int numSubmitted = numRounds * numPerRound; void DoGet() { io_event eventsArray[MAXEVENT]; int numCompleted = 0; while (numCompleted != numSubmitted) { bzero(eventsArray, MAXEVENT * sizeof(io_event)); int numEvents; do { numEvents = io_getevents(ioctx, 1, MAXEVENT, eventsArray, nullptr); } while (numEvents == -EINTR); for (int i = 0; i < numEvents; i++) { io_event* ev = &eventsArray[i]; iocb* cb = (iocb*)(ev->data); assert(ev->res2 == 0); assert(ev->res == BLKSIZE); sem_post(&sem); // free ioctx } numCompleted += numEvents; } std::cout << "completed=" << numCompleted << std::endl; } int main(int argc, char* argv[]) { if (argc == 1) { std::cout << "usage <nvme_device_name> <start_4k_block> <num_4k_blocks>" << std::endl; exit(1); } char* deviceName = argv[1]; startBlock = atoll(argv[2]); numBlocks = atoll(argv[3]); int ret = 0; ret = io_queue_init(QDEPTH, &ioctx); assert(ret == 0); ret = sem_init(&sem, 0, QDEPTH); assert(ret == 0); auto DoGetFut = std::async(std::launch::async, DoGet); // preallocate buffers for (int i = 0; i < QDEPTH; i++) { char* buf ; ret = posix_memalign((void**)&buf, 4096, BLKSIZE); assert(ret == 0); buffers.push_back(buf); } fd = open("/dev/nvme0n1", O_DIRECT | O_RDONLY); assert(fd >= 0); off_t offset = 0; struct timeval 
start; gettimeofday(&start, 0); std::mt19937 generator (getpid()); // generate random offsets within [startBlock, startBlock + numBlocks] std::uniform_int_distribution<off_t> offsetgen(startBlock, startBlock + numBlocks); for (int j = 0; j < numRounds; j++) { iocb mycb[numPerRound]; iocb* posted[numPerRound]; bzero(mycb, sizeof(iocb) * numPerRound); for (int i = 0; i < numPerRound; i++) { // same buffer may get used in 2 different async read // thats ok - not validating content in this program char* iobuf = buffers[i]; iocb* cb = &mycb[i]; offset = offsetgen(generator) * BLKSIZE; io_prep_pread(cb, fd, iobuf, BLKSIZE, offset); cb->data = iobuf; posted[i] = cb; sem_wait(&sem); // wait for ioctx to be free } int ret = 0; do { ret = io_submit(ioctx, numPerRound, posted); } while (ret == -EINTR); assert(ret == numPerRound); } DoGetFut.wait(); struct timeval end; gettimeofday(&end, 0); uint64_t diff = ((end.tv_sec - start.tv_sec) * 1000000) + (end.tv_usec - start.tv_usec); io_queue_release(ioctx); std::cout << "ops=" << numRounds * numPerRound << " iops=" << (numRounds * numPerRound *(uint64_t)1000000)/diff << " region-size=" << (numBlocks * BLKSIZE) << std::endl; }
Advertisement
Answer
This is almost certainly due to the internal structure of the drive's memory. Internally the drive is built from many flash chips and may have multiple internal memory buses. If you issue requests across a small range, all of the requests resolve to a single chip (or a few chips) and must be queued behind one another. If you access the whole device, the requests are spread across many internal chips and buses and can run in parallel, which yields higher throughput.