(TL;DR) On NVME SSDs (Intel p3600 as well as Avant), I am seeing decrease in the IOPS if I issue random reads over a small subset of the disk instead of the entire disk.
While reading the same offset over and over, the IOPS are about 36-40K for 4k blocksize. The IOPS gradually increase as I grow the region over which random reads are being issued. The program (seen below) uses asynchronous IO on Linux to submit the read requests.
Disk range (in 4k blocks) vs. IOPS:
  0 -> 38833, 1 -> 68596, 10 -> 76100, 30 -> 80381, 40 -> 113647,
  50 -> 148205, 100 -> 170374, 200 -> 239798, 400 -> 270197, 800 -> 334767
OS : Linux 4.2.0-35-generic
SSD : Intel P3600 NVME Flash
What could be causing this behavior?
The program can be run as follows
$ for i in 0 1 10 30 40 50 100 200 400 800; do <program_name> /dev/nvme0n1 10 $i; done
and verify whether you also observe the increasing pattern of IOPS shown above.
/** * $ g++ <progname.cpp> -o progname -std=c++11 -lpthread -laio -O3 * $ progname /dev/nvme0n1 10 100 */ #include <random> #include <libaio.h> #include <stdlib.h>//malloc, exit #include <future> //async #include <unistd.h> //usleep #include <iostream> #include <sys/time.h> // gettimeofday #include <vector> #include <fcntl.h> // open #include <errno.h> #include <sys/types.h> // open #include <sys/stat.h> // open #include <cassert> #include <semaphore.h> io_context_t ioctx; std::vector<char*> buffers; int fd = -1; sem_t sem; constexpr int numPerRound = 20; constexpr int numRounds = 100000; constexpr int MAXEVENT = 10; constexpr size_t BLKSIZE = 4096; constexpr int QDEPTH = 200; off_t startBlock = 0; off_t numBlocks = 100; const int numSubmitted = numRounds * numPerRound; void DoGet() { io_event eventsArray[MAXEVENT]; int numCompleted = 0; while (numCompleted != numSubmitted) { bzero(eventsArray, MAXEVENT * sizeof(io_event)); int numEvents; do { numEvents = io_getevents(ioctx, 1, MAXEVENT, eventsArray, nullptr); } while (numEvents == -EINTR); for (int i = 0; i < numEvents; i++) { io_event* ev = &eventsArray[i]; iocb* cb = (iocb*)(ev->data); assert(ev->res2 == 0); assert(ev->res == BLKSIZE); sem_post(&sem); // free ioctx } numCompleted += numEvents; } std::cout << "completed=" << numCompleted << std::endl; } int main(int argc, char* argv[]) { if (argc == 1) { std::cout << "usage <nvme_device_name> <start_4k_block> <num_4k_blocks>" << std::endl; exit(1); } char* deviceName = argv[1]; startBlock = atoll(argv[2]); numBlocks = atoll(argv[3]); int ret = 0; ret = io_queue_init(QDEPTH, &ioctx); assert(ret == 0); ret = sem_init(&sem, 0, QDEPTH); assert(ret == 0); auto DoGetFut = std::async(std::launch::async, DoGet); // preallocate buffers for (int i = 0; i < QDEPTH; i++) { char* buf ; ret = posix_memalign((void**)&buf, 4096, BLKSIZE); assert(ret == 0); buffers.push_back(buf); } fd = open("/dev/nvme0n1", O_DIRECT | O_RDONLY); assert(fd >= 0); off_t offset = 0; struct timeval 
start; gettimeofday(&start, 0); std::mt19937 generator (getpid()); // generate random offsets within [startBlock, startBlock + numBlocks] std::uniform_int_distribution<off_t> offsetgen(startBlock, startBlock + numBlocks); for (int j = 0; j < numRounds; j++) { iocb mycb[numPerRound]; iocb* posted[numPerRound]; bzero(mycb, sizeof(iocb) * numPerRound); for (int i = 0; i < numPerRound; i++) { // same buffer may get used in 2 different async read // thats ok - not validating content in this program char* iobuf = buffers[i]; iocb* cb = &mycb[i]; offset = offsetgen(generator) * BLKSIZE; io_prep_pread(cb, fd, iobuf, BLKSIZE, offset); cb->data = iobuf; posted[i] = cb; sem_wait(&sem); // wait for ioctx to be free } int ret = 0; do { ret = io_submit(ioctx, numPerRound, posted); } while (ret == -EINTR); assert(ret == numPerRound); } DoGetFut.wait(); struct timeval end; gettimeofday(&end, 0); uint64_t diff = ((end.tv_sec - start.tv_sec) * 1000000) + (end.tv_usec - start.tv_usec); io_queue_release(ioctx); std::cout << "ops=" << numRounds * numPerRound << " iops=" << (numRounds * numPerRound *(uint64_t)1000000)/diff << " region-size=" << (numBlocks * BLKSIZE) << std::endl; }
Advertisement
Answer
This is almost certainly due to the internal structure of the drive's memory. Internally the drive is built from many flash chips and may have multiple internal memory buses. If you issue requests across a small range, all of the requests resolve to a single chip (or a few chips) and must be queued behind one another. If you access the whole device, the requests are spread across many internal chips and buses and can run in parallel, which yields higher throughput.