I’m trying to find the string containing the substring in a text file by starting at the end. The file has tens of millions of lines. (The requirement is to read from End of the File. I cannot use sed/awk/grep etc)
The below program does the job but it takes a long time. How can I make it run faster?
#include <stdio.h>
#include <stdlib.h>
char *strrev(char *str)
{
char *p1, *p2;
if (! str || ! *str)
return str;
for (p1 = str, p2 = str + strlen(str) - 1; p2 > p1; ++p1, --p2)
{
*p1 ^= *p2;
*p2 ^= *p1;
*p1 ^= *p2;
}
return str;
}
int main(int argc, char *argv[])
{
FILE *fd;
char len[100];
int ch,count;
int fpos=0;
char *filename = argv[1];
fd = fopen(filename, "r");
fseek(fd, 0, SEEK_END);
fpos = ftell(fd);
while (fpos > 0)
{
fseek(fd, -2, SEEK_CUR);
if(ftell(fd) <= 2)
break;
ch = fgetc(fd);
fpos--;
count = 0;
memset( len, 0, sizeof(len));
while(ch != 'n')
{
len[count++] = ch;
if(ftell(fd) < 2)
break;
fseek(fd, -2, SEEK_CUR);
ch =fgetc(fd);
}
strrev(len);
if( strstr(len,"TEST") !=NULL )
{
printf("Found test. n");
break;
}
//if(found)
//do other stuff
}
fclose(fd);
}
Advertisement
Answer
As noted by commentators, doing IO one character at a time (+ fseek + ftell) is very expensive. Remember that each time the code fgets the first character, the C library will ‘read ahead’ – reading BUFSIZ characters.
As an alternative, consider ‘read before’ logic, based on ideas provided above.
- find out length of pattern
pl - Allocate a buffer size of
bs - Read blocks of bs backward. Each block will overlap with previous block for
plcharacters, eliminating the possibility that a match will occur in between blocks. - Assuming bs much larger the pl, cost of overlap reads is minimal, and performance close to optimal.
Error checks reduced to keep code clear.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#define BS 100000
int main(int argc, char **argv)
{
FILE *fp = fopen(argv[1], "r");
if ( !fp ) { perror("fopen") ; exit (1) ; };
fseek(fp, 0, SEEK_END) ;
long pos = ftell(fp) ;
fprintf(stderr, "S=%ldn", pos) ;
const char *pattern = argv[2] ;
const int PL = strlen(pattern) ;
// Read backward
char buff[BS+1] ;
long match = -1;
while ( pos > 0 ) {
pos = pos - (BS-PL) ;
if ( pos < 0 ) pos = 0 ;
fseek(fp, pos, SEEK_SET) ;
int n = fread(buff, sizeof(buff[0]), BS, fp) ;
if ( n > 0 ) {
buff[n] = 0 ;
char *loc = strstr(buff, pattern) ;
if ( loc ) ) { match = pos + (loc-buff) ; break ;} ;
} ;
} ;
fclose(fp) ;
printf("MATCH=%ldn", match) ;
}
Note that solution will only handle TEXT files. The ‘strstr’ will NOT work on data loaded from binary file which may contain NUL characters.
Updated 2019-11-12 to correct file position calculation when match is in first block.