Skip to content
Advertisement

Searching a text file backwards from the end

I’m trying to find the string containing the substring in a text file by starting at the end. The file has tens of millions of lines. (The requirement is to read from End of the File. I cannot use sed/awk/grep etc)

The below program does the job but it takes a long time. How can I make it run faster?

#include <stdio.h>
#include <stdlib.h>

char *strrev(char *str)
{
      char *p1, *p2;

      if (! str || ! *str)
            return str;
      for (p1 = str, p2 = str + strlen(str) - 1; p2 > p1; ++p1, --p2)
      {
            *p1 ^= *p2;
            *p2 ^= *p1;
            *p1 ^= *p2;
      }
      return str;
}

int main(int argc, char *argv[])
{
        FILE *fd;
        char len[100];

        int ch,count;
        int fpos=0;

        char *filename = argv[1];

        fd = fopen(filename, "r");
        fseek(fd, 0, SEEK_END);
        fpos = ftell(fd);

        while (fpos > 0)
        {
                fseek(fd, -2, SEEK_CUR);
                if(ftell(fd) <= 2)
                        break;

                ch = fgetc(fd);
                fpos--;
                count = 0;
                memset( len, 0, sizeof(len));
                while(ch != 'n')
                {
                        len[count++] = ch;
                        if(ftell(fd) < 2)
                                break;
                        fseek(fd, -2, SEEK_CUR);
                        ch =fgetc(fd);
                }
                strrev(len);
                if( strstr(len,"TEST") !=NULL )
                {
                        printf("Found test. n");
                        break;
                }
                //if(found)
                //do other stuff
        } 
        fclose(fd);
}

Advertisement

Answer

As noted by commentators, doing IO one character at a time (+ fseek + ftell) is very expensive. Remember that each time the code fgets the first character, the C library will ‘read ahead’ – reading BUFSIZ characters.

As an alternative, consider ‘read before’ logic, based on ideas provided above.

  • find out length of pattern pl
  • Allocate a buffer size of bs
  • Read blocks of bs backward. Each block will overlap with previous block for pl characters, eliminating the possibility that a match will occur in between blocks.
  • Assuming bs much larger the pl, cost of overlap reads is minimal, and performance close to optimal.

Error checks reduced to keep code clear.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>

#define   BS 100000
int main(int argc, char **argv)
{
   FILE *fp = fopen(argv[1], "r");
   if ( !fp ) { perror("fopen") ; exit (1) ; };

   fseek(fp, 0, SEEK_END) ;
   long pos = ftell(fp) ;
   fprintf(stderr, "S=%ldn", pos) ;

   const char *pattern = argv[2] ;
   const int PL = strlen(pattern) ;

       // Read backward
   char buff[BS+1] ;
   long match = -1;
   while ( pos > 0 ) {
       pos = pos - (BS-PL) ;
       if ( pos < 0 ) pos = 0 ;
       fseek(fp, pos, SEEK_SET) ;
       int n = fread(buff, sizeof(buff[0]), BS, fp) ;
       if ( n > 0 ) {
           buff[n] = 0 ;
           char *loc = strstr(buff, pattern) ;
           if ( loc ) ) { match = pos + (loc-buff) ; break ;} ;
       } ;
   } ;
   fclose(fp) ;
   printf("MATCH=%ldn", match) ;
}

Note that solution will only handle TEXT files. The ‘strstr’ will NOT work on data loaded from binary file which may contain NUL characters.

Updated 2019-11-12 to correct file position calculation when match is in first block.

User contributions licensed under: CC BY-SA
5 People found this is helpful
Advertisement