How can I strip the HTML markup — everything between and including the <…> tags — from an HTML document using C? My current program uses curl to fetch the contents of a web page and write them to a text file; it then reads the text file back and removes the < and > characters, but I am unsure how to remove everything between those tags.
/*
 * Download WEBPAGE_URL into DESTINATION_FILE with libcurl, then read the
 * beginning of that file back and print it twice: once unchanged and once
 * with every '<' and '>' replaced by a space.
 */
#include <curl/curl.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>

#define WEBPAGE_URL "http://homepages.paradise.net.nz/adrianfu/index.html"
#define DESTINATION_FILE "/home/user/data.txt"

/*
 * libcurl write callback: appends the received chunk to the FILE * that was
 * registered with CURLOPT_WRITEDATA.
 *
 * Returns fwrite()'s item count. libcurl documents `size` as always 1, so
 * this equals the number of bytes consumed, which is what libcurl compares
 * against to detect a short write.
 */
size_t write_data(void *ptr, size_t size, size_t nmeb, void *stream)
{
    return fwrite(ptr, size, nmeb, stream);
}

/*
 * Returns 0 on success and 1 if the file cannot be created, the download
 * fails, or the saved page cannot be read back.
 */
int main(void)
{
    FILE *file = fopen(DESTINATION_FILE, "w+");
    if (file == NULL) {
        fputs("File error", stderr);
        exit(1);
    }

    CURL *handle = curl_easy_init();
    if (handle == NULL) {
        fputs("curl_easy_init failed\n", stderr);
        fclose(file);
        return 1;
    }
    curl_easy_setopt(handle, CURLOPT_URL, WEBPAGE_URL); /* Using the http protocol */
    curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, write_data);
    curl_easy_setopt(handle, CURLOPT_WRITEDATA, file);
    CURLcode rc = curl_easy_perform(handle);
    curl_easy_cleanup(handle);

    /*
     * Close the stream before reading the file through a separate descriptor:
     * fclose() flushes stdio's buffer, otherwise the tail of the download
     * might not be on disk yet.
     */
    if (fclose(file) != 0) {
        fputs("File error", stderr);
        return 1;
    }
    if (rc != CURLE_OK) {
        fprintf(stderr, "Download failed: %s\n", curl_easy_strerror(rc));
        return 1;
    }

    /* Read back the same file the download was written to. */
    int fd = open(DESTINATION_FILE, O_RDONLY);
    if (fd == -1) {
        fputs("Cannot open the file", stderr);
        return 1;
    }

    /* Only the first sizeof buf bytes of the page are processed. */
    char buf[1024];
    ssize_t nRead = read(fd, buf, sizeof buf);
    close(fd);
    if (nRead < 0) {
        fputs("Cannot read the file", stderr);
        return 1;
    }

    printf("Original String ");
    for (ssize_t i = 0; i < nRead; i++) {
        printf("%c", buf[i]);
    }

    printf("\nReplaced String ");
    for (ssize_t i = 0; i < nRead; i++) {
        if (buf[i] == '<' || buf[i] == '>') {
            buf[i] = ' ';
        }
        printf("%c", buf[i]);
    }

    return 0;
}
Advertisement
Answer
Here is just the code that removes everything between and including the '<' and '>' of each tag (assuming that you are dealing with well-formed HTML, meaning that you don't have one tag nested inside the declaration of another, like <html < body> >
). I am changing only a small portion of your code. I will also remove the tags from the buf
variable itself, instead of replacing the undesired characters with spaces, because I think this will be more useful to you (correct me if I am wrong).
int idx = 0; int opened = 0; // false for(i=0; i<nRead; i++) { if(buf[i]=='<') { opened = 1; // true } else if (buf[i] == '>') { opened = 0; // false } else if (!opened) { buf[idx++] = buf[i]; } } buf[idx] = ''; printf("%sn", buf);