Linux Question

I’m stumped. This script works great in Eclipse on my Windows machine if I hard-code the file paths. If I instead take the paths as command-line arguments and run it on my edge node (a Linux box), it throws no errors but just leaves an empty output file. I must be missing something stupid, but I am not seeing it. Does anyone have any idea what’s going on?

package com.trv.cbia.de.tika;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;

import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;


/**
 * Extracts the body text and a few metadata fields from a document with Apache Tika.
 *
 * <p>Command-line usage: {@code DocParser <input-file> <output-file>}. The extracted text is
 * written to the output file as UTF-8.
 */
public class DocParser {

    /** Matches a single newline, carriage return or tab; each match is replaced by one space. */
    private static final String LINE_BREAKS_AND_TABS = "[\\n\\r\\t]";

    /**
     * Parses the document at {@code path} and collects its text and metadata.
     *
     * <p>Parse failures are reported on standard output and are not rethrown; in that case the
     * returned map is empty.
     *
     * @param path location of the document to parse
     * @return a map holding {@code "text"} (body text with newlines, carriage returns and tabs
     *     replaced by spaces), {@code "title"} and {@code "pageCount"} (whatever
     *     {@code Metadata.get} returns for those properties); empty if parsing failed
     */
    public Map<String, Object> processRecord(String path) {
        Map<String, Object> map = new HashMap<String, Object>();
        // try-with-resources closes the input stream whether or not parsing succeeds.
        try (InputStream inputstream = new FileInputStream(new File(path))) {
            BodyContentHandler handler = new BodyContentHandler();
            Metadata metadata = new Metadata();
            ParseContext pcontext = new ParseContext();
            Parser parser = new AutoDetectParser();
            parser.parse(inputstream, handler, metadata, pcontext);
            // The escapes matter: the unescaped pattern "n|r|t" would blank out the
            // letters n, r and t instead of the whitespace control characters.
            map.put("text", handler.toString().replaceAll(LINE_BREAKS_AND_TABS, " "));
            map.put("title", metadata.get(TikaCoreProperties.TITLE));
            map.put("pageCount", metadata.get("xmpTPg:NPages"));
        } catch (IOException ex) {
            System.out.println("Caught IOException:" + ex.getMessage());
        } catch (TikaException tx) {
            System.out.println("Caught TikaException: " + tx.getMessage());
        } catch (SAXException sx) {
            System.out.println("Caught SAXException: " + sx.getMessage());
        }
        return map;
    }

    /**
     * Extracts the text of the file named by {@code args[0]} and writes it to {@code args[1]}.
     *
     * @param args input document path followed by output file path
     */
    public static void main(String args[]) {
        if (args.length < 2) {
            System.err.println("Usage: DocParser <input-file> <output-file>");
            System.exit(1);
        }
        String file = args[0];
        String out = args[1];

        DocParser textExtract = new DocParser();
        Map<String, Object> extractedMap = textExtract.processRecord(file);
        Object text = extractedMap.get("text");

        if (text == null) {
            // processRecord already printed the exception that caused this.
            System.err.println("No text was extracted from " + file);
        } else if (text.toString().trim().isEmpty()) {
            // An empty result without any exception is the symptom seen when only part of
            // Tika is on the classpath, so point the user at that.
            System.err.println("Warning: extracted text for " + file + " is empty."
                    + " Check that the Tika parser implementations"
                    + " (e.g. tika-app-1.13.jar) are on the classpath.");
        }

        // try-with-resources flushes and closes the writer on every path.
        try (PrintWriter writer = new PrintWriter(out, "UTF-8")) {
            // Avoid writing the literal string "null" when nothing was extracted.
            writer.println(text == null ? "" : text);
            // PrintWriter swallows I/O errors; checkError() flushes and reports them.
            if (writer.checkError()) {
                System.err.println("Failed to write output file " + out);
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
    }
}

Java
​x
 
package com.trv.cbia.de.tika;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;

import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;

/**
 * Extracts the body text and a few metadata fields from a document with Apache Tika.
 *
 * <p>Command-line usage: {@code DocParser <input-file> <output-file>}. The extracted text is
 * written to the output file as UTF-8.
 */
public class DocParser {

    /** Matches a single newline, carriage return or tab; each match is replaced by one space. */
    private static final String LINE_BREAKS_AND_TABS = "[\\n\\r\\t]";

    /**
     * Parses the document at {@code path} and collects its text and metadata.
     *
     * <p>Parse failures are reported on standard output and are not rethrown; in that case the
     * returned map is empty.
     *
     * @param path location of the document to parse
     * @return a map holding {@code "text"} (body text with newlines, carriage returns and tabs
     *     replaced by spaces), {@code "title"} and {@code "pageCount"} (whatever
     *     {@code Metadata.get} returns for those properties); empty if parsing failed
     */
    public Map<String, Object> processRecord(String path) {
        Map<String, Object> map = new HashMap<String, Object>();
        // try-with-resources closes the input stream whether or not parsing succeeds.
        try (InputStream inputstream = new FileInputStream(new File(path))) {
            BodyContentHandler handler = new BodyContentHandler();
            Metadata metadata = new Metadata();
            ParseContext pcontext = new ParseContext();
            Parser parser = new AutoDetectParser();
            parser.parse(inputstream, handler, metadata, pcontext);
            // The escapes matter: the unescaped pattern "n|r|t" would blank out the
            // letters n, r and t instead of the whitespace control characters.
            map.put("text", handler.toString().replaceAll(LINE_BREAKS_AND_TABS, " "));
            map.put("title", metadata.get(TikaCoreProperties.TITLE));
            map.put("pageCount", metadata.get("xmpTPg:NPages"));
        } catch (IOException ex) {
            System.out.println("Caught IOException:" + ex.getMessage());
        } catch (TikaException tx) {
            System.out.println("Caught TikaException: " + tx.getMessage());
        } catch (SAXException sx) {
            System.out.println("Caught SAXException: " + sx.getMessage());
        }
        return map;
    }

    /**
     * Extracts the text of the file named by {@code args[0]} and writes it to {@code args[1]}.
     *
     * @param args input document path followed by output file path
     */
    public static void main(String args[]) {
        if (args.length < 2) {
            System.err.println("Usage: DocParser <input-file> <output-file>");
            System.exit(1);
        }
        String file = args[0];
        String out = args[1];

        DocParser textExtract = new DocParser();
        Map<String, Object> extractedMap = textExtract.processRecord(file);
        Object text = extractedMap.get("text");

        if (text == null) {
            // processRecord already printed the exception that caused this.
            System.err.println("No text was extracted from " + file);
        } else if (text.toString().trim().isEmpty()) {
            // An empty result without any exception is the symptom seen when only part of
            // Tika is on the classpath, so point the user at that.
            System.err.println("Warning: extracted text for " + file + " is empty."
                    + " Check that the Tika parser implementations"
                    + " (e.g. tika-app-1.13.jar) are on the classpath.");
        }

        // try-with-resources flushes and closes the writer on every path.
        try (PrintWriter writer = new PrintWriter(out, "UTF-8")) {
            // Avoid writing the literal string "null" when nothing was extracted.
            writer.println(text == null ? "" : text);
            // PrintWriter swallows I/O errors; checkError() flushes and reports them.
            if (writer.checkError()) {
                System.err.println("Failed to write output file " + out);
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
    }
}

Answer

It ended up being that I needed to add tika-app-1.13.jar to my classpath. Tika never reported any classpath errors, so I had to dig through a bunch of the Apache mailing list archives to find someone with a similar problem. Posting the solution here in case anyone else runs across it.

Apache Tika – PrintWriter works on local Windows machine but not Linux box

Advertisement

Answer