PDFBox

Java PDFBox – Read & Crawl PDF Text Content

Java PDFBox – Read & Crawl PDF Text Content

   /lib/PDFBoxHelper.java
 
   package lib;
 
   import java.io.File;
   import java.io.FileNotFoundException;
   import java.io.IOException;
 
   import org.apache.pdfbox.cos.COSDocument;
   import org.apache.pdfbox.io.RandomAccessFile;
   import org.apache.pdfbox.pdfparser.PDFParser;
   import org.apache.pdfbox.pdmodel.PDDocument;
   import org.apache.pdfbox.text.PDFTextStripper;
 
   public class PDFBoxHelper
   {
      private File file;
      private PDFParser parser;
      private PDFTextStripper stripper;
      private PDDocument pdoc;
      private COSDocument cdoc;
 
      public PDFBoxHelper(String FileName)
      {
         this.initialize(FileName);
      }
 
      private void initialize(String FileName)
      {
         try
         {
            this.file = new File(FileName);
            this.parser = new PDFParser(new RandomAccessFile(file, "r"));
            this.parser.parse();
            this.cdoc = parser.getDocument();
            this.stripper = new PDFTextStripper();
            this.pdoc = new PDDocument(cdoc);
         } catch (IOException e) {
            e.printStackTrace();
         }
      }
 
      public int getHashCode()
      {
         return this.parser.hashCode();
      }
 
      public int getNumberOfPages()
      {
         return this.pdoc.getNumberOfPages();
      }
 
      public String getFileAllText()
      {
         StringBuilder text = new StringBuilder();
 
         try
         {
            for(int i = 0; i < this.getNumberOfPages(); i++)
            {
               stripper.setStartPage(i);
               stripper.setEndPage(i+1);
               text.append(stripper.getText(pdoc));
               //System.out.println(text);
            }
 
            return text.toString();
 
         } catch (IOException e) {
            e.printStackTrace();
            return "";
         }
      }
 
      public void Dispose()
      {
         try
         {
            this.pdoc.close();
            this.cdoc.close();
         } catch (IOException e) {
            e.printStackTrace();
         }
      }
   }
 
   /Main.java
 
   import lib.PDFBoxHelper;
 
   public class Main
   {
      public static void main(String[] args)
      {
         PDFBoxHelper helper = new PDFBoxHelper("C:\\PDF\\Oracle Solaris 11- First Look.pdf");
 
         System.out.println(helper.getFileAllText());
         System.out.println(helper.getNumberOfPages());
         System.out.println(helper.getHashCode());
 
         helper.Dispose();
      }
   }