Java PDFBox – Read & Crawl PDF Text Content

| /lib/PDFBoxHelper.java |
|
package lib;

import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;

import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

/**
 * Small convenience wrapper around Apache PDFBox that parses one PDF file and
 * exposes its page count and its plain-text content.
 *
 * <p>Loading is best-effort: if the file cannot be opened or parsed, the
 * {@link IOException} is printed and the helper stays usable in an "empty"
 * state ({@link #getNumberOfPages()} returns {@code 0},
 * {@link #getFileAllText()} returns {@code ""}).
 *
 * <p>Call {@link #Dispose()} when finished to release the underlying file.
 */
public class PDFBoxHelper {

    private File file;
    // Kept as a field because this class opens the file itself and therefore
    // is responsible for closing it again in Dispose().
    private RandomAccessFile source;
    private PDFParser parser;
    private PDFTextStripper stripper;
    private PDDocument pdoc;
    private COSDocument cdoc;

    /**
     * Opens and parses the given PDF file.
     *
     * @param FileName path of the PDF file to read
     */
    public PDFBoxHelper(String FileName) {
        this.initialize(FileName);
    }

    /**
     * Opens the file, parses it and prepares the text stripper.
     * On failure the error is printed and everything opened so far is closed,
     * so a failed load does not leak the file handle.
     */
    private void initialize(String FileName) {
        try {
            this.file = new File(FileName);
            this.source = new RandomAccessFile(file, "r");
            this.parser = new PDFParser(source);
            this.parser.parse();
            this.cdoc = parser.getDocument();
            this.stripper = new PDFTextStripper();
            this.pdoc = new PDDocument(cdoc);
        } catch (IOException e) {
            e.printStackTrace();
            releaseResources();
        }
    }

    /**
     * Returns the identity hash code of the underlying parser.
     *
     * @return the parser's hash code, or {@code 0} if no parser could be created
     */
    public int getHashCode() {
        return this.parser == null ? 0 : this.parser.hashCode();
    }

    /**
     * Returns the number of pages in the loaded document.
     *
     * @return the page count, or {@code 0} if the document could not be loaded
     */
    public int getNumberOfPages() {
        return this.pdoc == null ? 0 : this.pdoc.getNumberOfPages();
    }

    /**
     * Extracts the text of every page of the document, in page order.
     *
     * @return the extracted text; an empty string if the document has no
     *         pages, could not be loaded, or extraction fails
     */
    public String getFileAllText() {
        if (this.pdoc == null || this.stripper == null) {
            return "";
        }
        int pageCount = this.getNumberOfPages();
        if (pageCount == 0) {
            return "";
        }
        try {
            // PDFTextStripper page numbers are 1-based and the end page is
            // inclusive. Stepping through [i, i + 1] windows from 0 would
            // extract almost every page twice, so the whole range is
            // requested exactly once instead.
            stripper.setStartPage(1);
            stripper.setEndPage(pageCount);
            return stripper.getText(pdoc);
        } catch (IOException e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Closes the document and the underlying file. Safe to call when loading
     * failed; close errors are printed rather than thrown.
     */
    public void Dispose() {
        releaseResources();
    }

    /** Closes every resource this helper holds, continuing past individual failures. */
    private void releaseResources() {
        closeAndReport(this.pdoc);
        closeAndReport(this.cdoc);
        closeAndReport(this.source);
    }

    /** Closes one resource if it was ever opened, printing (not propagating) any error. */
    private static void closeAndReport(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
| /Main.java |
|
import lib.PDFBoxHelper; public class Main { public static void main(String[] args) { PDFBoxHelper helper = new PDFBoxHelper("C:\\PDF\\Oracle Solaris 11- First Look.pdf"); System.out.println(helper.getFileAllText()); System.out.println(helper.getNumberOfPages()); System.out.println(helper.getHashCode()); helper.Dispose(); } } |