从pdf文件中提取文本

我需要从pdf文件中提取文字(逐字逐句)。

import java.io.*; import com.itextpdf.text.*; import com.itextpdf.text.pdf.*; import com.itextpdf.text.pdf.parser.*; public class pdf { private static String INPUTFILE = "http://ontology.buffalo.edu/ontology%28PIC%29.pdf" ; private static String OUTPUTFILE = "c:/new3.pdf"; public static void main(String[] args) throws DocumentException, IOException { Document document = new Document(); PdfWriter writer = PdfWriter.getInstance(document, new FileOutputStream(OUTPUTFILE)); document.open(); PdfReader reader = new PdfReader(INPUTFILE); int n = reader.getNumberOfPages(); PdfImportedPage page; // Go through all pages for (int i = 1; i <= n; i++) { page = writer.getImportedPage(reader, i); System.out.println(i); Image instance = Image.getInstance(page); document.add(instance); } document.close(); PdfReader readerN = new PdfReader(OUTPUTFILE); PdfTextExtractor parse = new PdfTextExtractor(); for (int i = 1; i <= n; i++) System.out.println(parser.getTextFromPage(reader,i)); } 

当我编译这段代码时，出现了以下错误：

构造函数PdfTextExtractor未定义

我该如何解决?

iText 的 PdfTextExtractor 只包含静态方法，其构造函数是私有的，因此不能用 new 创建实例。

你可以像这样调用它:
String myLine = PDFTextExtractor.getTextFromPage(reader, pageNumber)

如果要从PDF文件中获取所有文本并将其保存到文本文件,可以使用下面的代码。

使用pdfutil.jar库。

 import java.io.IOException;
 import java.io.PrintWriter;
 import com.testautomationguru.utility.PDFUtil;

 /**
  * Reads the text of a PDF file through {@code PDFUtil} and saves it to a text file.
  */
 public class PDFToText {

     /**
      * Fetches the text of {@code C:\abc.pdf} via {@code PDFUtil.getText} and writes it,
      * followed by a line separator, to {@code C:\abc.txt}. Any {@link IOException} is
      * reported by printing its stack trace.
      *
      * @param args unused
      */
     public static void main(String[] args) {
         try {
             String pdfFilePath = "C:\\abc.pdf";
             PDFUtil pdfUtil = new PDFUtil();
             String content = pdfUtil.getText(pdfFilePath);

             // try-with-resources closes the writer on every exit path.
             try (PrintWriter out = new PrintWriter("C:\\abc.txt")) {
                 out.println(content);
                 // PrintWriter never throws on a failed write; it only sets an error
                 // flag, so check it explicitly instead of losing the failure.
                 if (out.checkError()) {
                     throw new IOException("Failed to write extracted text to C:\\abc.txt");
                 }
             }
         } catch (IOException e) {
             e.printStackTrace();
         }
     }
 }
 // Try Apache PDF Box import java.io.FilterInputStream; import java.io.InputStream; import org.apache.pdfbox.cos.COSDocument; import org.apache.pdfbox.pdfparser.PDFParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.util.PDFTextStripper; // Your PDF file String filePath = ""; InputStream inputStream = null; try { inputStream = new FileInputStream(filePath); PDFParser parser = new PDFParser(inputStream); // This will parse the stream and populate the COSDocument object. parser.parse(); // Get the document that was parsed. COSDocument cosDoc = parser.getDocument(); // This class will take a pdf document and strip out all of the text and // ignore the formatting and such. PDFTextStripper pdfStripper = new PDFTextStripper(); // This is the in-memory representation of the PDF document PDDocument pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(1); pdfStripper.setEndPage(pdDoc.getNumberOfPages()); // This will return the text of a document. def statementPDF = pdfStripper.getText(pdDoc); } catch(Exception e) { String errorMessage += "\nUnexpected Exception: " + e.getClass() + "\n" + e.getMessage(); for (trace in e.getStackTrace()) { errorMessage += "\n\t" + trace; } } finally { if (inputStream != null) { inputStream.close(); } }