使用PDFbox确定文档中单词的坐标

我正在使用PDFbox提取PDF文档中单词/字符串的坐标,并且到目前为止已成功确定单个字符的位置。 这是迄今为止的代码,来自PDFbox doc:

package printtextlocations; import java.io.*; import org.apache.pdfbox.exceptions.InvalidPasswordException; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDStream; import org.apache.pdfbox.util.PDFTextStripper; import org.apache.pdfbox.util.TextPosition; import java.io.IOException; import java.util.List; public class PrintTextLocations extends PDFTextStripper { public PrintTextLocations() throws IOException { super.setSortByPosition(true); } public static void main(String[] args) throws Exception { PDDocument document = null; try { File input = new File("C:\\path\\to\\PDF.pdf"); document = PDDocument.load(input); if (document.isEncrypted()) { try { document.decrypt(""); } catch (InvalidPasswordException e) { System.err.println("Error: Document is encrypted with a password."); System.exit(1); } } PrintTextLocations printer = new PrintTextLocations(); List allPages = document.getDocumentCatalog().getAllPages(); for (int i = 0; i < allPages.size(); i++) { PDPage page = (PDPage) allPages.get(i); System.out.println("Processing page: " + i); PDStream contents = page.getContents(); if (contents != null) { printer.processStream(page, page.findResources(), page.getContents().getStream()); } } } finally { if (document != null) { document.close(); } } } /** * @param text The text to be processed */ @Override /* this is questionable, not sure if needed... */ protected void processTextPosition(TextPosition text) { System.out.println("String[" + text.getXDirAdj() + "," + text.getYDirAdj() + " fs=" + text.getFontSize() + " xscale=" + text.getXScale() + " height=" + text.getHeightDir() + " space=" + text.getWidthOfSpace() + " width=" + text.getWidthDirAdj() + "]" + text.getCharacter()); } } 

这会产生一系列包含每个字符位置的行,包括空格,如下所示:

 String[202.5604,41.880127 fs=1.0 xscale=13.98 height=9.68814 space=3.8864403 width=9.324661]P 

其中’P’是字符。 我无法在PDFbox中找到查找单词的函数,而且我对Java还不够熟悉,无法准确地将这些字符(包括其中的空格)重新拼接成单词以便搜索。 有没有其他人遇到过类似的情况?如果有,你是如何处理的? 其实我只需要单词中第一个字符的坐标,这样可以简化一部分工作,但如何把要查找的字符串与这种输出匹配起来,我就无从下手了。

PDFBox中没有可以自动提取单词的函数。 我正在做的工作是提取数据并将其归并成块,这是我的处理过程:

  1. 我提取文档的所有字符(称为字形)并将它们存储在列表中。

  2. 我循环遍历该列表,对每个字形的坐标进行分析。 如果当前字形与前一个字形在垂直方向上重叠(即当前字形的顶部位于前一个字形的顶部和底部之间,或者当前字形的底部位于前一个字形的顶部和底部之间),我就将它添加到同一行。

  3. 此时,我已经提取出了文档中的各个行(注意,如果您的文档是多列的,这里的“行”指的是垂直方向上重叠的所有字形,即所有列中具有相同垂直坐标的文本)。

  4. 然后,您可以将当前字形的左坐标与前一个字形的右坐标进行比较,以确定它们是否属于同一个单词(PDFTextStripper类提供了一个getSpacingTolerance()方法,它返回一个根据反复试验得出的“正常”空格宽度值)。 如果左坐标与右坐标之间的差值小于此值,则两个字形属于同一个单词。

我将这种方法应用到我的工作中并且效果很好。

基于最初的想法,这里是PDFBox 2的文本搜索版本。代码本身很粗糙,但很简单。 它应该让你很快开始。

 import java.io.IOException; import java.io.Writer; import java.util.List; import java.util.Set; import lu.abac.pdfclient.data.PDFTextLocation; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition; public class PrintTextLocator extends PDFTextStripper { private final Set locations; public PrintTextLocator(PDDocument document, Set locations) throws IOException { super.setSortByPosition(true); this.document = document; this.locations = locations; this.output = new Writer() { @Override public void write(char[] cbuf, int off, int len) throws IOException { } @Override public void flush() throws IOException { } @Override public void close() throws IOException { } }; } public Set doSearch() throws IOException { processPages(document.getDocumentCatalog().getPages()); return locations; } @Override protected void writeString(String text, List textPositions) throws IOException { super.writeString(text); String searchText = text.toLowerCase(); for (PDFTextLocation textLoc:locations) { int start = searchText.indexOf(textLoc.getText().toLowerCase()); if (start!=-1) { // found TextPosition pos = textPositions.get(start); textLoc.setFound(true); textLoc.setPage(getCurrentPageNo()); textLoc.setX(pos.getXDirAdj()); textLoc.setY(pos.getYDirAdj()); } } } } 

看看这个,我认为这就是你需要的。

https://jackson-brain.com/using-pdfbox-to-locate-text-coordinates-within-a-pdf-in-java/

这是代码:

 import java.io.File; import java.io.IOException; import java.text.DecimalFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import org.apache.pdfbox.exceptions.InvalidPasswordException; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDStream; import org.apache.pdfbox.util.PDFTextStripper; import org.apache.pdfbox.util.TextPosition; public class PrintTextLocations extends PDFTextStripper { public static StringBuilder tWord = new StringBuilder(); public static String seek; public static String[] seekA; public static List wordList = new ArrayList(); public static boolean is1stChar = true; public static boolean lineMatch; public static int pageNo = 1; public static double lastYVal; public PrintTextLocations() throws IOException { super.setSortByPosition(true); } public static void main(String[] args) throws Exception { PDDocument document = null; seekA = args[1].split(","); seek = args[1]; try { File input = new File(args[0]); document = PDDocument.load(input); if (document.isEncrypted()) { try { document.decrypt(""); } catch (InvalidPasswordException e) { System.err.println("Error: Document is encrypted with a password."); System.exit(1); } } PrintTextLocations printer = new PrintTextLocations(); List allPages = document.getDocumentCatalog().getAllPages(); for (int i = 0; i < allPages.size(); i++) { PDPage page = (PDPage) allPages.get(i); PDStream contents = page.getContents(); if (contents != null) { printer.processStream(page, page.findResources(), page.getContents().getStream()); } pageNo += 1; } } finally { if (document != null) { System.out.println(wordList); document.close(); } } } @Override protected void processTextPosition(TextPosition text) { String tChar = text.getCharacter(); System.out.println("String[" + text.getXDirAdj() + "," + text.getYDirAdj() + " fs=" + text.getFontSize() + " xscale=" + text.getXScale() + " height=" + text.getHeightDir() 
+ " space=" + text.getWidthOfSpace() + " width=" + text.getWidthDirAdj() + "]" + text.getCharacter()); String REGEX = "[,.\\[\\](:;!?)/]"; char c = tChar.charAt(0); lineMatch = matchCharLine(text); if ((!tChar.matches(REGEX)) && (!Character.isWhitespace(c))) { if ((!is1stChar) && (lineMatch == true)) { appendChar(tChar); } else if (is1stChar == true) { setWordCoord(text, tChar); } } else { endWord(); } } protected void appendChar(String tChar) { tWord.append(tChar); is1stChar = false; } protected void setWordCoord(TextPosition text, String tChar) { tWord.append("(").append(pageNo).append(")[").append(roundVal(Float.valueOf(text.getXDirAdj()))).append(" : ").append(roundVal(Float.valueOf(text.getYDirAdj()))).append("] ").append(tChar); is1stChar = false; } protected void endWord() { String newWord = tWord.toString().replaceAll("[^\\x00-\\x7F]", ""); String sWord = newWord.substring(newWord.lastIndexOf(' ') + 1); if (!"".equals(sWord)) { if (Arrays.asList(seekA).contains(sWord)) { wordList.add(newWord); } else if ("SHOWMETHEMONEY".equals(seek)) { wordList.add(newWord); } } tWord.delete(0, tWord.length()); is1stChar = true; } protected boolean matchCharLine(TextPosition text) { Double yVal = roundVal(Float.valueOf(text.getYDirAdj())); if (yVal.doubleValue() == lastYVal) { return true; } lastYVal = yVal.doubleValue(); endWord(); return false; } protected Double roundVal(Float yVal) { DecimalFormat rounded = new DecimalFormat("0.0'0'"); Double yValDub = new Double(rounded.format(yVal)); return yValDub; } } 

依赖关系:

PDFBox、FontBox、Apache Commons Logging。

您可以通过在命令行上键入来运行它:

 # Compile first, then run; the dependencies listed above must be on the classpath.
 javac PrintTextLocations.java
 sudo java PrintTextLocations file.pdf WORD1,WORD2,....

输出类似于:

 [(1)[190.3 : 286.8] WORD1, (1)[283.3 : 286.8] WORD2, ...] 
 import java.awt.geom.Rectangle2D; import java.io.File; import java.io.IOException; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException; import org.apache.pdfbox.text.PDFTextStripperByArea; public class SearchInPdf { public static void main(String[] args) throws InvalidPasswordException, IOException { PDDocument document = PDDocument.load(new File("sample2.pdf")); System.out.println(search("banq", document)); } public static int search(String name, PDDocument document) throws IOException { int page = 0; int y = 826; // start searching from top of pdf while (y>0) { int[] cb = { 20, y, 70, 16 }; // her I just searching in a culomn of 70 String text = getData(page, cb, document); // her I get the text text = text.replaceAll(" ", "").toLowerCase(); String data = text.substring(0, Math.min(text.length(), 4));// Get first 4 chars in text if(data.equals(name)) {// test if it's egal to the word I want break; } y -= 10; } return y; } public static String getData(int page, int[] cord, PDDocument document) throws IOException { PDFTextStripperByArea textStripper = new PDFTextStripperByArea(); Rectangle2D rect = new Rectangle2D.Float(cord[0], cord[1], cord[2], cord[3]); textStripper.addRegion("region", rect); textStripper.setSortByPosition(true); PDPage docPage = document.getPage(page); textStripper.extractRegions(docPage); String textForRegion = textStripper.getTextForRegion("region"); textForRegion = textForRegion.replaceAll("^\\s+|\\s+$", ""); return textForRegion; } }