用 iText 读取 PDF,能否得到类似 pdftotext -layout 的效果?

我正在寻找最简单的方法来实现一个 Java 解决方案,使其输出与下面这条命令的输出非常相似:

pdftotext -layout FILE 

在Linux机器上。 (当然它也应该便宜)

我刚刚尝试了一些IText,PDFBox和PDFTextStream的代码片段。 到目前为止,最准确的解决方案是PDFTextStream,它使用VisualOutputTarget来获得我文件的绝佳表示。

这样我的列布局就能被正确识别,我可以直接使用它。但 iText 应该也有相应的解决方案吧?

我找到的每个简单代码片段都只会生成按简单顺序排列的字符串,结果一团糟(行和列的顺序被打乱)。有没有更简单的解决方案,最好不需要自己编写提取策略(strategy)?或者有没有可以直接使用的开源策略?

//我按照mkl的说明编写了自己的策略对象,如下所示:

 package com.test.pdfextractiontest.itext;

 import ...

 /**
  * Text extraction strategy that collects every rendered text chunk together with its
  * position and, when asked for the result, sorts the chunks by orientation, line and
  * position in the line and joins them into one string.
  *
  * <p>Compared with the stock location-based strategy it is derived from, the only
  * functional addition is in {@link #getResultantText(TextChunkFilter)}: the gap between
  * two chunks on the same line is padded with spaces (one space per 3 units of distance).
  *
  * <p>NOTE(review): this listing was recovered from a post in which everything between
  * {@code <} and {@code >} had been stripped. The generic type arguments and the condition
  * in {@link #isChunkAtWordBoundary(TextChunk, TextChunk)} were restored from the stock
  * iText 5 {@code LocationTextExtractionStrategy} - confirm against the iText version in use.
  */
 public class MyLocationTextExtractionStrategy implements TextExtractionStrategy {

     /** set to true for debugging */
     static boolean DUMP_STATE = false;

     /** a summary of all found text */
     private final List<TextChunk> locationalResult = new ArrayList<TextChunk>();

     public MyLocationTextExtractionStrategy() {
     }

     @Override
     public void beginTextBlock() {
     }

     @Override
     public void endTextBlock() {
     }

     /**
      * @param str the string to inspect
      * @return true if {@code str} is non-empty and its first character is a space
      */
     private boolean startsWithSpace(final String str) {
         if (str.length() == 0) {
             return false;
         }
         return str.charAt(0) == ' ';
     }

     /**
      * @param str the string to inspect
      * @return true if {@code str} is non-empty and its last character is a space
      */
     private boolean endsWithSpace(final String str) {
         if (str.length() == 0) {
             return false;
         }
         return str.charAt(str.length() - 1) == ' ';
     }

     /**
      * Selects the chunks accepted by a filter.
      *
      * @param textChunks the chunks to filter
      * @param filter the filter to apply; may be null
      * @return {@code textChunks} itself (not a copy) if {@code filter} is null, otherwise
      *         a new list holding the accepted chunks in their original order
      */
     private List<TextChunk> filterTextChunks(final List<TextChunk> textChunks, final TextChunkFilter filter) {
         if (filter == null) {
             return textChunks;
         }
         final List<TextChunk> filtered = new ArrayList<TextChunk>();
         for (final TextChunk textChunk : textChunks) {
             if (filter.accept(textChunk)) {
                 filtered.add(textChunk);
             }
         }
         return filtered;
     }

     /**
      * Decides whether a word boundary lies between two chunks.
      *
      * @param chunk the chunk under consideration
      * @param previousChunk the chunk that precedes it in the sorted order
      * @return true if {@code chunk} starts more than half a space width after the end of
      *         {@code previousChunk}, or more than a full space width before it
      */
     protected boolean isChunkAtWordBoundary(final TextChunk chunk, final TextChunk previousChunk) {
         final float dist = chunk.distanceFromEndOf(previousChunk);
         // Restored condition: the posted text had degraded to
         // "if (dist  chunk.getCharSpaceWidth() / 2.0f)".
         if (dist < -chunk.getCharSpaceWidth() || dist > chunk.getCharSpaceWidth() / 2.0f) {
             return true;
         }
         return false;
     }

     /**
      * Builds the text of all collected chunks that pass the given filter.
      *
      * <p>Chunks are sorted with {@link TextChunk#compareTo(TextChunk)}; chunks on the same
      * line are concatenated, padded with spaces according to their distance, and a
      * newline is emitted whenever the line changes.
      *
      * @param chunkFilter filter selecting the chunks to include; null includes all chunks
      * @return the assembled text
      */
     public String getResultantText(final TextChunkFilter chunkFilter) {
         if (DUMP_STATE) {
             dumpState();
         }
         // With a null filter this is locationalResult itself, so the sort below reorders
         // the collected chunks in place.
         final List<TextChunk> filteredTextChunks = filterTextChunks(this.locationalResult, chunkFilter);
         Collections.sort(filteredTextChunks);

         final StringBuffer sb = new StringBuffer();
         TextChunk lastChunk = null;
         for (final TextChunk chunk : filteredTextChunks) {
             if (lastChunk == null) {
                 sb.append(chunk.text);
             } else {
                 if (chunk.sameLine(lastChunk)) {
                     // Insert a single separating space only if neither side already has one.
                     if (isChunkAtWordBoundary(chunk, lastChunk) && !startsWithSpace(chunk.text)
                             && !endsWithSpace(lastChunk.text)) {
                         sb.append(' ');
                     }
                     // Custom addition: pad the gap with one space per 3 units of distance
                     // from the end of the previous chunk (rounded to the nearest integer).
                     final Float dist = chunk.distanceFromEndOf(lastChunk) / 3;
                     for (int i = 0; i < Math.round(dist); i++) {
                         sb.append(' ');
                     }
                     sb.append(chunk.text);
                 } else {
                     sb.append('\n');
                     sb.append(chunk.text);
                 }
             }
             lastChunk = chunk;
         }
         return sb.toString();
     }

     /**
      * Returns the result so far, without filtering.
      *
      * @return a String with the resulting text
      */
     @Override
     public String getResultantText() {
         return getResultantText(null);
     }

     /** Prints the diagnostics of every collected chunk to standard output. */
     private void dumpState() {
         for (final TextChunk location : this.locationalResult) {
             location.printDiagnostics();
             System.out.println();
         }
     }

     /**
      * Records the rendered text together with the start and end point of its baseline.
      *
      * @param renderInfo the render information of the text being drawn
      */
     @Override
     public void renderText(final TextRenderInfo renderInfo) {
         LineSegment segment = renderInfo.getBaseline();
         if (renderInfo.getRise() != 0) {
             // Translate the baseline by the negated rise before recording it.
             // NOTE(review): presumably so that raised/lowered text sorts onto the
             // surrounding text line - confirm against the iText documentation.
             final Matrix riseOffsetTransform = new Matrix(0, -renderInfo.getRise());
             segment = segment.transformBy(riseOffsetTransform);
         }
         final TextChunk location = new TextChunk(renderInfo.getText(), segment.getStartPoint(),
                 segment.getEndPoint(), renderInfo.getSingleSpaceWidth(), renderInfo);
         this.locationalResult.add(location);
     }

     /**
      * A piece of text with its location; ordered by orientation, then by perpendicular
      * distance, then by the start position along the orientation.
      */
     public static class TextChunk implements Comparable<TextChunk> {
         /** the text of the chunk */
         private final String text;
         /** the starting location of the chunk */
         private final Vector startLocation;
         /** the ending location of the chunk */
         private final Vector endLocation;
         /** unit vector in the orientation of the chunk */
         private final Vector orientationVector;
         /** the orientation as a scalar for quick sorting */
         private final int orientationMagnitude;
         /** the render info this chunk was created from */
         private final TextRenderInfo info;
         /** cross product of (start - origin) with the orientation, truncated to int; chunks with equal values count as the same line */
         private final int distPerpendicular;
         /** dot product of the orientation vector with the start location */
         private final float distParallelStart;
         /** dot product of the orientation vector with the end location */
         private final float distParallelEnd;
         /** the width of a single space character in the font of the chunk */
         private final float charSpaceWidth;

         /**
          * @param string the text of the chunk
          * @param startLocation start point of the chunk's baseline
          * @param endLocation end point of the chunk's baseline
          * @param charSpaceWidth width of a single space character for this chunk
          * @param ri the render info the chunk originates from
          */
         public TextChunk(final String string, final Vector startLocation, final Vector endLocation,
                 final float charSpaceWidth, final TextRenderInfo ri) {
             this.text = string;
             this.startLocation = startLocation;
             this.endLocation = endLocation;
             this.charSpaceWidth = charSpaceWidth;
             this.info = ri;

             Vector oVector = endLocation.subtract(startLocation);
             if (oVector.length() == 0) {
                 // Zero-length chunk: fall back to a fixed orientation so normalize() is defined.
                 oVector = new Vector(1, 0, 0);
             }
             this.orientationVector = oVector.normalize();
             this.orientationMagnitude = (int) (Math.atan2(this.orientationVector.get(Vector.I2),
                     this.orientationVector.get(Vector.I1)) * 1000);

             final Vector origin = new Vector(0, 0, 1);
             this.distPerpendicular = (int) startLocation.subtract(origin).cross(this.orientationVector)
                     .get(Vector.I3);

             this.distParallelStart = this.orientationVector.dot(startLocation);
             this.distParallelEnd = this.orientationVector.dot(endLocation);
         }

         public Vector getStartLocation() {
             return this.startLocation;
         }

         public Vector getEndLocation() {
             return this.endLocation;
         }

         public String getText() {
             return this.text;
         }

         public float getCharSpaceWidth() {
             return this.charSpaceWidth;
         }

         /** Prints location, orientation and distances of this chunk to standard output. */
         private void printDiagnostics() {
             System.out.println("Text (@" + this.startLocation + " -> " + this.endLocation + "): " + this.text);
             System.out.println("orientationMagnitude: " + this.orientationMagnitude);
             System.out.println("distPerpendicular: " + this.distPerpendicular);
             System.out.println("distParallel: " + this.distParallelStart);
         }

         /**
          * @param as the chunk to compare with
          * @return true if both chunks have the same orientation magnitude and the same
          *         perpendicular distance
          */
         public boolean sameLine(final TextChunk as) {
             if (this.orientationMagnitude != as.orientationMagnitude) {
                 return false;
             }
             if (this.distPerpendicular != as.distPerpendicular) {
                 return false;
             }
             return true;
         }

         /**
          * @param other the chunk whose end is the reference point
          * @return this chunk's parallel start minus {@code other}'s parallel end; negative
          *         if this chunk starts before {@code other} ends
          */
         public float distanceFromEndOf(final TextChunk other) {
             final float distance = this.distParallelStart - other.distParallelEnd;
             return distance;
         }

         /**
          * Same computation as {@link #distanceFromEndOf(TextChunk)}.
          *
          * @param other the chunk whose end is the reference point
          * @return this chunk's parallel start minus {@code other}'s parallel end
          */
         public float myDistanceFromEndOf(final TextChunk other) {
             final float distance = this.distParallelStart - other.distParallelEnd;
             return distance;
         }

         /**
          * Orders chunks by orientation, then perpendicular distance, then parallel start.
          */
         @Override
         public int compareTo(final TextChunk rhs) {
             if (this == rhs) {
                 return 0; // not really needed, but just in case
             }

             int rslt;
             rslt = compareInts(this.orientationMagnitude, rhs.orientationMagnitude);
             if (rslt != 0) {
                 return rslt;
             }

             rslt = compareInts(this.distPerpendicular, rhs.distPerpendicular);
             if (rslt != 0) {
                 return rslt;
             }

             return Float.compare(this.distParallelStart, rhs.distParallelStart);
         }

         /** @return 0 if equal, -1 if {@code int1 < int2}, otherwise 1 */
         private static int compareInts(final int int1, final int int2) {
             return int1 == int2 ? 0 : int1 < int2 ? -1 : 1;
         }

         public TextRenderInfo getInfo() {
             return this.info;
         }
     }

     @Override
     public void renderImage(final ImageRenderInfo renderInfo) {
         // do nothing
     }

     /** Decides which text chunks take part in the resulting text. */
     public static interface TextChunkFilter {
         /**
          * @param textChunk the chunk to check
          * @return true if the chunk should be included
          */
         public boolean accept(TextChunk textChunk);
     }
 }

正如您所看到的,大部分代码与原始类相同。我只是添加了下面这段代码:

  // Pads the gap between lastChunk and chunk: appends Math.round(distance / 3) spaces to sb.
  final Float dist = chunk.distanceFromEndOf(lastChunk)/3; for(int i = 0; i<Math.round(dist); i++) { sb.append(' '); } 

这段代码加在 getResultantText 方法中,用空格来填充文本块之间的间隙。但问题出在这里:

距离的计算似乎不够精确。结果看起来像

这个: 这个:

有没有人知道如何计算出更合适的距离或取值?我认为这是因为原始字体是 ArialMT,而我的编辑器使用的是 Courier 字体;但为了处理这张表,我需要能够在正确的位置拆分表格以取出数据。由于各个值的起始和结束位置是浮动的等等原因,这很难做到。

: – /

像下面这样插入空格的做法,其问题在于:

  final Float dist = chunk.distanceFromEndOf(lastChunk)/3; for(int i = 0; i<Math.round(dist); i++) { sb.append(' '); } 

它假定(以字符宽度为 3 个用户空间单位为前提)StringBuffer 中的当前位置与 lastChunk 的末尾完全对应。但情况未必如此,通常每添加一个字符都会破坏之前的这种对应关系。例如,使用比例字体时,下面这两行的宽度并不相同:

ililili

MWMWMWM

而在 StringBuffer 中,它们占用相同的长度。

因此,您必须查看chunk 相对于左页边框的起始位置并相应地向缓冲区添加空格。

此外,您的代码完全忽略了行首的空白。

如果您用下面的代码替换原始方法 getResultantText(TextChunkFilter),结果应该会有所改善:

 /**
  * Builds the text of all collected chunks that pass the given filter, positioning each
  * chunk in its output line according to its start coordinate rather than according to
  * its distance from the previous chunk.
  *
  * <p>The column of a chunk is derived from {@link #pageLeft} and {@link #fixedCharWidth};
  * see {@link #insertSpaces(StringBuffer, int, float, boolean)}.
  *
  * @param chunkFilter filter selecting the chunks to include; passed on to filterTextChunks
  * @return the assembled text, lines separated by '\n'
  */
 public String getResultantText(TextChunkFilter chunkFilter) {
     if (DUMP_STATE) {
         dumpState();
     }

     List<TextChunk> filteredTextChunks = filterTextChunks(locationalResult, chunkFilter);
     Collections.sort(filteredTextChunks);

     // Index in sb at which the current output line begins.
     int startOfLinePosition = 0;
     StringBuffer sb = new StringBuffer();
     TextChunk lastChunk = null;
     for (TextChunk chunk : filteredTextChunks) {
         if (lastChunk == null) {
             // First chunk overall: indent it according to its start position.
             insertSpaces(sb, startOfLinePosition, chunk.distParallelStart, false);
             sb.append(chunk.text);
         } else {
             if (chunk.sameLine(lastChunk)) {
                 if (isChunkAtWordBoundary(chunk, lastChunk)) {
                     // At least one space is forced only when neither side already has one.
                     insertSpaces(sb, startOfLinePosition, chunk.distParallelStart,
                             !startsWithSpace(chunk.text) && !endsWithSpace(lastChunk.text));
                 }
                 sb.append(chunk.text);
             } else {
                 sb.append('\n');
                 startOfLinePosition = sb.length();
                 // First chunk of a new line: indent it according to its start position.
                 insertSpaces(sb, startOfLinePosition, chunk.distParallelStart, false);
                 sb.append(chunk.text);
             }
         }
         lastChunk = chunk;
     }

     return sb.toString();
 }

 /**
  * Appends spaces to the buffer until the current line reaches the column that
  * corresponds to the given chunk start coordinate.
  *
  * <p>The target column is {@code (int) ((chunkStart - pageLeft) / fixedCharWidth)}. If the
  * line is already at or beyond that column, nothing is appended unless
  * {@code spaceRequired} is set, in which case exactly one space is appended.
  *
  * @param sb the buffer holding the text assembled so far
  * @param startOfLinePosition index in {@code sb} at which the current line begins
  * @param chunkStart start coordinate of the chunk about to be appended
  * @param spaceRequired whether at least one space must be appended
  */
 void insertSpaces(StringBuffer sb, int startOfLinePosition, float chunkStart, boolean spaceRequired) {
     int indexNow = sb.length() - startOfLinePosition;
     int indexToBe = (int) ((chunkStart - pageLeft) / fixedCharWidth);
     int spacesToInsert = indexToBe - indexNow;
     if (spacesToInsert < 1 && spaceRequired) {
         spacesToInsert = 1;
     }
     for (; spacesToInsert > 0; spacesToInsert--) {
         sb.append(' ');
     }
 }

 /** coordinate subtracted from a chunk's start before it is converted to a column */
 public float pageLeft = 0;

 /** assumed width of one output character, in the same units as the chunk coordinates */
 public float fixedCharWidth = 6;

pageLeft 是页面左边界的坐标。该策略本身无法得知这个值,因此必须明确告知它;不过在许多情况下,0 就是正确的值。

或者,可以使用所有块的最小distParallelStart值。 这会切断左边距,但不会要求您注入精确的左页边框值。

fixedCharWidth 是假定的字符宽度。根据所处理的 PDF 中文字的具体情况,其他取值可能更合适。就您的情况而言,3 似乎比我用的 6 更合适。

此代码仍有很大的改进空间。 例如

  • 它假定不存在跨越多个表格列的文本块。这一假设通常成立,但我见过一些奇怪的 PDF:其中正常的词间距是通过带有一定偏移的独立文本块实现的,而列间距却由单个文本块中的一个空格字符来表示(该文本块跨越了一列的结尾和下一列的开头)!这个空格字符的宽度是通过 PDF 图形状态中的字间距(word spacing)设置来调整的。

  • 它忽略了不同数量的垂直空间。