使用Java从PDF中提取图像

我需要仅从PDF中提取条形码(使用矩形),而不是将整个PDF转换为图像。

图像格式可以是jpg / png。

使用 PDFBox,无需编写代码:

"$JAVA_HOME/bin/java" -jar pdfbox-app-1.8.2.jar PDFToImage foo.pdf 

要进行批处理:

 import java.io.File;
 import java.io.FilenameFilter;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Observer;

 import org.apache.pdfbox.PDFToImage;

 /**
  * Batch driver around PDFBox's {@code PDFToImage} command-line entry point:
  * runs it once per PDF file and writes the results under a target directory.
  */
 public class Main {

     static {
         // Route commons-logging to the no-op logger so that only this
         // class's own progress and error output reaches the console.
         System.setProperty(
             "org.apache.commons.logging.Log",
             "org.apache.commons.logging.impl.NoOpLog" );
     }

     /**
      * Runs {@code PDFToImage} on every file of {@code files}.
      *
      * <p>The output prefix handed to {@code PDFToImage} is
      * {@code jpegDir/<file name without extension>}. A file whose processing
      * throws is reported on {@code System.err} and skipped; the remaining
      * files are still processed.
      *
      * @param files    the PDF files to process
      * @param jpegDir  the directory receiving the images (created if missing)
      * @param observer notified with {@code update( null, file )} after each
      *                 successfully processed file; may be {@code null}
      * @return the number of files processed without error
      */
     public static int extract( List< File > files, File jpegDir, Observer observer ) {
         jpegDir.mkdirs();
         int done = 0;
         for( final File file : files ) {
             try {
                 // Strip the extension from the file NAME only: searching the
                 // whole path for '.' fails when the name has no extension and
                 // cuts inside the directory name when the directory has a dot.
                 final File target = new File( jpegDir, stripExtension( file.getName()));
                 final String prefix = target.getPath();
                 PDFToImage.main( new String[]{ "-outputPrefix", prefix, file.getPath() });
                 final double percent = (100.0 * ++done ) / files.size();
                 System.out.printf( "%6.2f %%: %s\n", percent, file.getName());
                 if( observer != null ) {
                     observer.update( null, file );
                 }
             }
             catch( final Throwable t ) {
                 // Best effort: report the failing file and go on with the next one.
                 System.err.println( file.getPath());
                 t.printStackTrace();
             }
         }
         return done;
     }

     /** Returns {@code name} without its last extension, or unchanged if it has none. */
     private static String stripExtension( String name ) {
         final int dot = name.lastIndexOf( '.' );
         // dot == 0 is a name such as ".pdf": keep it whole rather than return "".
         return ( dot > 0 ) ? name.substring( 0, dot ) : name;
     }

     /**
      * Command-line entry point.
      *
      * @param args {@code args[0]} is the directory scanned (non-recursively)
      *             for {@code *.pdf} files, {@code args[1]} the output directory
      */
     public static void main( String[] args ) {
         if( args.length != 2 ) {
             System.err.println( "usage: java -jar pdf2img.jar <pdf-dir> <image-dir>" );
             System.exit(1);
         }
         final File pdfDir  = new File( args[0] );
         final File jpegDir = new File( args[1] );
         final File[] files = pdfDir.listFiles( new FilenameFilter() {
             @Override
             public boolean accept( File dir, String name ) {
                 return name.toLowerCase().endsWith( ".pdf" );
             }});
         if( files == null ) {
             // listFiles() yields null when the path is not a readable directory.
             System.err.println( "not a readable directory: " + pdfDir.getPath());
             System.exit(1);
         }
         final int done = extract( Arrays.asList( files ), jpegDir, null );
         System.out.printf( "\n%d file%s processed.", done, ( done == 1 ) ? "" : "s" );
     }
 }

此实用程序还可以搭配一个 GUI(法语界面)使用:PDF 到图像提取器 GUI

 public final class GUI extends Application { @Override public void start( Stage primaryStage ) throws Exception { final BorderPane pane = new BorderPane(); final HBox topPane = new HBox(); final Label lbl = new Label( "Répertoire des images : " ); final TextField jpegDir = new TextField(); final Button browseBtn = new Button( "Parcourir..." ); final TableView< File > filesVw = new TableView<>(); lbl.setAlignment( Pos.CENTER_LEFT ); lbl .setStyle( "-fx-padding:8px; -fx-margin:8px;" ); jpegDir .setStyle( "-fx-padding:8px; -fx-margin:8px;" ); browseBtn.setStyle( "-fx-padding:8px; -fx-margin:8px;" ); topPane.getChildren().addAll( lbl, jpegDir, browseBtn ); pane.setTop( topPane ); pane.setCenter( filesVw ); jpegDir.setPrefColumnCount( 40 ); jpegDir.setEditable( false ); final ObservableList< TableColumn< File, ? >> columns = filesVw.getColumns(); final TableColumn< File, String > name = new TableColumn<>( "Nom" ); name.setCellValueFactory( new Callback< CellDataFeatures< File, String >, ObservableValue< String >>(){ @Override public ObservableValue< String > call( CellDataFeatures< File, String > p ){ return new SimpleStringProperty( p.getValue().getName()); }}); name.setSortable( false ); name.setPrefWidth( 400 ); columns.add( name ); final TableColumn< File, String > size = new TableColumn<>( "Taille" ); size.setCellValueFactory( new Callback< CellDataFeatures< File, String >, ObservableValue< String >>(){ @Override public ObservableValue< String > call( CellDataFeatures< File, String > p ){ return new SimpleStringProperty( String.format( "%,12d", p.getValue().length())); }}); size.setSortable( true ); size.setPrefWidth( 80 ); columns.add( size ); final TableColumn< File, String > date = new TableColumn<>( "Date" ); final SimpleDateFormat sdf = new SimpleDateFormat( "dd/MM/YYYY HH:mm" ); date.setCellValueFactory( new Callback< CellDataFeatures< File, String >, ObservableValue< String >>(){ @Override public ObservableValue< String > call( CellDataFeatures< File, 
String > p ){ return new SimpleStringProperty( sdf.format( new Date( p.getValue().lastModified()))); }}); date.setSortable( true ); date.setPrefWidth( 120 ); columns.add( date ); final Map< File, SimpleBooleanProperty > dones = new HashMap<>(); final TableColumn< File, Boolean > done = new TableColumn<>( "Traité" ); done.setCellValueFactory( new Callback< CellDataFeatures< File, Boolean >, ObservableValue< Boolean >>(){ @Override public ObservableValue< Boolean > call( CellDataFeatures< File, Boolean > p ){ return dones.get( p.getValue()); }}); done.setCellFactory( new Callback,TableCell>(){ @Override public TableCell call( TableColumn p ){ return new CheckBoxTableCell<>(); }}); done.setSortable( true ); done.setPrefWidth( 40 ); columns.add( done ); jpegDir.setOnDragOver(new EventHandler () { @Override public void handle(DragEvent event) { if (event.getGestureSource() != jpegDir ) { event.acceptTransferModes(TransferMode.COPY_OR_MOVE); } event.consume(); }}); jpegDir.setOnDragDropped(new EventHandler () { @Override public void handle(DragEvent event) { final Dragboard db = event.getDragboard(); boolean success = false; if( db.hasFiles()) { jpegDir.setText( db.getFiles().get( 0 ).getPath()); success = true; } event.setDropCompleted( success ); event.consume(); }}); filesVw.setOnDragOver(new EventHandler () { @Override public void handle(DragEvent event) { if( event.getGestureSource() != filesVw && ! 
jpegDir.getText().isEmpty()) { event.acceptTransferModes(TransferMode.COPY_OR_MOVE); } event.consume(); }}); filesVw.setOnDragDropped(new EventHandler () { @Override public void handle(DragEvent event) { final Dragboard db = event.getDragboard(); boolean success = false; if( db.hasFiles()) { final List< File > files = db.getFiles(); final File target = new File( jpegDir.getText()); for( final File f : files ) { dones.put( f, new SimpleBooleanProperty( false )); } filesVw.getItems().addAll( files ); filesVw.setDisable( true ); new Thread(){@Override public void run() { Main.extract( files, target, new Observer(){ @Override public void update( Observable o, final Object file ) { Platform.runLater( new Runnable() { @Override public void run() { dones.get( file ).setValue( Boolean.TRUE ); }}); }}); Platform.runLater( new Runnable() { @Override public void run() { filesVw.setDisable( false ); }}); }}.start(); success = true; } event.setDropCompleted( success ); event.consume(); }}); primaryStage.setScene( new Scene( pane )); primaryStage.setX( 0 ); primaryStage.setY( 0 ); primaryStage.show(); } public static void main( String[] args ) { launch(); } } 

你可以使用 PDFBox:

 // Walk every page of the document and write each image found in the page's
 // resources to its own file.
 List pages = document.getDocumentCatalog().getAllPages();
 for( Object pageObj : pages )
 {
     PDPage page = (PDPage)pageObj;
     Map images = page.getResources().getImages();
     if( images == null )
     {
         // Nothing to write for this page.
         continue;
     }
     for( Object keyObj : images.keySet() )
     {
         String key = (String)keyObj;
         PDXObjectImage image = (PDXObjectImage)images.get( key );
         // The file name is derived from the resource key and the image's suffix.
         String name = getUniqueFileName( key, image.getSuffix() );
         System.out.println( "Writing image:" + name );
         image.write2file( name );
     }
 }

参考源代码

可以试试 JPedal,它能胜任这项工作。它几乎可以提取任何类型的对象(图像、文本等)。

jpedal-Java开发人员库

JPedal 的 PdfDecoder API 可以帮助您提取单词。

 // Decode the page decodePdf.decodePage(page); // Create the grouping object to apply grouping to the data PdfGroupingAlgorithms currentGrouping = decodePdf.getGroupingObject(); // Bounding box for the whole page PdfPageData currentPageData = decodePdf.getPdfPageData(); int x1 = currentPageData.getMediaBoxX(page); int x2 = currentPageData.getMediaBoxWidth(page)+x1; int y2 = currentPageData.getMediaBoxX(page); int y1 = currentPageData.getMediaBoxHeight(page)-y2; // Extract words List words = currentGrouping.extractTextAsWordlist(x1, y1, x2, y2, page, true, "&:=()!;.,\\/\"\"\'\'"); 

现在,遍历从 PDF 中提取出的单词列表即可。希望这对你有用。谢谢!