From 806e5a9591053f6aaa4417a9ae8e0668abfeba03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= Date: Wed, 6 May 2026 11:09:30 +0200 Subject: [PATCH] :recycle: use an interface to define PDF rasterization --- .../java/com/mindee/image/ImageExtractor.java | 43 ++++------------- .../com/mindee/image/PDFRasterization.java | 14 ++++++ .../java/com/mindee/image/PDFRasterizer.java | 46 +++++++++++++++++++ .../com/mindee/input/LocalInputSource.java | 2 +- .../java/com/mindee/pdf/BasePDFExtractor.java | 26 +++++------ .../java/com/mindee/pdf/PDFCompression.java | 9 ++-- .../java/com/mindee/pdf/PDFCompressor.java | 2 +- .../com/mindee/pdf/PDFInputOperation.java | 3 ++ 8 files changed, 93 insertions(+), 52 deletions(-) create mode 100644 src/main/java/com/mindee/image/PDFRasterization.java create mode 100644 src/main/java/com/mindee/image/PDFRasterizer.java diff --git a/src/main/java/com/mindee/image/ImageExtractor.java b/src/main/java/com/mindee/image/ImageExtractor.java index 3cf0c6804..82c8981b3 100644 --- a/src/main/java/com/mindee/image/ImageExtractor.java +++ b/src/main/java/com/mindee/image/ImageExtractor.java @@ -10,11 +10,6 @@ import java.util.ArrayList; import java.util.List; import javax.imageio.ImageIO; -import org.apache.pdfbox.Loader; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.common.PDRectangle; -import org.apache.pdfbox.rendering.ImageType; -import org.apache.pdfbox.rendering.PDFRenderer; /** * Extract sub-images from an image. @@ -30,7 +25,7 @@ public ImageExtractor(LocalInputSource source) throws IOException { if (source.isPDF()) { this.saveFormat = "jpg"; - var pdfPageImages = pdfToImages(source.getFile(), source.getFilename()); + var pdfPageImages = getPDFRasterizer().PDFToImages(source.getFile(), source.getFilename()); for (PDFPageImage pdfPageImage : pdfPageImages) { this.pageImages.add(pdfPageImage.getImage()); } @@ -45,34 +40,14 @@ public ImageExtractor(LocalInputSource source) throws IOException { } } - private List pdfToImages(byte[] fileBytes, String filename) throws IOException { - PDDocument document = Loader.loadPDF(fileBytes); - var pdfRenderer = new PDFRenderer(document); - List pdfPageImages = new ArrayList<>(); - for (int i = 0; i < document.getNumberOfPages(); i++) { - var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer); - pdfPageImages.add(new PDFPageImage(imageBuffer, i, filename, "jpg")); - } - document.close(); - return pdfPageImages; - } - - private BufferedImage pdfPageToImageBuffer( - int index, - PDDocument document, - PDFRenderer pdfRenderer - ) throws IOException { - PDRectangle bbox = document.getPage(index).getBBox(); - float dimension = bbox.getWidth() * bbox.getHeight(); - int dpi; - if (dimension < 200000) { - dpi = 300; - } else if (dimension < 300000) { - dpi = 250; - } else { - dpi = 200; - } - return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB); + /** + * Get the PDF rasterization implementation. + * Override this method to provide custom PDF rasterization handling. + * + * @return The PDF rasterization implementation. + */ + protected PDFRasterization getPDFRasterizer() { + return new PDFRasterizer(); } /** diff --git a/src/main/java/com/mindee/image/PDFRasterization.java b/src/main/java/com/mindee/image/PDFRasterization.java new file mode 100644 index 000000000..c143d9a8e --- /dev/null +++ b/src/main/java/com/mindee/image/PDFRasterization.java @@ -0,0 +1,14 @@ +package com.mindee.image; + +import java.io.IOException; +import java.util.List; + +/** + * Rasterize a PDF into images. + */ +public interface PDFRasterization { + /** + * Rasterize a PDF into a list of images, one image per page. + */ + List PDFToImages(byte[] fileBytes, String filename) throws IOException; +} diff --git a/src/main/java/com/mindee/image/PDFRasterizer.java b/src/main/java/com/mindee/image/PDFRasterizer.java new file mode 100644 index 000000000..35561210d --- /dev/null +++ b/src/main/java/com/mindee/image/PDFRasterizer.java @@ -0,0 +1,46 @@ +package com.mindee.image; + +import java.awt.image.BufferedImage; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.rendering.ImageType; +import org.apache.pdfbox.rendering.PDFRenderer; + +/** + * Default PDF rasterization implementation. + */ +public class PDFRasterizer implements PDFRasterization { + public List PDFToImages(byte[] fileBytes, String filename) throws IOException { + PDDocument document = Loader.loadPDF(fileBytes); + var pdfRenderer = new PDFRenderer(document); + List pdfPageImages = new ArrayList<>(); + for (int i = 0; i < document.getNumberOfPages(); i++) { + var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer); + pdfPageImages.add(new PDFPageImage(imageBuffer, i, filename, "jpg")); + } + document.close(); + return pdfPageImages; + } + + private BufferedImage pdfPageToImageBuffer( + int index, + PDDocument document, + PDFRenderer pdfRenderer + ) throws IOException { + PDRectangle bbox = document.getPage(index).getBBox(); + float dimension = bbox.getWidth() * bbox.getHeight(); + int dpi; + if (dimension < 200000) { + dpi = 300; + } else if (dimension < 300000) { + dpi = 250; + } else { + dpi = 200; + } + return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB); + } +} diff --git a/src/main/java/com/mindee/input/LocalInputSource.java b/src/main/java/com/mindee/input/LocalInputSource.java index 12b196091..1dd89595e 100644 --- a/src/main/java/com/mindee/input/LocalInputSource.java +++ b/src/main/java/com/mindee/input/LocalInputSource.java @@ -61,7 +61,7 @@ public LocalInputSource(String fileAsBase64, String filename) { /** * Get the PDFInputOperation instance. - * Override this method to provide custom PDF input operation handling. + * Override this method to provide custom PDF input handling. * * @return PDFInputOperation instance */ diff --git a/src/main/java/com/mindee/pdf/BasePDFExtractor.java b/src/main/java/com/mindee/pdf/BasePDFExtractor.java index f1564e515..39a567aeb 100644 --- a/src/main/java/com/mindee/pdf/BasePDFExtractor.java +++ b/src/main/java/com/mindee/pdf/BasePDFExtractor.java @@ -82,6 +82,19 @@ public ExtractedPDFs extractSubDocuments(List> pageIndexes) throws return extractedPDFs; } + /** + * Make a nice filename for the split. + */ + protected String makeFilename(List pageNumbers) { + String[] splitName = InputSourceUtils.splitNameStrict(filename); + return splitName[0] + + String.format("_%3s", pageNumbers.get(0)).replace(" ", "0") + + "-" + + String.format("%3s", pageNumbers.get(pageNumbers.size() - 1)).replace(" ", "0") + + "." + + splitName[1]; + } + /** * Converts an array to a buffered image. * @@ -95,19 +108,6 @@ private static BufferedImage byteArrayToBufferedImage(byte[] byteArray) throws I } } - /** - * Make a nice filename for the split. - */ - private String makeFilename(List pageNumbers) { - String[] splitName = InputSourceUtils.splitNameStrict(filename); - return splitName[0] - + String.format("_%3s", pageNumbers.get(0)).replace(" ", "0") - + "-" - + String.format("%3s", pageNumbers.get(pageNumbers.size() - 1)).replace(" ", "0") - + "." - + splitName[1]; - } - private static PDPage clonePage(PDPage page) { COSDictionary pageDict = page.getCOSObject(); diff --git a/src/main/java/com/mindee/pdf/PDFCompression.java b/src/main/java/com/mindee/pdf/PDFCompression.java index e8619014d..1473515ae 100644 --- a/src/main/java/com/mindee/pdf/PDFCompression.java +++ b/src/main/java/com/mindee/pdf/PDFCompression.java @@ -2,23 +2,26 @@ import java.io.IOException; +/** + * Compress a PDF. + */ public interface PDFCompression { byte[] compressPDF( byte[] fileBytes, - Integer imageQuality, + int imageQuality, Boolean forceSourceTextCompression, Boolean disableSourceText ) throws IOException; default byte[] compressPDF( byte[] fileBytes, - Integer imageQuality, + int imageQuality, Boolean forceSourceTextCompression ) throws IOException { return compressPDF(fileBytes, imageQuality, forceSourceTextCompression, true); } - default byte[] compressPDF(byte[] fileBytes, Integer imageQuality) throws IOException { + default byte[] compressPDF(byte[] fileBytes, int imageQuality) throws IOException { return compressPDF(fileBytes, imageQuality, false, true); } diff --git a/src/main/java/com/mindee/pdf/PDFCompressor.java b/src/main/java/com/mindee/pdf/PDFCompressor.java index 0932f1973..c0e3caa8b 100644 --- a/src/main/java/com/mindee/pdf/PDFCompressor.java +++ b/src/main/java/com/mindee/pdf/PDFCompressor.java @@ -36,7 +36,7 @@ public PDFCompressor() { @Override public byte[] compressPDF( byte[] fileBytes, - Integer imageQuality, + int imageQuality, Boolean forceSourceTextCompression, Boolean disableSourceText ) throws IOException { diff --git a/src/main/java/com/mindee/pdf/PDFInputOperation.java b/src/main/java/com/mindee/pdf/PDFInputOperation.java index a69cc50af..2ec56ae30 100644 --- a/src/main/java/com/mindee/pdf/PDFInputOperation.java +++ b/src/main/java/com/mindee/pdf/PDFInputOperation.java @@ -3,6 +3,9 @@ import com.mindee.input.PageOptions; import java.io.IOException; +/** + * Various operations required for PDF input files. + */ public interface PDFInputOperation { /**