Skip to content

Commit 03b305e

Browse files
authored
♻️ 💥 rework PDF classes (#316)
1 parent a377bb4 commit 03b305e

File tree

15 files changed

+170
-163
lines changed

15 files changed

+170
-163
lines changed

src/main/java/com/mindee/input/LocalInputSource.java

Lines changed: 17 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,10 @@
11
package com.mindee.input;
22

33
import com.mindee.image.ImageCompressor;
4+
import com.mindee.pdf.PDFBoxApi;
5+
import com.mindee.pdf.PDFCompressor;
6+
import com.mindee.pdf.PDFOperation;
47
import com.mindee.pdf.PDFUtils;
5-
import com.mindee.pdf.PdfBoxApi;
6-
import com.mindee.pdf.PdfCompressor;
7-
import com.mindee.pdf.PdfOperation;
88
import com.mindee.pdf.SplitQuery;
99
import java.io.File;
1010
import java.io.IOException;
@@ -13,16 +13,20 @@
1313
import java.nio.file.Path;
1414
import java.util.Base64;
1515
import lombok.Getter;
16+
import lombok.Setter;
1617
import org.apache.pdfbox.io.IOUtils;
1718

1819
/**
1920
* A source document for Mindee API operations.
2021
*/
21-
@Getter
2222
public final class LocalInputSource {
2323

24+
@Getter
2425
private byte[] file;
26+
@Getter
2527
private final String filename;
28+
@Setter
29+
private PDFOperation pdfOperation;
2630

2731
public LocalInputSource(InputStream file, String filename) throws IOException {
2832
this.file = IOUtils.toByteArray(file);
@@ -55,6 +59,13 @@ public LocalInputSource(String fileAsBase64, String filename) {
5559
this.filename = filename;
5660
}
5761

62+
public PDFOperation getPdfOperation() {
63+
if (this.pdfOperation == null) {
64+
this.pdfOperation = new PDFBoxApi();
65+
}
66+
return this.pdfOperation;
67+
}
68+
5869
/**
5970
* Get the number of pages in the document.
6071
*
@@ -76,8 +87,7 @@ public int getPageCount() throws IOException {
7687
*/
7788
public void applyPageOptions(PageOptions pageOptions) throws IOException {
7889
if (pageOptions != null && this.isPdf()) {
79-
PdfOperation pdfOperation = new PdfBoxApi();
80-
this.file = pdfOperation.split(new SplitQuery(this.file, pageOptions)).getFile();
90+
this.file = getPdfOperation().split(new SplitQuery(this.file, pageOptions)).getFile();
8191
}
8292
}
8393

@@ -97,7 +107,7 @@ public void compress(
97107
Boolean disableSourceText
98108
) throws IOException {
99109
if (isPdf()) {
100-
this.file = PdfCompressor.compressPdf(this.file, quality, forceSourceText, disableSourceText);
110+
this.file = PDFCompressor.compressPdf(this.file, quality, forceSourceText, disableSourceText);
101111
} else {
102112
this.file = ImageCompressor.compressImage(this.file, quality, maxWidth, maxHeight);
103113
}

src/main/java/com/mindee/pdf/PDFExtractor.java renamed to src/main/java/com/mindee/pdf/BasePDFExtractor.java

Lines changed: 7 additions & 82 deletions
Original file line number | Diff line number | Diff line change
@@ -5,14 +5,11 @@
55
import com.mindee.MindeeException;
66
import com.mindee.input.InputSourceUtils;
77
import com.mindee.input.LocalInputSource;
8-
import com.mindee.v1.product.invoicesplitter.InvoiceSplitterV1InvoicePageGroup;
98
import java.awt.image.BufferedImage;
109
import java.io.ByteArrayInputStream;
1110
import java.io.IOException;
1211
import java.util.ArrayList;
13-
import java.util.Iterator;
1412
import java.util.List;
15-
import java.util.stream.Collectors;
1613
import javax.imageio.ImageIO;
1714
import org.apache.pdfbox.Loader;
1815
import org.apache.pdfbox.pdmodel.PDDocument;
@@ -24,33 +21,23 @@
2421
/**
2522
* PDF extraction class.
2623
*/
27-
public class PDFExtractor {
28-
private final PDDocument sourcePdf;
29-
private final String filename;
30-
31-
/**
32-
* Init from a path.
33-
*
34-
* @param filePath Path to the file.
35-
* @throws IOException Throws if the file can't be accessed.
36-
*/
37-
public PDFExtractor(String filePath) throws IOException {
38-
this(new LocalInputSource(filePath));
39-
}
24+
public class BasePDFExtractor {
25+
protected final PDDocument sourcePdf;
26+
protected final String filename;
4027

4128
/**
4229
* Init from a {@link LocalInputSource}.
4330
*
4431
* @param source The local source.
4532
* @throws IOException Throws if the file can't be accessed.
4633
*/
47-
public PDFExtractor(LocalInputSource source) throws IOException {
34+
protected BasePDFExtractor(LocalInputSource source) throws IOException {
4835
this.filename = source.getFilename();
4936
if (source.isPdf()) {
5037
this.sourcePdf = Loader.loadPDF(source.getFile());
5138
} else {
52-
PDDocument document = new PDDocument();
53-
PDPage page = new PDPage();
39+
var document = new PDDocument();
40+
var page = new PDPage();
5441
document.addPage(page);
5542
BufferedImage bufferedImage = byteArrayToBufferedImage(source.getFile());
5643
PDImageXObject pdImage = LosslessFactory.createFromImage(document, bufferedImage);
@@ -65,7 +52,6 @@ public PDFExtractor(LocalInputSource source) throws IOException {
6552
);
6653
}
6754
this.sourcePdf = document;
68-
6955
}
7056
}
7157

@@ -101,7 +87,7 @@ public static BufferedImage byteArrayToBufferedImage(byte[] byteArray) throws IO
10187
public List<ExtractedPDF> extractSubDocuments(
10288
List<List<Integer>> pageIndexes
10389
) throws IOException {
104-
List<ExtractedPDF> extractedPDFs = new ArrayList<>();
90+
var extractedPDFs = new ArrayList<ExtractedPDF>();
10591

10692
for (List<Integer> pageIndexElement : pageIndexes) {
10793
if (pageIndexElement.isEmpty()) {
@@ -126,65 +112,4 @@ public List<ExtractedPDF> extractSubDocuments(
126112
}
127113
return extractedPDFs;
128114
}
129-
130-
/**
131-
* Extract invoices from the given page indexes (from an invoice-splitter prediction).
132-
*
133-
* @param pageIndexes List of page indexes.
134-
* @return a list of extracted files.
135-
* @throws IOException Throws if the file can't be accessed.
136-
*/
137-
public List<ExtractedPDF> extractInvoices(
138-
List<InvoiceSplitterV1InvoicePageGroup> pageIndexes
139-
) throws IOException {
140-
141-
List<List<Integer>> indexes = pageIndexes
142-
.stream()
143-
.map(InvoiceSplitterV1InvoicePageGroup::getPageIndexes)
144-
.collect(Collectors.toList());
145-
146-
return extractSubDocuments(indexes);
147-
}
148-
149-
/**
150-
* Extract invoices from the given page indexes (from an invoice-splitter prediction).
151-
*
152-
* @param pageIndexes List of page indexes.
153-
* @param strict Whether the extraction should strictly follow the confidence scores or not.
154-
* @return a list of extracted files.
155-
* @throws IOException Throws if the file can't be accessed.
156-
*/
157-
public List<ExtractedPDF> extractInvoices(
158-
List<InvoiceSplitterV1InvoicePageGroup> pageIndexes,
159-
boolean strict
160-
) throws IOException {
161-
List<List<Integer>> correctPageIndexes = new ArrayList<>();
162-
if (!strict) {
163-
return extractInvoices(pageIndexes);
164-
}
165-
Iterator<InvoiceSplitterV1InvoicePageGroup> iterator = pageIndexes.iterator();
166-
List<Integer> currentList = new ArrayList<>();
167-
Double previousConfidence = null;
168-
while (iterator.hasNext()) {
169-
InvoiceSplitterV1InvoicePageGroup pageIndex = iterator.next();
170-
Double confidence = pageIndex.getConfidence();
171-
List<Integer> pageList = pageIndex.getPageIndexes();
172-
173-
if (confidence == 1.0 && previousConfidence == null) {
174-
currentList = new ArrayList<>(pageList);
175-
} else if (confidence == 1.0) {
176-
correctPageIndexes.add(currentList);
177-
currentList = new ArrayList<>(pageList);
178-
} else if (confidence == 0.0 && !iterator.hasNext()) {
179-
currentList.addAll(pageList);
180-
correctPageIndexes.add(currentList);
181-
} else {
182-
correctPageIndexes.add(currentList);
183-
correctPageIndexes.add(pageList);
184-
}
185-
previousConfidence = confidence;
186-
}
187-
return extractSubDocuments(correctPageIndexes);
188-
}
189-
190115
}

src/main/java/com/mindee/pdf/PdfBoxApi.java renamed to src/main/java/com/mindee/pdf/PDFBoxApi.java

Lines changed: 11 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -11,32 +11,30 @@
1111
import java.util.Set;
1212
import java.util.stream.Collectors;
1313
import java.util.stream.IntStream;
14-
import java.util.stream.Stream;
1514
import org.apache.pdfbox.Loader;
1615
import org.apache.pdfbox.pdmodel.PDDocument;
1716

1817
/**
1918
* Allows performing various operations on PDFs.
2019
*/
21-
public final class PdfBoxApi implements PdfOperation {
20+
public final class PDFBoxApi implements PDFOperation {
2221

2322
@Override
24-
public SplitPdf split(SplitQuery splitQuery) throws IOException {
23+
public SplitPDF split(SplitQuery splitQuery) throws IOException {
2524

2625
if (!checkPdfOpen(splitQuery.getFile())) {
2726
throw new MindeeException("This document cannot be open and cannot be split.");
2827
}
2928

30-
try (PDDocument originalDocument = Loader.loadPDF(splitQuery.getFile())) {
31-
try (PDDocument splitDocument = new PDDocument()) {
29+
try (var originalDocument = Loader.loadPDF(splitQuery.getFile())) {
30+
try (var splitDocument = new PDDocument()) {
3231
int totalOriginalPages = countPages(splitQuery.getFile());
3332

3433
if (totalOriginalPages < splitQuery.getPageOptions().getOnMinPages()) {
35-
return new SplitPdf(splitQuery.getFile(), totalOriginalPages);
34+
return new SplitPDF(splitQuery.getFile(), totalOriginalPages);
3635
}
3736

38-
List<Integer> pageRange = getPageRanges(splitQuery.getPageOptions(), totalOriginalPages);
39-
37+
var pageRange = getPageRanges(splitQuery.getPageOptions(), totalOriginalPages);
4038
pageRange
4139
.stream()
4240
.filter(i -> i < totalOriginalPages)
@@ -45,7 +43,7 @@ public SplitPdf split(SplitQuery splitQuery) throws IOException {
4543
try (ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) {
4644
splitDocument.save(outputStream);
4745
byte[] splitPdf = outputStream.toByteArray();
48-
return new SplitPdf(splitPdf, countPages(splitPdf));
46+
return new SplitPDF(splitPdf, countPages(splitPdf));
4947
}
5048
}
5149
}
@@ -55,12 +53,12 @@ private List<Integer> getPageRanges(PageOptions pageOptions, Integer numberOfPag
5553

5654
Set<Integer> pages = Optional
5755
.ofNullable(pageOptions.getPageIndexes())
58-
.map(Collection::stream)
59-
.orElseGet(Stream::empty)
56+
.stream()
57+
.flatMap(Collection::stream)
6058
.filter(x -> x > (numberOfPages) * (-1) && x <= (numberOfPages - 1))
6159
.map(x -> (numberOfPages + x) % numberOfPages)
6260
.collect(Collectors.toSet());
63-
List<Integer> allPages = IntStream.range(0, numberOfPages).boxed().collect(Collectors.toList());
61+
var allPages = IntStream.range(0, numberOfPages).boxed().collect(Collectors.toList());
6462

6563
switch (pageOptions.getOperation()) {
6664
case KEEP_ONLY:
@@ -85,9 +83,6 @@ private boolean checkPdfOpen(byte[] documentFile) {
8583
}
8684

8785
private int countPages(byte[] documentFile) throws IOException {
88-
PDDocument document = Loader.loadPDF(documentFile);
89-
int pageCount = document.getNumberOfPages();
90-
document.close();
91-
return pageCount;
86+
return PDFUtils.getNumberOfPages(documentFile);
9287
}
9388
}

src/main/java/com/mindee/pdf/PdfCompressor.java renamed to src/main/java/com/mindee/pdf/PDFCompressor.java

Lines changed: 6 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -11,14 +11,13 @@
1111
import org.apache.pdfbox.pdmodel.PDPageContentStream;
1212
import org.apache.pdfbox.pdmodel.common.PDRectangle;
1313
import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory;
14-
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
1514
import org.apache.pdfbox.rendering.ImageType;
1615
import org.apache.pdfbox.rendering.PDFRenderer;
1716

1817
/**
1918
* PDF compression class.
2019
*/
21-
public class PdfCompressor {
20+
public class PDFCompressor {
2221
public static byte[] compressPdf(
2322
byte[] pdfData,
2423
Integer imageQuality,
@@ -44,10 +43,10 @@ public static byte[] compressPdf(
4443
}
4544
try (PDDocument inputDoc = Loader.loadPDF(pdfData); PDDocument outputDoc = new PDDocument()) {
4645

47-
PDFRenderer pdfRenderer = new PDFRenderer(inputDoc);
46+
var pdfRenderer = new PDFRenderer(inputDoc);
4847

4948
for (int pageIndex = 0; pageIndex < inputDoc.getNumberOfPages(); pageIndex++) {
50-
PDPage originalPage = inputDoc.getPage(pageIndex);
49+
var originalPage = inputDoc.getPage(pageIndex);
5150
PDRectangle originalPageSize = originalPage.getMediaBox();
5251

5352
processPage(
@@ -92,12 +91,12 @@ private static void processPage(
9291
PDRectangle originalPageSize,
9392
Boolean disableSourceText
9493
) throws IOException {
95-
PDPage newPage = new PDPage(originalPageSize);
94+
var newPage = new PDPage(originalPageSize);
9695
outputDoc.addPage(newPage);
9796

98-
PDImageXObject pdImage = JPEGFactory.createFromImage(outputDoc, image, imageQuality);
97+
var pdImage = JPEGFactory.createFromImage(outputDoc, image, imageQuality);
9998

100-
try (PDPageContentStream contentStream = new PDPageContentStream(outputDoc, newPage)) {
99+
try (var contentStream = new PDPageContentStream(outputDoc, newPage)) {
101100
PDFUtils.addImageToPage(contentStream, pdImage, originalPageSize);
102101
PDFUtils.extractAndAddText(originalDocument, contentStream, pageIndex, disableSourceText);
103102
}

src/main/java/com/mindee/pdf/PdfOperation.java renamed to src/main/java/com/mindee/pdf/PDFOperation.java

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,13 +5,13 @@
55
/**
66
* Minimum PDF operations.
77
*/
8-
public interface PdfOperation {
8+
public interface PDFOperation {
99

1010
/**
1111
* Split a PDF file.
1212
*
1313
* @param splitQuery Options to perform the query.
1414
* @return The split PDF.
1515
*/
16-
SplitPdf split(SplitQuery splitQuery) throws IOException;
16+
SplitPDF split(SplitQuery splitQuery) throws IOException;
1717
}

src/main/java/com/mindee/pdf/PDFUtils.java

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -69,8 +69,8 @@ private static byte[] createPdfFromExistingPdf(
6969
List<Integer> pageNumbers,
7070
boolean closeOriginal
7171
) throws IOException {
72-
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
73-
PDDocument newDocument = new PDDocument();
72+
var outputStream = new ByteArrayOutputStream();
73+
var newDocument = new PDDocument();
7474
int pageCount = document.getNumberOfPages();
7575
pageNumbers
7676
.stream()
@@ -161,7 +161,7 @@ public static List<PdfPageImage> pdfToImages(String filePath) throws IOException
161161
*/
162162
public static List<PdfPageImage> pdfToImages(LocalInputSource source) throws IOException {
163163
PDDocument document = Loader.loadPDF(source.getFile());
164-
PDFRenderer pdfRenderer = new PDFRenderer(document);
164+
var pdfRenderer = new PDFRenderer(document);
165165
List<PdfPageImage> pdfPageImages = new ArrayList<>();
166166
for (int i = 0; i < document.getNumberOfPages(); i++) {
167167
BufferedImage imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer);
@@ -201,7 +201,7 @@ public static PdfPageImage pdfPageToImage(
201201
) throws IOException {
202202
int index = pageNumber - 1;
203203
PDDocument document = Loader.loadPDF(source.getFile());
204-
PDFRenderer pdfRenderer = new PDFRenderer(document);
204+
var pdfRenderer = new PDFRenderer(document);
205205
BufferedImage imageBuffer = pdfPageToImageBuffer(index, document, pdfRenderer);
206206
document.close();
207207
return new PdfPageImage(imageBuffer, index, source.getFilename(), "jpg");
@@ -226,7 +226,7 @@ private static BufferedImage pdfPageToImageBuffer(
226226
}
227227

228228
public static byte[] documentToBytes(PDDocument document) throws IOException {
229-
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
229+
var outputStream = new ByteArrayOutputStream();
230230
document.save(outputStream);
231231
return outputStream.toByteArray();
232232
}

0 commit comments

Comments
 (0)