diff --git a/.github/workflows/_publish-code.yml b/.github/workflows/_publish-code.yml index 3bf9d676..e6f84a50 100644 --- a/.github/workflows/_publish-code.yml +++ b/.github/workflows/_publish-code.yml @@ -7,20 +7,27 @@ on: workflow_call: workflow_dispatch: +permissions: + contents: read + jobs: publish: - name: Publish release to PyPi + name: Publish ${{ matrix.variant }} release to PyPi runs-on: ubuntu-latest strategy: + fail-fast: false + max-parallel: 1 matrix: - python-version: [ "3.11" ] + variant: [ full, lite ] + env: + PYTHON_VERSION: "3.13" steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + - name: Set up Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v6 with: - python-version: ${{ matrix.python-version }} + python-version: ${{ env.PYTHON_VERSION }} - name: Cache dependencies uses: actions/cache@v5 @@ -31,7 +38,17 @@ jobs: ${{ runner.os }}-build- - name: Install dependencies - run: python -m pip install -e .[build] + run: | + if [ "${{ matrix.variant }}" = "lite" ]; then + python -m pip install toml + fi + python -m pip install -e '.[build]' + + - name: Generate lite TOML + if: matrix.variant == 'lite' + run: | + python scripts/generate_lite_toml.py + cp pyproject-lite.toml pyproject.toml - name: Build run: | @@ -49,6 +66,7 @@ jobs: run: twine upload --disable-progress-bar --repository testpypi ./dist/* - name: Upload + if: startsWith(github.ref, 'refs/tags/') env: TWINE_NON_INTERACTIVE: "1" TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} diff --git a/.github/workflows/_publish-docs.yml b/.github/workflows/_publish-docs.yml index e401ab76..35bf98fa 100644 --- a/.github/workflows/_publish-docs.yml +++ b/.github/workflows/_publish-docs.yml @@ -15,10 +15,10 @@ jobs: matrix: python-version: ["3.11"] steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} @@ -32,7 +32,7 @@ jobs: - name: Install dependencies run: | - python -m pip install -e .[docs] + python -m pip install -e '.[docs]' - name: Sphinx make run: | diff --git a/.github/workflows/_smoke-test.yml b/.github/workflows/_smoke-test.yml index cbeb075e..24fe7e1f 100644 --- a/.github/workflows/_smoke-test.yml +++ b/.github/workflows/_smoke-test.yml @@ -25,12 +25,12 @@ jobs: - "3.14" runs-on: "ubuntu-22.04" steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 with: submodules: recursive - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} @@ -45,7 +45,7 @@ jobs: - name: Install dependencies run: | python -m pip install pip - pip install -e . + pip install -e '.' - name: Tests v2 sample code run: | diff --git a/.github/workflows/_static-analysis.yml b/.github/workflows/_static-analysis.yml index 43b63b09..92ce94d2 100644 --- a/.github/workflows/_static-analysis.yml +++ b/.github/workflows/_static-analysis.yml @@ -14,10 +14,10 @@ jobs: matrix: python-version: ["3.11"] steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} @@ -33,7 +33,7 @@ jobs: run: | python -m pip install pip pip install pylic~=3.6.1 - pip install -e . + pip install -e '.' - name: License check run: | @@ -41,7 +41,7 @@ jobs: - name: Install lint dependencies run: | - pip install -e .[lint] + pip install -e '.[lint]' - name: Cache pre-commit uses: actions/cache@v5 diff --git a/.github/workflows/_test-integrations.yml b/.github/workflows/_test-integrations.yml index 11ba9b1a..3d1b8c19 100644 --- a/.github/workflows/_test-integrations.yml +++ b/.github/workflows/_test-integrations.yml @@ -7,6 +7,9 @@ on: workflow_call: workflow_dispatch: +permissions: + contents: read + jobs: pytest: name: Run Integration Tests @@ -21,12 +24,12 @@ jobs: - "3.14" runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 with: submodules: recursive - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} @@ -41,7 +44,7 @@ jobs: - name: Install dependencies run: | python -m pip install pip - pip install -e .[test] + pip install -e '.[test]' - name: Run Integration Testing env: MINDEE_API_KEY: ${{ secrets.MINDEE_API_KEY_SE_TESTS }} @@ -57,7 +60,7 @@ jobs: pytest --cov mindee -m integration - name: Notify Slack Action on Failure - uses: ravsamhq/notify-slack-action@2.3.0 + uses: ravsamhq/notify-slack-action@2.5.0 if: ${{ always() && github.ref_name == 'main' }} with: status: ${{ job.status }} @@ -65,3 +68,52 @@ jobs: notification_title: "[Python] Integration test '{workflow}' is failing" env: SLACK_WEBHOOK_URL: ${{ secrets.PRODUCTION_ISSUES_SLACK_HOOK_URL }} + + pytest-lite: + name: Run Integration Tests + timeout-minutes: 30 + strategy: + matrix: + os: + - "ubuntu-22.04" + python-version: + - "3.10" + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v6 + with: + submodules: recursive + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + + - name: Cache dependencies + uses: actions/cache@v5 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-test-${{ hashFiles('pyproject.toml') }} + restore-keys: | + ${{ runner.os }}-test- + + - name: Install dependencies + run: | + pip install toml + python scripts/generate_lite_toml.py + mv -f pyproject-lite.toml pyproject.toml + pip install -e '.[test]' + shell: bash + - name: Run Integration Testing + env: + MINDEE_API_KEY: ${{ secrets.MINDEE_API_KEY_SE_TESTS }} + WORKFLOW_ID: ${{ secrets.WORKFLOW_ID_SE_TESTS }} + MINDEE_V2_API_KEY: ${{ secrets.MINDEE_V2_SE_TESTS_API_KEY }} + MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID: ${{ secrets.MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID }} + MINDEE_V2_SE_TESTS_BLANK_PDF_URL: ${{ secrets.MINDEE_V2_SE_TESTS_BLANK_PDF_URL }} + MINDEE_V2_SE_TESTS_CLASSIFICATION_MODEL_ID: ${{ secrets.MINDEE_V2_SE_TESTS_CLASSIFICATION_MODEL_ID }} + MINDEE_V2_SE_TESTS_CROP_MODEL_ID: ${{ secrets.MINDEE_V2_SE_TESTS_CROP_MODEL_ID }} + MINDEE_V2_SE_TESTS_SPLIT_MODEL_ID: ${{ secrets.MINDEE_V2_SE_TESTS_SPLIT_MODEL_ID }} + MINDEE_V2_SE_TESTS_OCR_MODEL_ID: ${{ secrets.MINDEE_V2_SE_TESTS_OCR_MODEL_ID }} + run: | + pytest -m "integration and not pypdfium2 and not pillow" diff --git a/.github/workflows/_test-regressions.yml b/.github/workflows/_test-regressions.yml index f685ea9e..fcbc2166 100644 --- a/.github/workflows/_test-regressions.yml +++ b/.github/workflows/_test-regressions.yml @@ -41,7 +41,7 @@ jobs: - name: Install dependencies run: | python -m pip install pip - pip install -e .[test] + pip install -e '.[test]' - name: Run Regression Testing env: MINDEE_API_KEY: ${{ secrets.MINDEE_API_KEY_SE_TESTS }} diff --git a/.github/workflows/_test-units.yml b/.github/workflows/_test-units.yml index ac23cdc9..071051fc 100644 --- a/.github/workflows/_test-units.yml +++ b/.github/workflows/_test-units.yml @@ -23,12 +23,12 @@ jobs: - "3.14" runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 with: submodules: recursive - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} @@ -43,10 +43,53 @@ jobs: - name: Install dependencies run: | python -m pip install pip - pip install -e .[test] + pip install -e '.[test]' - name: Unit testing with pytest - env: - MINDEE_API_KEY: ${{ secrets.MINDEE_API_KEY_SE_TESTS }} run: | pytest --cov mindee --cov-fail-under 87 + + pytest-lite: + name: Run Unit Tests (Lite) + timeout-minutes: 30 + strategy: + matrix: + os: + - "ubuntu-22.04" + - "windows-2022" + python-version: + - "3.10" + - "3.11" + - "3.12" + - "3.13" + - "3.14" + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v6 + with: + submodules: recursive + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + + - name: Cache dependencies + uses: actions/cache@v5 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-test-${{ hashFiles('pyproject.toml') }} + restore-keys: | + ${{ runner.os }}-test- + + - name: Install dependencies + run: | + pip install toml + python scripts/generate_lite_toml.py + mv -f pyproject-lite.toml pyproject.toml + pip install -e '.[test]' + shell: bash + - name: Unit testing with pytest + run: | + pytest + diff --git a/.github/workflows/_workflow_lint.yml b/.github/workflows/_workflow_lint.yml new file mode 100644 index 00000000..8eb21904 --- /dev/null +++ b/.github/workflows/_workflow_lint.yml @@ -0,0 +1,20 @@ +name: Lint workflows + +on: + workflow_call: + +permissions: + contents: read + +jobs: + actionlint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Download actionlint + id: get_actionlint + run: bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash) + shell: bash + - name: Run actionlint + run: ${{ steps.get_actionlint.outputs.executable }} -color + shell: bash diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml index 89f5020f..79bd91f2 100644 --- a/.github/workflows/pull-request.yml +++ b/.github/workflows/pull-request.yml @@ -8,8 +8,11 @@ permissions: pull-requests: read jobs: + workflow-lint: + uses: ./.github/workflows/_workflow_lint.yml static-analysis: uses: ./.github/workflows/_static-analysis.yml + needs: workflow-lint test-units: uses: ./.github/workflows/_test-units.yml needs: static-analysis diff --git a/.gitignore b/.gitignore index 53ef3307..8e89f73d 100644 --- a/.gitignore +++ b/.gitignore @@ -140,3 +140,4 @@ _test.py _test*.py _test.json local_test +pyproject-lite.toml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9ef3aca5..c3990a97 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.14.9 + rev: v0.15.16 hooks: - id: ruff-check args: [ --fix, --exit-non-zero-on-fix] @@ -27,9 +27,11 @@ repos: hooks: - id: pip-audit args: ["."] + files: ^(requirements.*\.txt|setup\.cfg|setup\.py|pyproject\.toml)$ + stages: [pre-push] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.18.2 + rev: v2.1.0 hooks: - id: mypy args: [] @@ -39,6 +41,7 @@ repos: - types-setuptools - importlib-metadata - types-Pillow + - types-toml - repo: local hooks: - id: sphinx-html @@ -47,6 +50,7 @@ repos: language: system pass_filenames: false files: ^docs/.*$|^mindee/.*\.py$ + stages: [pre-push] - id: sphinx-linkcheck name: Sphinx Linkcheck @@ -54,3 +58,4 @@ repos: language: system pass_filenames: false files: ^docs/.*$|^mindee/.*\.py$ + stages: [pre-push] diff --git a/mindee/dependencies/__init__.py b/mindee/dependencies/__init__.py new file mode 100644 index 00000000..4fb7c41d --- /dev/null +++ b/mindee/dependencies/__init__.py @@ -0,0 +1,9 @@ +from mindee.dependencies.checkers import PILLOW_AVAILABLE, PYPDFIUM2_AVAILABLE +from mindee.dependencies.decorators import requires_pillow, requires_pypdfium2 + +__all__ = [ + "PILLOW_AVAILABLE", + "PYPDFIUM2_AVAILABLE", + "requires_pillow", + "requires_pypdfium2", +] diff --git a/mindee/dependencies/checkers.py b/mindee/dependencies/checkers.py new file mode 100644 index 00000000..ad8bbf8e --- /dev/null +++ b/mindee/dependencies/checkers.py @@ -0,0 +1,33 @@ +from mindee.error.mindee_dependency_error import MindeeDependencyError + +try: + import PIL # noqa: F401 #pylint: disable=unused-import + + PILLOW_AVAILABLE = True +except ImportError: + PILLOW_AVAILABLE = False + +try: + import pypdfium2 # noqa: F401 #pylint: disable=unused-import + + PYPDFIUM2_AVAILABLE = True +except ImportError: + PYPDFIUM2_AVAILABLE = False + + +def require_pillow() -> None: + """Raises a clear error if Pillow is not installed.""" + if not PILLOW_AVAILABLE: + raise MindeeDependencyError( + "This feature requires the 'Pillow' library. " + "Install it directly or run `pip install mindee` instead of `mindee-lite`." + ) + + +def require_pypdfium2() -> None: + """Raises a clear error if PyPDFium2 is not installed.""" + if not PYPDFIUM2_AVAILABLE: + raise MindeeDependencyError( + "This feature requires the 'PyPDFium2' library. " + "Install it directly or run `pip install mindee` instead of `mindee-lite`." + ) diff --git a/mindee/dependencies/decorators.py b/mindee/dependencies/decorators.py new file mode 100644 index 00000000..d204ae71 --- /dev/null +++ b/mindee/dependencies/decorators.py @@ -0,0 +1,30 @@ +import functools +from collections.abc import Callable +from typing import ParamSpec, TypeVar + +from mindee.dependencies.checkers import require_pillow, require_pypdfium2 + +P = ParamSpec("P") +R = TypeVar("R") + + +def requires_pillow(func: Callable[P, R]) -> Callable[P, R]: + """Decorator to enforce Pillow availability on a function/method.""" + + @functools.wraps(func) + def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: + require_pillow() + return func(*args, **kwargs) + + return wrapper + + +def requires_pypdfium2(func: Callable[P, R]) -> Callable[P, R]: + """Decorator to enforce PyPDFium2 availability on a function/method.""" + + @functools.wraps(func) + def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: + require_pypdfium2() + return func(*args, **kwargs) + + return wrapper diff --git a/mindee/error/mindee_dependency_error.py b/mindee/error/mindee_dependency_error.py new file mode 100644 index 00000000..0fd04d00 --- /dev/null +++ b/mindee/error/mindee_dependency_error.py @@ -0,0 +1,5 @@ +from mindee.error import MindeeError + + +class MindeeDependencyError(MindeeError, ImportError): + """An exception relating to missing dependencies.""" diff --git a/mindee/image/extracted_image.py b/mindee/image/extracted_image.py index db76771d..6d0df276 100644 --- a/mindee/image/extracted_image.py +++ b/mindee/image/extracted_image.py @@ -1,13 +1,22 @@ +from __future__ import annotations + import io from pathlib import Path +from typing import Any -from PIL import Image - +from mindee.dependencies.checkers import PILLOW_AVAILABLE +from mindee.dependencies.decorators import requires_pillow from mindee.error.mindee_error import MindeeError from mindee.input.file_input import FileInput from mindee.input.local_input_source import LocalInputSource from mindee.logger import logger +if PILLOW_AVAILABLE: + # pylint: disable=import-error + from PIL import Image +else: + Image: Any = None # type: ignore[no-redef] # pylint: disable=invalid-name + class ExtractedImage: """Generic class for image extraction.""" @@ -45,6 +54,7 @@ def __init__( self._page_id = page_id self._element_id = 0 if element_id is None else element_id + @requires_pillow def save_to_file(self, output_path: Path | str, file_format: str | None = None): """ Saves the document to a file. diff --git a/mindee/image/image_compressor.py b/mindee/image/image_compressor.py index d2d75ab7..37241933 100644 --- a/mindee/image/image_compressor.py +++ b/mindee/image/image_compressor.py @@ -1,9 +1,19 @@ +from __future__ import annotations + import io -from typing import BinaryIO +from typing import Any, BinaryIO + +from mindee.dependencies.checkers import PILLOW_AVAILABLE +from mindee.dependencies.decorators import requires_pillow -from PIL import Image +if PILLOW_AVAILABLE: + # pylint: disable=import-error + from PIL import Image +else: + Image: Any = None # type: ignore[no-redef] # pylint: disable=invalid-name +@requires_pillow def compress_image( image_buffer: BinaryIO | bytes, quality: int = 85, diff --git a/mindee/image/image_extractor.py b/mindee/image/image_extractor.py index 2dc04728..0a33168a 100644 --- a/mindee/image/image_extractor.py +++ b/mindee/image/image_extractor.py @@ -1,9 +1,11 @@ -import io -from typing import BinaryIO +from __future__ import annotations -import pypdfium2 as pdfium -from PIL import Image +import io +from typing import Any, BinaryIO +from mindee.dependencies import requires_pypdfium2 +from mindee.dependencies.checkers import PILLOW_AVAILABLE, PYPDFIUM2_AVAILABLE +from mindee.dependencies.decorators import requires_pillow from mindee.error.mindee_error import MindeeError from mindee.geometry.point import Point from mindee.geometry.polygon import Polygon, get_min_max_x, get_min_max_y @@ -11,7 +13,22 @@ from mindee.input.bytes_input import BytesInput from mindee.input.local_input_source import LocalInputSource +if PYPDFIUM2_AVAILABLE: + # pylint: disable=import-error + import pypdfium2 as pdfium +else: + pdfium = None # pylint: disable=invalid-name + + +if PILLOW_AVAILABLE: + # pylint: disable=import-error + from PIL import Image +else: + Image: Any = None # type: ignore[no-redef] # pylint: disable=invalid-name + +@requires_pillow +@requires_pypdfium2 def attach_image_as_new_file( # type: ignore input_buffer: BinaryIO, ) -> pdfium.PdfDocument: @@ -42,6 +59,7 @@ def attach_image_as_new_file( # type: ignore return pdf +@requires_pillow def extract_image_from_polygon( page_content: Image.Image, polygon: list[Point], @@ -72,6 +90,7 @@ def extract_image_from_polygon( return save_image_to_buffer(cropped_image, file_format) +@requires_pillow def save_image_to_buffer(image: Image.Image, file_format: str) -> bytes: """ Saves an image as a buffer. @@ -86,6 +105,7 @@ def save_image_to_buffer(image: Image.Image, file_format: str) -> bytes: return buffer.read() +@requires_pillow def determine_file_format(input_source: LocalInputSource) -> str: """ Retrieves the file format from an input source. @@ -111,6 +131,7 @@ def get_file_extension(file_format: str): return file_format.lower() if file_format != "JPEG" else "jpg" +@requires_pillow def extract_multiple_images_from_source( input_source: LocalInputSource, page_id: int, @@ -150,6 +171,7 @@ def extract_multiple_images_from_source( return extracted_elements +@requires_pypdfium2 def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: ignore """ Loads a PDF document from a local input source. diff --git a/mindee/input/local_input_source.py b/mindee/input/local_input_source.py index cde4418a..e22fef8e 100644 --- a/mindee/input/local_input_source.py +++ b/mindee/input/local_input_source.py @@ -1,18 +1,26 @@ +from __future__ import annotations + import io import mimetypes import tempfile from collections.abc import Sequence from typing import BinaryIO -import pypdfium2 as pdfium - +from mindee.dependencies import requires_pypdfium2 +from mindee.dependencies.checkers import PYPDFIUM2_AVAILABLE from mindee.error.mimetype_error import MimeTypeError from mindee.error.mindee_error import MindeeError, MindeeSourceError from mindee.image import compress_image from mindee.input.page_options import KEEP_ONLY, REMOVE, PageOptions from mindee.logger import logger from mindee.pdf.pdf_compressor import compress_pdf -from mindee.pdf.pdf_utils import has_source_text +from mindee.pdf.pdf_utils import pdf_has_source_text + +if PYPDFIUM2_AVAILABLE: + # pylint: disable=import-error + import pypdfium2 as pdfium +else: + pdfium = None # pylint: disable=invalid-name mimetypes.add_type("image/heic", ".heic") mimetypes.add_type("image/heic", ".heif") @@ -42,18 +50,21 @@ def __init__(self) -> None: Initialize a LocalInputSource object. """ self._check_mimetype() - if self.is_pdf(): self.file_object.seek(0) - try: - pdf = pdfium.PdfDocument(self.file_object) - self.page_count = len(pdf) - except pdfium.PdfiumError as e: - logger.warning( - "Could not open PDF file: %s due to %s", self.filename, e - ) + # Some broken (yet fixable) PDFs can cause pdfium to crash on open. + if PYPDFIUM2_AVAILABLE: + try: + pdf = pdfium.PdfDocument(self.file_object) + self.page_count = len(pdf) + except pdfium.PdfiumError as e: + logger.warning( + "Could not open PDF file: %s due to %s", self.filename, e + ) + self.page_count = 0 + self.file_object.seek(0) + else: self.page_count = 0 - self.file_object.seek(0) else: self.page_count = 1 logger.debug( @@ -116,6 +127,7 @@ def is_pdf(self) -> bool: """:return: True if the file is a PDF.""" return self.file_mimetype == "application/pdf" + @requires_pypdfium2 def apply_page_options(self, page_options: PageOptions) -> None: """Apply cut and merge options on multipage documents.""" if not self.is_pdf(): @@ -165,6 +177,7 @@ def process_pdf( raise MindeeSourceError("Resulting PDF would have no pages left.") self.merge_pdf_pages(pages_to_keep) + @requires_pypdfium2 def merge_pdf_pages(self, page_numbers: set) -> None: """ Create a new PDF from pages and set it to ``file_object``. @@ -184,6 +197,7 @@ def merge_pdf_pages(self, page_numbers: set) -> None: new_pdf.close() pdf.close() + @requires_pypdfium2 def is_pdf_empty(self) -> bool: """ Check if the PDF is empty. @@ -226,8 +240,9 @@ def has_source_text(self) -> bool: """ if not self.is_pdf(): return False - return has_source_text(self.file_object.read()) + return pdf_has_source_text(self.file_object.read()) + @requires_pypdfium2 def compress( self, quality: int = 85, diff --git a/mindee/pdf/__init__.py b/mindee/pdf/__init__.py index 7ca6f7c0..15b724dc 100644 --- a/mindee/pdf/__init__.py +++ b/mindee/pdf/__init__.py @@ -2,14 +2,14 @@ from mindee.pdf.pdf_compressor import compress_pdf from mindee.pdf.pdf_utils import ( extract_text_from_pdf, - has_source_text, lerp, + pdf_has_source_text, ) __all__ = [ "PDFCharData", "compress_pdf", "extract_text_from_pdf", - "has_source_text", "lerp", + "pdf_has_source_text", ] diff --git a/mindee/pdf/extracted_pdf.py b/mindee/pdf/extracted_pdf.py index d9d89e78..21a0137f 100644 --- a/mindee/pdf/extracted_pdf.py +++ b/mindee/pdf/extracted_pdf.py @@ -1,11 +1,19 @@ +from __future__ import annotations + from pathlib import Path from typing import BinaryIO -import pypdfium2 as pdfium - +from mindee.dependencies.checkers import PYPDFIUM2_AVAILABLE +from mindee.dependencies.decorators import requires_pypdfium2 from mindee.error.mindee_error import MindeeError from mindee.input.bytes_input import BytesInput +if PYPDFIUM2_AVAILABLE: + # pylint: disable=import-error + import pypdfium2 as pdfium +else: + pdfium = None # pylint: disable=invalid-name + class ExtractedPDF: """An extracted sub-Pdf.""" @@ -17,6 +25,7 @@ def __init__(self, pdf_bytes: BinaryIO, filename: str): self.pdf_bytes = pdf_bytes self.filename = filename + @requires_pypdfium2 def get_page_count(self) -> int: """Get the number of pages in the PDF file.""" try: diff --git a/mindee/pdf/pdf_compressor.py b/mindee/pdf/pdf_compressor.py index cac26607..0df4c0fc 100644 --- a/mindee/pdf/pdf_compressor.py +++ b/mindee/pdf/pdf_compressor.py @@ -1,25 +1,40 @@ +from __future__ import annotations + import io import logging from ctypes import POINTER, c_char_p, c_ushort from threading import RLock -from typing import BinaryIO - -import pypdfium2 as pdfium -import pypdfium2.raw as pdfium_c -from PIL import Image +from typing import Any, BinaryIO -from mindee.image import compress_image +from mindee.dependencies.checkers import PILLOW_AVAILABLE, PYPDFIUM2_AVAILABLE +from mindee.dependencies.decorators import requires_pillow, requires_pypdfium2 +from mindee.image.image_compressor import compress_image from mindee.pdf.pdf_char_data import PDFCharData from mindee.pdf.pdf_utils import ( extract_text_from_pdf, - has_source_text, lerp, + pdf_has_source_text, ) +if PYPDFIUM2_AVAILABLE: + # pylint: disable=import-error + import pypdfium2 as pdfium + import pypdfium2.raw as pdfium_c +else: + pdfium: Any = None # type: ignore[no-redef] # pylint: disable=invalid-name + pdfium_c: Any = None # type: ignore[no-redef] # pylint: disable=invalid-name + +if PILLOW_AVAILABLE: + # pylint: disable=import-error + from PIL import Image +else: + Image: Any = None # type: ignore[no-redef] # pylint: disable=invalid-name + logger = logging.getLogger(__name__) MIN_QUALITY = 1 +@requires_pypdfium2 def compress_pdf( pdf_data: BinaryIO | bytes, image_quality: int = 85, @@ -41,7 +56,7 @@ def compress_pdf( else: pdf_bytes = pdf_data - if has_source_text(pdf_bytes): + if pdf_has_source_text(pdf_bytes): if force_source_text_compression: if not disable_source_text: logger.warning("Re-writing PDF source-text is an EXPERIMENTAL feature.") @@ -111,6 +126,7 @@ def _compress_pdf_pages( return None +@requires_pypdfium2 def add_text_to_pdf_page( # type: ignore page: pdfium.PdfPage, page_id: int, @@ -146,6 +162,8 @@ def add_text_to_pdf_page( # type: ignore pdfium_c.FPDFPage_GenerateContent(page.raw) +@requires_pypdfium2 +@requires_pillow def _compress_pages_with_quality( pdf_data: bytes, image_quality: int, @@ -183,6 +201,7 @@ def _is_compression_successful( return total_compressed_size + total_compressed_size * overhead < original_size +@requires_pypdfium2 def _rasterize_page( # type: ignore page: pdfium.PdfPage, quality: int = 85, @@ -200,6 +219,7 @@ def _rasterize_page( # type: ignore return buffer.getvalue() +@requires_pypdfium2 def _collect_images_as_pdf(image_list: list[bytes]) -> pdfium.PdfDocument: # type: ignore """ Converts a list of JPEG images into pages in a PdfDocument. diff --git a/mindee/pdf/pdf_extractor.py b/mindee/pdf/pdf_extractor.py index 1c599c1a..3e081aa3 100644 --- a/mindee/pdf/pdf_extractor.py +++ b/mindee/pdf/pdf_extractor.py @@ -1,14 +1,27 @@ +from __future__ import annotations + import io from pathlib import Path -from typing import BinaryIO - -import pypdfium2 as pdfium -from PIL import Image +from typing import Any, BinaryIO +from mindee.dependencies.checkers import PILLOW_AVAILABLE, PYPDFIUM2_AVAILABLE +from mindee.dependencies.decorators import requires_pillow, requires_pypdfium2 from mindee.error.mindee_error import MindeeError from mindee.input.local_input_source import LocalInputSource from mindee.pdf.extracted_pdf import ExtractedPDF +if PYPDFIUM2_AVAILABLE: + # pylint: disable=import-error + import pypdfium2 as pdfium +else: + pdfium = None # pylint: disable=invalid-name + +if PILLOW_AVAILABLE: + # pylint: disable=import-error + from PIL import Image +else: + Image: Any = None # type: ignore[no-redef] # pylint: disable=invalid-name + class PDFExtractor: """PDF extraction class.""" @@ -16,6 +29,7 @@ class PDFExtractor: _source_pdf: BinaryIO _filename: str + @requires_pillow def __init__(self, local_input: LocalInputSource): self._filename = local_input.filename if local_input.is_pdf(): @@ -25,11 +39,13 @@ def __init__(self, local_input: LocalInputSource): self._source_pdf = io.BytesIO() pdf_image.save(self._source_pdf, format="PDF") + @requires_pypdfium2 def get_page_count(self) -> int: """Get the number of pages in the PDF file.""" pdf = pdfium.PdfDocument(self._source_pdf) return len(pdf) + @requires_pypdfium2 def cut_pages(self, page_indexes: list) -> BinaryIO: """ Create a new PDF from pages and save it into a buffer. @@ -45,6 +61,7 @@ def cut_pages(self, page_indexes: list) -> BinaryIO: new_pdf.save(bytes_io) return bytes_io + @requires_pypdfium2 def extract_sub_documents( self, page_indexes: list[list[int]] ) -> list[ExtractedPDF]: diff --git a/mindee/pdf/pdf_utils.py b/mindee/pdf/pdf_utils.py index 129ead22..de7cd833 100644 --- a/mindee/pdf/pdf_utils.py +++ b/mindee/pdf/pdf_utils.py @@ -1,16 +1,28 @@ +from __future__ import annotations + import ctypes from ctypes import byref, c_double, c_int, create_string_buffer from threading import RLock +from typing import Any -import pypdfium2 as pdfium -import pypdfium2.raw as pdfium_c - +from mindee.dependencies.checkers import PYPDFIUM2_AVAILABLE +from mindee.dependencies.decorators import requires_pypdfium2 from mindee.pdf.pdf_char_data import PDFCharData +if PYPDFIUM2_AVAILABLE: + # pylint: disable=import-error + import pypdfium2 as pdfium + import pypdfium2.raw as pdfium_c +else: + pdfium: Any = None # type: ignore[no-redef] # pylint: disable=invalid-name + pdfium_c: Any = None # type: ignore[no-redef] # pylint: disable=invalid-name + + FALLBACK_FONT = "Helvetica" -def has_source_text(pdf_bytes: bytes) -> bool: +@requires_pypdfium2 +def pdf_has_source_text(pdf_bytes: bytes) -> bool: """ Checks if the provided PDF bytes contain source text. @@ -28,6 +40,7 @@ def has_source_text(pdf_bytes: bytes) -> bool: pdf.close() +@requires_pypdfium2 def extract_text_from_pdf(pdf_bytes: bytes) -> list[list[PDFCharData]]: """ Extracts the raw text from a given PDF's bytes along with font data. @@ -45,6 +58,7 @@ def extract_text_from_pdf(pdf_bytes: bytes) -> list[list[PDFCharData]]: return char_data_list +@requires_pypdfium2 def _process_page(page, page_id: int, pdfium_lock: RLock) -> list[PDFCharData]: """ Processes a single page of the PDF. @@ -73,6 +87,7 @@ def _process_page(page, page_id: int, pdfium_lock: RLock) -> list[PDFCharData]: return char_data_list +@requires_pypdfium2 def _process_char( i: int, text_handler, @@ -131,6 +146,7 @@ def _process_char( return char_data_list +@requires_pypdfium2 def _get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict: """ Retrieves information about a specific character. @@ -170,6 +186,7 @@ def _get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict: } +@requires_pypdfium2 def _get_font_name(text_handler, i: int) -> str: """ Retrieves the font name for a specific character. @@ -189,6 +206,7 @@ def _get_font_name(text_handler, i: int) -> str: ) +@requires_pypdfium2 def _get_font_flags(text_handler, i: int) -> int: """ Retrieves the font flags for a specific character. @@ -202,6 +220,7 @@ def _get_font_flags(text_handler, i: int) -> int: return flags.value +@requires_pypdfium2 def _get_char_box( i: int, text_handler, pdfium_lock: RLock ) -> tuple[float, float, float, float]: @@ -221,6 +240,7 @@ def _get_char_box( return left.value, right.value, bottom.value, top.value +@requires_pypdfium2 def _get_page_rotation(page, pdfium_lock: RLock) -> int: """ Retrieves the rotation value for a specific page. diff --git a/mindee/v2/client_options/base_parameters.py b/mindee/v2/client_options/base_parameters.py index 8930142a..b6ad062e 100644 --- a/mindee/v2/client_options/base_parameters.py +++ b/mindee/v2/client_options/base_parameters.py @@ -1,5 +1,6 @@ from abc import ABC -from dataclasses import dataclass, field +from dataclasses import dataclass +from typing import ClassVar from mindee.client_options.polling_options import PollingOptions @@ -19,7 +20,7 @@ class BaseParameters(ABC): close_file: bool = True """Whether to close the file after product.""" - _slug: str = field(init=False) + _slug: ClassVar[str] """Slug of the endpoint.""" def get_form_data(self) -> dict[str, str | list[str]]: diff --git a/mindee/v2/parsing/inference/base_response.py b/mindee/v2/parsing/inference/base_response.py index 865bab55..e849452a 100644 --- a/mindee/v2/parsing/inference/base_response.py +++ b/mindee/v2/parsing/inference/base_response.py @@ -1,4 +1,5 @@ from abc import ABC +from typing import ClassVar from mindee.parsing.common.common_response import CommonResponse from mindee.v2.parsing.inference.base_inference import BaseInference @@ -9,7 +10,7 @@ class BaseResponse(ABC, CommonResponse): inference: BaseInference """The inference result for a split utility request""" - _slug: str + _slug: ClassVar[str] """Slug of the inference.""" def __str__(self) -> str: diff --git a/mindee/v2/product/classification/classification_inference.py b/mindee/v2/product/classification/classification_inference.py index 9e00ad60..2b2c37f1 100644 --- a/mindee/v2/product/classification/classification_inference.py +++ b/mindee/v2/product/classification/classification_inference.py @@ -1,3 +1,5 @@ +from typing import ClassVar + from mindee.parsing.common.string_dict import StringDict from mindee.v2.parsing.inference.base_inference import BaseInference from mindee.v2.product.classification.classification_result import ClassificationResult @@ -8,7 +10,7 @@ class ClassificationInference(BaseInference): result: ClassificationResult """Result of a classification inference.""" - _slug: str = "classification" + _slug: ClassVar[str] = "classification" """Slug of the endpoint.""" def __init__(self, raw_response: StringDict) -> None: diff --git a/mindee/v2/product/classification/classification_response.py b/mindee/v2/product/classification/classification_response.py index c5e82c72..a628f7df 100644 --- a/mindee/v2/product/classification/classification_response.py +++ b/mindee/v2/product/classification/classification_response.py @@ -1,3 +1,5 @@ +from typing import ClassVar + from mindee.parsing.common.string_dict import StringDict from mindee.v2.parsing.inference.base_response import BaseResponse from mindee.v2.product.classification.classification_inference import ( @@ -11,7 +13,7 @@ class ClassificationResponse(BaseResponse): inference: ClassificationInference """Inference object for classification inference.""" - _slug: str = "products/classification/results" + _slug: ClassVar[str] = "products/classification/results" """Slug of the inference.""" def __init__(self, raw_response: StringDict) -> None: diff --git a/mindee/v2/product/classification/params/classification_parameters.py b/mindee/v2/product/classification/params/classification_parameters.py index 8a69975f..1736d922 100644 --- a/mindee/v2/product/classification/params/classification_parameters.py +++ b/mindee/v2/product/classification/params/classification_parameters.py @@ -1,3 +1,5 @@ +from typing import ClassVar + from mindee.v2.client_options.base_parameters import BaseParameters @@ -6,4 +8,4 @@ class ClassificationParameters(BaseParameters): Parameters accepted by the classification utility v2 endpoint. """ - _slug: str = "products/classification" + _slug: ClassVar[str] = "products/classification" diff --git a/mindee/v2/product/crop/crop_inference.py b/mindee/v2/product/crop/crop_inference.py index 6839e534..3b49837a 100644 --- a/mindee/v2/product/crop/crop_inference.py +++ b/mindee/v2/product/crop/crop_inference.py @@ -1,3 +1,5 @@ +from typing import ClassVar + from mindee.parsing.common.string_dict import StringDict from mindee.v2.parsing.inference.base_inference import BaseInference from mindee.v2.product.crop.crop_result import CropResult @@ -9,7 +11,7 @@ class CropInference(BaseInference): result: CropResult """Result of a crop inference.""" - _slug: str = "crop" + _slug: ClassVar[str] = "crop" """Slug of the endpoint.""" def __init__(self, raw_response: StringDict) -> None: diff --git a/mindee/v2/product/crop/crop_response.py b/mindee/v2/product/crop/crop_response.py index 8b70cc19..2c2c771c 100644 --- a/mindee/v2/product/crop/crop_response.py +++ b/mindee/v2/product/crop/crop_response.py @@ -1,3 +1,5 @@ +from typing import ClassVar + from mindee.parsing.common.string_dict import StringDict from mindee.v2.parsing.inference.base_response import BaseResponse from mindee.v2.product.crop.crop_inference import CropInference @@ -9,7 +11,7 @@ class CropResponse(BaseResponse): inference: CropInference """Inference object for crop inference.""" - _slug: str = "products/crop/results" + _slug: ClassVar[str] = "products/crop/results" """Slug of the inference.""" def __init__(self, raw_response: StringDict) -> None: diff --git a/mindee/v2/product/crop/params/crop_parameters.py b/mindee/v2/product/crop/params/crop_parameters.py index e46d17c9..a48449c1 100644 --- a/mindee/v2/product/crop/params/crop_parameters.py +++ b/mindee/v2/product/crop/params/crop_parameters.py @@ -1,3 +1,5 @@ +from typing import ClassVar + from mindee.v2.client_options.base_parameters import BaseParameters @@ -6,4 +8,4 @@ class CropParameters(BaseParameters): Parameters accepted by the crop utility v2 endpoint. """ - _slug: str = "products/crop" + _slug: ClassVar[str] = "products/crop" diff --git a/mindee/v2/product/extraction/extraction_response.py b/mindee/v2/product/extraction/extraction_response.py index 3dac7e7a..b695dfe9 100644 --- a/mindee/v2/product/extraction/extraction_response.py +++ b/mindee/v2/product/extraction/extraction_response.py @@ -1,3 +1,5 @@ +from typing import ClassVar + from mindee.parsing.common.string_dict import StringDict from mindee.v2.parsing.inference.base_response import BaseResponse from mindee.v2.product.extraction.extraction_inference import ExtractionInference @@ -8,7 +10,7 @@ class ExtractionResponse(BaseResponse): inference: ExtractionInference """Inference result.""" - _slug: str = "products/extraction/results" + _slug: ClassVar[str] = "products/extraction/results" """Slug of the inference.""" def __init__(self, raw_response: StringDict) -> None: diff --git a/mindee/v2/product/extraction/params/extraction_parameters.py b/mindee/v2/product/extraction/params/extraction_parameters.py index c71f3ee7..e7f265cf 100644 --- a/mindee/v2/product/extraction/params/extraction_parameters.py +++ b/mindee/v2/product/extraction/params/extraction_parameters.py @@ -1,5 +1,6 @@ import json from dataclasses import dataclass +from typing import ClassVar from mindee.v2.client_options.base_parameters import BaseParameters from mindee.v2.product.extraction.params.data_schema import DataSchema @@ -31,7 +32,7 @@ class ExtractionParameters(BaseParameters): Not recommended, for specific use only. """ - _slug: str = "inferences" + _slug: ClassVar[str] = "inferences" """Slug of the endpoint.""" def __post_init__(self): diff --git a/mindee/v2/product/ocr/ocr_inference.py b/mindee/v2/product/ocr/ocr_inference.py index 60eda267..47730a9d 100644 --- a/mindee/v2/product/ocr/ocr_inference.py +++ b/mindee/v2/product/ocr/ocr_inference.py @@ -1,3 +1,5 @@ +from typing import ClassVar + from mindee.parsing.common.string_dict import StringDict from mindee.v2.parsing.inference.base_inference import BaseInference from mindee.v2.product.ocr.ocr_result import OCRResult @@ -8,7 +10,7 @@ class OCRInference(BaseInference): result: OCRResult """Result of a ocr inference.""" - _slug: str = "ocr" + _slug: ClassVar[str] = "ocr" """Slug of the endpoint.""" def __init__(self, raw_response: StringDict) -> None: diff --git a/mindee/v2/product/ocr/ocr_response.py b/mindee/v2/product/ocr/ocr_response.py index da21bfea..41dae2db 100644 --- a/mindee/v2/product/ocr/ocr_response.py +++ b/mindee/v2/product/ocr/ocr_response.py @@ -1,3 +1,5 @@ +from typing import ClassVar + from mindee.parsing.common.string_dict import StringDict from mindee.v2.parsing.inference.base_response import BaseResponse from mindee.v2.product.ocr.ocr_inference import OCRInference @@ -9,7 +11,7 @@ class OCRResponse(BaseResponse): inference: OCRInference """Inference object for ocr inference.""" - _slug: str = "products/ocr/results" + _slug: ClassVar[str] = "products/ocr/results" """Slug of the inference.""" def __init__(self, raw_response: StringDict) -> None: diff --git a/mindee/v2/product/ocr/params/ocr_parameters.py b/mindee/v2/product/ocr/params/ocr_parameters.py index 5e8bb449..1edaf9b1 100644 --- a/mindee/v2/product/ocr/params/ocr_parameters.py +++ b/mindee/v2/product/ocr/params/ocr_parameters.py @@ -1,3 +1,5 @@ +from typing import ClassVar + from mindee.v2.client_options.base_parameters import BaseParameters @@ -6,4 +8,4 @@ class OCRParameters(BaseParameters): Parameters accepted by the ocr utility v2 endpoint. """ - _slug: str = "products/ocr" + _slug: ClassVar[str] = "products/ocr" diff --git a/mindee/v2/product/split/params/split_parameters.py b/mindee/v2/product/split/params/split_parameters.py index 0b9036b2..f1dc0745 100644 --- a/mindee/v2/product/split/params/split_parameters.py +++ b/mindee/v2/product/split/params/split_parameters.py @@ -1,3 +1,5 @@ +from typing import ClassVar + from mindee.v2.client_options.base_parameters import BaseParameters @@ -6,4 +8,4 @@ class SplitParameters(BaseParameters): Parameters accepted by the split utility v2 endpoint. """ - _slug: str = "products/split" + _slug: ClassVar[str] = "products/split" diff --git a/mindee/v2/product/split/split_inference.py b/mindee/v2/product/split/split_inference.py index 6e68bdc2..8365818f 100644 --- a/mindee/v2/product/split/split_inference.py +++ b/mindee/v2/product/split/split_inference.py @@ -1,3 +1,5 @@ +from typing import ClassVar + from mindee.parsing.common.string_dict import StringDict from mindee.v2.parsing.inference.base_inference import BaseInference from mindee.v2.product.split.split_result import SplitResult @@ -9,7 +11,7 @@ class SplitInference(BaseInference): result: SplitResult """Result of a split inference.""" - _slug: str = "split" + _slug: ClassVar[str] = "split" """Slug of the endpoint.""" def __init__(self, raw_response: StringDict) -> None: diff --git a/mindee/v2/product/split/split_response.py b/mindee/v2/product/split/split_response.py index ff8ace92..7163b7aa 100644 --- a/mindee/v2/product/split/split_response.py +++ b/mindee/v2/product/split/split_response.py @@ -1,3 +1,5 @@ +from typing import ClassVar + from mindee.parsing.common.string_dict import StringDict from mindee.v2.parsing.inference.base_response import BaseResponse from mindee.v2.product.split.split_inference import SplitInference @@ -9,7 +11,7 @@ class SplitResponse(BaseResponse): inference: SplitInference """Inference object for split inference.""" - _slug: str = "products/split/results" + _slug: ClassVar[str] = "products/split/results" """Slug of the inference.""" def __init__(self, raw_response: StringDict) -> None: diff --git a/pyproject.toml b/pyproject.toml index 14b2296d..c623aa8f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,11 +55,12 @@ test = [ docs = [ "sphinx~=9.1.0", "sphinx_rtd_theme~=3.1.0", - "sphinx-autodoc-typehints~=3.10.4", + "sphinx-autodoc-typehints>=3.10.4,<4.0.0", ] build = [ "build", "twine", + "toml" ] [project.scripts] @@ -119,14 +120,17 @@ safe_licenses = [ [tool.pytest.ini_options] -addopts = "--pyargs --cov-report term:skip-covered --cov-report term-missing -m 'not regression and not integration'" +addopts = "--pyargs --cov-report term:skip-covered --cov-report term-missing -m 'not integration and not regression and not lite'" python_files = "test*.py" junit_family = "xunit2" markers = [ - "regression: marks tests as regression tests - select with '-m regression'", "lineitems: debug line items", "integration: integration tests that send calls to the API - select with '-m integration'", - "v2: Tests specific to version 2 of the API" + "lite: tests for mindee-lite", + "v2: tests specific to version 2 of the API", + "pillow: tests that require the use of the Pillow (PIL) library", + "pypdfium2: tests that require the usage of the pypdfium2 library", + "regression: marks tests as regression tests - select with '-m regression'", ] testpaths = [ "tests", diff --git a/scripts/generate_lite_toml.py b/scripts/generate_lite_toml.py new file mode 100644 index 00000000..c8778d6a --- /dev/null +++ b/scripts/generate_lite_toml.py @@ -0,0 +1,42 @@ +from typing import Any + +import toml + + +def generate_lite() -> None: + """Generates the mindee-lite version of pyproject.toml""" + with open("pyproject.toml", encoding="utf-8") as f: + data: dict[str, Any] = toml.load(f) + + data["project"]["name"] = "mindee-lite" + data["project"]["description"] = ( + "Mindee API helper library for Python (Lite Version)" + ) + + original_deps = data["project"]["dependencies"] + heavy_deps = [ + dep + for dep in original_deps + if str(dep).lower().startswith("pillow") + or str(dep).lower().startswith("pypdfium2") + ] + lite_deps = [ + dep + for dep in original_deps + if not str(dep).lower().startswith("pillow") + and not str(dep).lower().startswith("pypdfium2") + ] + data["project"]["optional-dependencies"]["heavy"] = heavy_deps + data["project"]["dependencies"] = lite_deps + data["tool"]["pytest"]["ini_options"]["addopts"] = data["tool"]["pytest"][ + "ini_options" + ]["addopts"].replace(" lite", " pypdfium2 and not pillow") + + with open("pyproject-lite.toml", "w", encoding="utf-8") as f: + toml.dump(data, f) + + print("Successfully generated pyproject-lite.toml") + + +if __name__ == "__main__": + generate_lite() diff --git a/tests/dependencies/__init__.py b/tests/dependencies/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/dependencies/test_dependencies.py b/tests/dependencies/test_dependencies.py new file mode 100644 index 00000000..d04b058c --- /dev/null +++ b/tests/dependencies/test_dependencies.py @@ -0,0 +1,23 @@ +import pytest + +from mindee.dependencies import PILLOW_AVAILABLE, PYPDFIUM2_AVAILABLE + + +@pytest.mark.pillow +def test_pillow_installed(): + assert PILLOW_AVAILABLE + + +@pytest.mark.pypdfium2 +def test_pypdfium2_installed(): + assert PYPDFIUM2_AVAILABLE + + +@pytest.mark.lite +def test_pillow_missing(): + assert not PILLOW_AVAILABLE + + +@pytest.mark.lite +def test_pypdfium2_missing(): + assert not PYPDFIUM2_AVAILABLE diff --git a/tests/input/test_apply_page_options.py b/tests/input/test_apply_page_options.py index 0f1c9550..8053b1c7 100644 --- a/tests/input/test_apply_page_options.py +++ b/tests/input/test_apply_page_options.py @@ -1,6 +1,5 @@ import io -import pypdfium2 as pdfium import pytest from mindee.error.mindee_error import MindeeError @@ -14,6 +13,8 @@ from mindee.input.page_options import KEEP_ONLY, REMOVE, PageOptions from tests.utils import FILE_TYPES_DIR, V1_PRODUCT_DATA_DIR +pdfium = pytest.importorskip("pypdfium2") + def _assert_page_options(input_source: LocalInputSource, numb_pages: int): assert input_source.is_pdf() is True diff --git a/tests/input/test_compression.py b/tests/input/test_compression.py index 90a6ef47..6b4371ab 100644 --- a/tests/input/test_compression.py +++ b/tests/input/test_compression.py @@ -1,9 +1,10 @@ +from __future__ import annotations + import operator import os from functools import reduce import pytest -from PIL import Image from mindee.image import compress_image from mindee.input import PathInput @@ -17,6 +18,8 @@ cleanup_output_files, ) +Image = pytest.importorskip("PIL.Image") + RECEIPT_PATH = FILE_TYPES_DIR / "receipt.jpg" diff --git a/tests/input/test_fix_pdf.py b/tests/input/test_fix_pdf.py index 5025abcc..ba258872 100644 --- a/tests/input/test_fix_pdf.py +++ b/tests/input/test_fix_pdf.py @@ -5,12 +5,14 @@ from tests.utils import FILE_TYPES_DIR +@pytest.mark.pypdfium2 def test_broken_unfixable_pdf(): input_source = PathInput(FILE_TYPES_DIR / "pdf" / "broken_unfixable.pdf") with pytest.raises(MimeTypeError): input_source.fix_pdf() +@pytest.mark.pypdfium2 def test_broken_fixable_pdf(): input_source = PathInput(FILE_TYPES_DIR / "pdf" / "broken_fixable.pdf") input_source.fix_pdf() diff --git a/tests/input/test_inputs.py b/tests/input/test_inputs.py index ef74bb0d..5acb4129 100644 --- a/tests/input/test_inputs.py +++ b/tests/input/test_inputs.py @@ -26,6 +26,7 @@ def test_pdf_read_contents(): assert input_source.file_object.closed +@pytest.mark.pypdfium2 @pytest.mark.parametrize( ("filename", "page_count"), [ @@ -65,12 +66,14 @@ def _assert_image(input_source: LocalInputSource, mimetype: str) -> None: assert isinstance(input_source.file_object.read(15), bytes) +@pytest.mark.pypdfium2 @pytest.mark.parametrize(("filename", "mimetype"), TEST_IMAGES) def test_image_input_from_path(filename, mimetype): input_source = PathInput(FILE_TYPES_DIR / filename) _assert_image(input_source, mimetype) +@pytest.mark.pypdfium2 @pytest.mark.parametrize(("filename", "mimetype"), TEST_IMAGES) def test_image_input_from_file(filename, mimetype): with open(FILE_TYPES_DIR / filename, "rb") as fp: @@ -78,6 +81,7 @@ def test_image_input_from_file(filename, mimetype): _assert_image(input_source, mimetype) +@pytest.mark.pypdfium2 @pytest.mark.parametrize(("filename", "mimetype"), TEST_IMAGES) def test_image_input_from_bytes(filename, mimetype): with open(FILE_TYPES_DIR / filename, "rb") as file_bytes: @@ -85,12 +89,14 @@ def test_image_input_from_bytes(filename, mimetype): _assert_image(input_source, mimetype) +@pytest.mark.pypdfium2 def test_image_input_from_base64(): with open(FILE_TYPES_DIR / "receipt.txt") as fp: input_source = Base64Input(fp.read(), filename="receipt.jpg") _assert_image(input_source, mimetype="image/jpeg") +@pytest.mark.pypdfium2 def test_txt_input_from_path(): with pytest.raises(MimeTypeError): PathInput(FILE_TYPES_DIR / "receipt.txt") diff --git a/tests/v1/extraction/test_image_extractor.py b/tests/v1/extraction/test_image_extractor.py index 11036b21..b63435ba 100644 --- a/tests/v1/extraction/test_image_extractor.py +++ b/tests/v1/extraction/test_image_extractor.py @@ -1,13 +1,16 @@ +from __future__ import annotations + import json import pytest -from PIL import Image from mindee.image.image_extractor import extract_multiple_images_from_source from mindee.input.path_input import PathInput from mindee.v1.product.barcode_reader import BarcodeReaderV1 from tests.utils import V1_PRODUCT_DATA_DIR +Image = pytest.importorskip("PIL.Image") + @pytest.fixture def barcode_path(): diff --git a/tests/v1/extraction/test_invoice_splitter_auto_extraction.py b/tests/v1/extraction/test_invoice_splitter_auto_extraction.py index c66d6255..8c4d46c5 100644 --- a/tests/v1/extraction/test_invoice_splitter_auto_extraction.py +++ b/tests/v1/extraction/test_invoice_splitter_auto_extraction.py @@ -27,6 +27,8 @@ def prepare_invoice_return(rst_file_path: Path, invoice_prediction: Document): return rst_content +@pytest.mark.pillow +@pytest.mark.pypdfium2 @pytest.mark.integration def test_pdf_should_extract_invoices_strict(): client = Client() diff --git a/tests/v1/extraction/test_multi_receipts_extractor.py b/tests/v1/extraction/test_multi_receipts_extractor.py index f1f4650c..06fd0184 100644 --- a/tests/v1/extraction/test_multi_receipts_extractor.py +++ b/tests/v1/extraction/test_multi_receipts_extractor.py @@ -1,7 +1,8 @@ +from __future__ import annotations + import json import pytest -from PIL import Image from mindee.input.path_input import PathInput from mindee.v1.pdf.multi_receipts_extractor import extract_receipts @@ -10,6 +11,8 @@ ) from tests.utils import V1_PRODUCT_DATA_DIR +Image = pytest.importorskip("PIL.Image") + @pytest.fixture def multi_receipts_single_page_path(): diff --git a/tests/v1/extraction/test_pdf_extractor.py b/tests/v1/extraction/test_pdf_extractor.py index ce42cd5a..f4871ac6 100644 --- a/tests/v1/extraction/test_pdf_extractor.py +++ b/tests/v1/extraction/test_pdf_extractor.py @@ -33,6 +33,8 @@ def loaded_prediction(): return prediction +@pytest.mark.pillow +@pytest.mark.pypdfium2 def test_image_should_extract_pdf(invoice_default_sample_path): jpg_input = PathInput(invoice_default_sample_path) assert not jpg_input.is_pdf() @@ -40,6 +42,8 @@ def test_image_should_extract_pdf(invoice_default_sample_path): assert extractor.get_page_count() == 1 +@pytest.mark.pillow +@pytest.mark.pypdfium2 def test_pdf_should_extract_invoices_no_strict( invoice_splitter_5p_path, loaded_prediction ): @@ -61,6 +65,8 @@ def test_pdf_should_extract_invoices_no_strict( assert extracted_pdfs_no_strict[2].filename == "invoice_5p_005-005.pdf" +@pytest.mark.pillow +@pytest.mark.pypdfium2 def test_pdf_should_extract_invoices_strict( invoice_splitter_5p_path, loaded_prediction ): diff --git a/tests/v1/test_client.py b/tests/v1/test_client.py index f5620f91..def0a66b 100644 --- a/tests/v1/test_client.py +++ b/tests/v1/test_client.py @@ -99,6 +99,7 @@ def test_keep_file_open(dummy_client: Client): assert input_doc.file_object.closed +@pytest.mark.pypdfium2 def test_cut_options(dummy_client: Client): input_doc: LocalInputSource = PathInput(f"{FILE_TYPES_DIR}/pdf/multipage.pdf") with contextlib.suppress(MindeeHTTPError): diff --git a/tests/v2/file_operations/test_crop_operation.py b/tests/v2/file_operations/test_crop_operation.py index e484b7c6..22fe809a 100644 --- a/tests/v2/file_operations/test_crop_operation.py +++ b/tests/v2/file_operations/test_crop_operation.py @@ -1,7 +1,8 @@ +from __future__ import annotations + import json import pytest -from PIL import Image from mindee.input.path_input import PathInput from mindee.v2.file_operations.crop import extract_multiple_crops @@ -10,6 +11,8 @@ ) from tests.utils import V2_PRODUCT_DATA_DIR +Image = pytest.importorskip("PIL.Image") + @pytest.fixture def crops_single_page_path(): @@ -31,6 +34,8 @@ def crops_multi_page_json_path(): return V2_PRODUCT_DATA_DIR / "crop" / "crop_multiple.json" +@pytest.mark.pillow +@pytest.mark.pypdfium2 def test_single_page_crop_split(crops_single_page_path, crops_single_page_json_path): input_sample = PathInput(crops_single_page_path) with open(crops_single_page_json_path, "rb") as f: @@ -45,7 +50,9 @@ def test_single_page_crop_split(crops_single_page_path, crops_single_page_json_p assert image_buffer_0.size == (2823, 1571) -def test_multi_page_receipt_split(crops_multi_page_path, crops_multi_page_json_path): +@pytest.mark.pillow +@pytest.mark.pypdfium2 +def test_multi_page_receipt_crop(crops_multi_page_path, crops_multi_page_json_path): input_sample = PathInput(crops_multi_page_path) with open(crops_multi_page_json_path, "rb") as f: response = json.load(f) diff --git a/tests/v2/file_operations/test_crop_operation_integration.py b/tests/v2/file_operations/test_crop_operation_integration.py index 69036957..ea44ac65 100644 --- a/tests/v2/file_operations/test_crop_operation_integration.py +++ b/tests/v2/file_operations/test_crop_operation_integration.py @@ -25,6 +25,8 @@ def check_findoc_return(findoc_response: ExtractionResponse): assert findoc_response.inference.result.fields.get("total_amount").value > 0 +@pytest.mark.pillow +@pytest.mark.pypdfium2 @pytest.mark.integration def test_image_should_extract_crops(): client = Client() @@ -55,8 +57,8 @@ def test_image_should_extract_crops(): extracted_images.save_all_to_disk(OUTPUT_DIR) crop1size = os.path.getsize(OUTPUT_DIR / "crop_001.jpg") crop2size = os.path.getsize(OUTPUT_DIR / "crop_002.jpg") - assert 186699 <= crop1size <= 199685 - assert 194103 <= crop2size <= 199433 + assert 180000 <= crop1size <= 199685 + assert 190000 <= crop2size <= 199433 @pytest.fixture(scope="module", autouse=True) diff --git a/tests/v2/file_operations/test_split_operation.py b/tests/v2/file_operations/test_split_operation.py index 1971a6ad..e7cf3ddd 100644 --- a/tests/v2/file_operations/test_split_operation.py +++ b/tests/v2/file_operations/test_split_operation.py @@ -31,7 +31,8 @@ def splits_multi_page_json_path(): return V2_PRODUCT_DATA_DIR / "split" / "split_multiple.json" -def test_single_page_split_split(splits_default, splits_single_page_json_path): +@pytest.mark.pypdfium2 +def test_single_page_split(splits_default, splits_single_page_json_path): input_sample = PathInput(splits_default) with open(splits_single_page_json_path, "rb") as f: response = json.load(f) @@ -42,6 +43,7 @@ def test_single_page_split_split(splits_default, splits_single_page_json_path): assert extracted_splits[0].get_page_count() == 1 +@pytest.mark.pypdfium2 def test_multi_page_receipt_split(splits_5p, splits_multi_page_json_path): input_sample = PathInput(splits_5p) with open(splits_multi_page_json_path, "rb") as f: diff --git a/tests/v2/file_operations/test_split_operation_integration.py b/tests/v2/file_operations/test_split_operation_integration.py index 9c9f8831..1f604ea8 100644 --- a/tests/v2/file_operations/test_split_operation_integration.py +++ b/tests/v2/file_operations/test_split_operation_integration.py @@ -23,6 +23,7 @@ def check_findoc_return(findoc_response: ExtractionResponse): assert findoc_response.inference.result.fields.get("total_amount").value > 0 +@pytest.mark.pypdfium2 @pytest.mark.integration def test_pdf_should_extract_splits(): client = Client()