Mirror of https://github.com/deepset-ai/haystack.git
fix: provide a fallback for PyMuPDF (#4564)
* add a fallback xpdf alternative to PyMuPDF
* add xpdf to the base images
* to be reverted
* silence mypy on conditional error
* do not install pdf extras in base images
* bring back the xpdf build strategy
* remove leftovers from old build
* fix indentation
* Apply suggestions from code review

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>

* revert test workflow

---------

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
This commit is contained in:
parent 57415ef8ab
commit 322652c306
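The mechanics of the fallback are in the haystack/nodes/file_converter changes below: the package first tries the PyMuPDF-based converter and, on ImportError, imports the new xpdf-based one under the same name. As a hedged sketch (not part of the commit), you can check at runtime which backend was picked up:

from haystack.nodes import PDFToTextConverter

# Prints "haystack.nodes.file_converter.pdf" when PyMuPDF is available,
# or "haystack.nodes.file_converter.pdf_xpdf" when the xpdf fallback was imported.
print(PDFToTextConverter.__module__)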
.github/workflows/xpdf_release.yml (new file, 39 lines)
@@ -0,0 +1,39 @@
name: Xpdf Docker image release

on:
  push:
    branches:
      - main
    paths:
      - docker/docker-bake-xpdf.hcl
      - docker/Dockerfile.xpdf

jobs:
  publish-xpdf-image:
    runs-on: ubuntu-latest
    env:
      DOCKER_REPO_NAME: deepset/xpdf

    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2

      - name: Login to DockerHub
        uses: docker/login-action@v1
        with:
          username: ${{ secrets.DOCKER_HUB_USER }}
          password: ${{ secrets.DOCKER_HUB_TOKEN }}

      - name: Build and publish Xpdf image
        uses: docker/bake-action@v2
        with:
          files: "docker-bake-xpdf.hcl"
          workdir: docker
          targets: xpdf
          push: true
@@ -28,6 +28,10 @@ RUN pip install --upgrade pip && \

 FROM $base_image AS final

 COPY --from=build-image /opt/venv /opt/venv
+COPY --from=deepset/xpdf:latest /opt/pdftotext /usr/local/bin
+
+# pdftotext requires the fontconfig runtime
+RUN apt-get update && apt-get install -y libfontconfig && rm -rf /var/lib/apt/lists/*

 ENV PATH="/opt/venv/bin:$PATH"
docker/Dockerfile.xpdf (new file, 23 lines)
@@ -0,0 +1,23 @@
FROM ubuntu:latest

RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    curl \
    gcc \
    git \
    libtesseract-dev \
    poppler-utils \
    tesseract-ocr

ARG xpdf_version
RUN curl -O https://dl.xpdfreader.com/xpdf-${xpdf_version}.tar.gz && \
    tar -xvf xpdf-${xpdf_version}.tar.gz && \
    cd xpdf-${xpdf_version} && \
    cmake . && \
    make && \
    cp xpdf/pdftotext /opt && \
    cd .. && \
    rm -rf xpdf-${xpdf_version}
docker/docker-bake-xpdf.hcl (new file, 12 lines)
@@ -0,0 +1,12 @@
variable "XPDF_VERSION" {
  default = "4.04"
}

target "xpdf" {
  dockerfile = "Dockerfile.xpdf"
  tags = ["deepset/xpdf:latest"]
  args = {
    xpdf_version = "${XPDF_VERSION}"
  }
  platforms = ["linux/amd64", "linux/arm64"]
}
@@ -45,7 +45,7 @@ target "base-cpu" {
     build_image = "python:3.10-slim"
     base_image = "python:3.10-slim"
     haystack_version = "${HAYSTACK_VERSION}"
-    haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores,crawler,preprocessing,pdf,ocr,onnx,beir]"
+    haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores,crawler,preprocessing,ocr,onnx,beir]"
   }
   platforms = ["linux/amd64", "linux/arm64"]
 }
@@ -59,7 +59,7 @@ target "base-gpu" {
     build_image = "pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime"
     base_image = "pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime"
     haystack_version = "${HAYSTACK_VERSION}"
-    haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores-gpu,crawler,preprocessing,pdf,ocr,onnx-gpu]"
+    haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores-gpu,crawler,preprocessing,ocr,onnx-gpu]"
   }
   platforms = ["linux/amd64", "linux/arm64"]
 }
@@ -20,6 +20,9 @@ ImageToTextConverter = safe_import(
 PDFToTextOCRConverter = safe_import(
     "haystack.nodes.file_converter.pdf_ocr", "PDFToTextOCRConverter", "ocr"
 )  # Has optional dependencies
-PDFToTextConverter = safe_import(
-    "haystack.nodes.file_converter.pdf", "PDFToTextConverter", "pdf"
-)  # Has optional dependencies
+
+# Try to use PyMuPDF, if not available fall back to xpdf
+try:
+    from haystack.nodes.file_converter.pdf import PDFToTextConverter
+except ImportError:
+    from haystack.nodes.file_converter.pdf_xpdf import PDFToTextConverter  # type: ignore
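Because both backends expose the same class name, callers do not need to change. A minimal usage sketch, not part of the diff (`sample.pdf` is an illustrative file name):

from pathlib import Path

from haystack.nodes import PDFToTextConverter

converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
# The same call works whether the PyMuPDF or the xpdf implementation was imported.
documents = converter.convert(file_path=Path("sample.pdf"), meta={"name": "sample.pdf"})
print(documents[0].content[:300])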
haystack/nodes/file_converter/pdf_xpdf.py (new file, 195 lines)
@@ -0,0 +1,195 @@
import logging
import subprocess
from pathlib import Path
from typing import Any, Dict, List, Optional

from haystack.nodes.file_converter.base import BaseConverter
from haystack.schema import Document

logger = logging.getLogger(__name__)


class PDFToTextConverter(BaseConverter):
    def __init__(
        self,
        remove_numeric_tables: bool = False,
        valid_languages: Optional[List[str]] = None,
        id_hash_keys: Optional[List[str]] = None,
        encoding: Optional[str] = "UTF-8",
        keep_physical_layout: bool = False,
    ):
        """
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
            The tabular structures in documents might be noise for the reader model if it
            does not have table parsing capability for finding answers. However, tables
            may also have long strings that could be possible candidates for searching answers.
            The rows containing strings are thus retained in this option.
        :param valid_languages: Validate languages from a list of languages specified in the ISO 639-1
            (https://en.wikipedia.org/wiki/ISO_639-1) format.
            This option can be used to add a test for encoding errors. If the extracted text is
            not one of the valid languages, it is likely an encoding error resulting
            in garbled text.
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
        :param encoding: Encoding that will be passed as `-enc` parameter to `pdftotext`.
            Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts, Cyrillic ...).
            (See the list of available encodings, such as "Latin1", by running `pdftotext -listenc` in the terminal.)
        :param keep_physical_layout: This option will maintain the original physical layout of the extracted text.
            It works by passing the `-layout` parameter to `pdftotext`. When disabled, the PDF is read in the stream order.
        """
        super().__init__(
            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys
        )
        try:
            subprocess.run(["pdftotext", "-v"], shell=False, check=False)
        except FileNotFoundError:
            raise FileNotFoundError(
                """pdftotext is not installed. It is part of the xpdf or poppler-utils software suite.

                Installation on Linux:
                wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz &&
                tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin

                Installation on MacOS:
                brew install xpdf

                You can find more details here: https://www.xpdfreader.com
                """
            )

        self.encoding = encoding
        self.keep_physical_layout = keep_physical_layout

    def convert(
        self,
        file_path: Path,
        meta: Optional[Dict[str, Any]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = None,
        id_hash_keys: Optional[List[str]] = None,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
    ) -> List[Document]:
        """
        Extract text from a .pdf file using the pdftotext tool (https://www.xpdfreader.com/pdftotext-man.html).

        :param file_path: Path to the .pdf file you want to convert.
        :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
            Can be any custom keys and values.
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
            The tabular structures in documents might be noise for the reader model if it
            does not have table parsing capability for finding answers. However, tables
            may also have long strings that could be possible candidates for searching answers.
            The rows containing strings are thus retained in this option.
        :param valid_languages: Validate languages from a list of languages specified in the ISO 639-1
            (https://en.wikipedia.org/wiki/ISO_639-1) format.
            This option can be used to add a test for encoding errors. If the extracted text is
            not one of the valid languages, it is likely an encoding error resulting
            in garbled text.
        :param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
            (See the list of available encodings by running `pdftotext -listenc` in the terminal.)
        :param keep_physical_layout: This option will maintain the original physical layout of the extracted text.
            It works by passing the `-layout` parameter to `pdftotext`. When disabled, the PDF is read in the stream order.
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
        :param start_page: The page number where to start the conversion.
        :param end_page: The page number where to end the conversion.
        """
        if remove_numeric_tables is None:
            remove_numeric_tables = self.remove_numeric_tables
        if valid_languages is None:
            valid_languages = self.valid_languages
        if id_hash_keys is None:
            id_hash_keys = self.id_hash_keys

        keep_physical_layout = self.keep_physical_layout

        pages = self._read_pdf(
            file_path, layout=keep_physical_layout, encoding=encoding, start_page=start_page, end_page=end_page
        )

        cleaned_pages = []
        for page in pages:
            # The pdftotext tool provides an option to retain the original physical layout of a PDF page. This
            # behaviour can be toggled by using the layout param.
            #   layout=True
            #       + table structures get retained better
            #       - multi-column pages (e.g. research papers) get extracted with text from multiple columns on the same line
            #   layout=False
            #       + keeps strings in content stream order, hence multi-column layouts work well
            #       - cells of tables get split across lines
            #
            # Here, as a "safe" default, layout is turned off.
            lines = page.splitlines()
            cleaned_lines = []
            for line in lines:
                words = line.split()
                digits = [word for word in words if any(i.isdigit() for i in word)]

                # remove lines having > 40% of words as digits AND not ending with a period (.)
                if remove_numeric_tables:
                    if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
                        logger.debug("Removing line '%s' from %s", line, file_path)
                        continue
                cleaned_lines.append(line)

            page = "\n".join(cleaned_lines)
            cleaned_pages.append(page)

        if valid_languages:
            document_text = "".join(cleaned_pages)
            if not self.validate_language(document_text, valid_languages):
                logger.warning(
                    "The language for %s is not one of %s. The file may not have "
                    "been decoded in the correct text format.",
                    file_path,
                    valid_languages,
                )

        text = "\f".join(cleaned_pages)
        document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys)
        return [document]

    def _read_pdf(
        self,
        file_path: Path,
        layout: bool,
        encoding: Optional[str] = None,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
    ) -> List[str]:
        """
        Extract pages from the pdf file at file_path.

        :param file_path: path of the pdf file
        :param layout: whether to retain the original physical layout for a page. If disabled, PDF pages are read in
            the content stream order.
        :param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
            (See the list of available encodings by running `pdftotext -listenc` in the terminal.)
        :param start_page: The page number where to start the conversion.
        :param end_page: The page number where to end the conversion.
        """
        if not encoding:
            encoding = self.encoding

        start_page = start_page or 1

        command = ["pdftotext", "-enc", str(encoding), "-layout" if layout else "-raw", "-f", str(start_page)]

        if end_page is not None:
            command.extend(["-l", str(end_page)])

        command.extend([str(file_path), "-"])

        output = subprocess.run(command, stdout=subprocess.PIPE, shell=False, check=False)
        document = output.stdout.decode(errors="ignore")
        document = "\f" * (start_page - 1) + document  # tracking skipped pages for correct page numbering
        pages = document.split("\f")
        pages = pages[:-1]  # the last page in the split is always empty

        return pages
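Not part of the commit, but a usage sketch of the xpdf-based converter with its specific options, assuming `pdftotext` is installed and a local `sample.pdf` exists (both names are illustrative):

from pathlib import Path

from haystack.nodes.file_converter.pdf_xpdf import PDFToTextConverter

# keep_physical_layout=True passes -layout to pdftotext, preserving table structure
# at the cost of multi-column reading order.
converter = PDFToTextConverter(keep_physical_layout=True, encoding="UTF-8")

# Convert only pages 2-5; skipped leading pages are padded with form feeds in _read_pdf,
# so downstream page numbering stays consistent.
docs = converter.convert(file_path=Path("sample.pdf"), start_page=2, end_page=5)
print(docs[0].content[:300])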