fix: provide a fallback for PyMuPDF (#4564)

* add a fallback xpdf alternative to PyMuPDF

* add xpdf to the base images

* to be reverted

* silence mypy on conditional error

* do not install pdf extras in base images

* bring back the xpdf build strategy

* remove leftovers from old build

* fix indentation

* Apply suggestions from code review

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>

* revert test workflow

---------

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
This commit is contained in:
Massimiliano Pippi 2023-03-31 14:37:05 +02:00 committed by GitHub
parent 57415ef8ab
commit 322652c306
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 281 additions and 5 deletions

39
.github/workflows/xpdf_release.yml vendored Normal file
View File

@ -0,0 +1,39 @@
name: Xpdf Docker image release

# Rebuild and publish the helper image only when its inputs change on main.
on:
  push:
    branches:
      - main
    paths:
      - docker/docker-bake-xpdf.hcl
      - docker/Dockerfile.xpdf

jobs:
  publish-xpdf-image:
    runs-on: ubuntu-latest
    env:
      DOCKER_REPO_NAME: deepset/xpdf
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      # QEMU + Buildx are needed for the multi-arch (amd64/arm64) bake below.
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
      - name: Login to DockerHub
        # v2 for consistency with the other docker/* actions in this job;
        # login-action@v1 runs on a deprecated Node 12 runtime.
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKER_HUB_USER }}
          password: ${{ secrets.DOCKER_HUB_TOKEN }}
      - name: Build and publish Xpdf image
        uses: docker/bake-action@v2
        with:
          files: "docker-bake-xpdf.hcl"
          workdir: docker
          targets: xpdf
          push: true

View File

@ -28,6 +28,10 @@ RUN pip install --upgrade pip && \
FROM $base_image AS final
COPY --from=build-image /opt/venv /opt/venv
# Bring in the prebuilt pdftotext binary from the deepset/xpdf helper image so
# PDF conversion works even without the PyMuPDF ("pdf") extra installed.
COPY --from=deepset/xpdf:latest /opt/pdftotext /usr/local/bin
# pdftotext requires fontconfig runtime
RUN apt-get update && apt-get install -y libfontconfig && rm -rf /var/lib/apt/lists/*
ENV PATH="/opt/venv/bin:$PATH"

23
docker/Dockerfile.xpdf Normal file
View File

@ -0,0 +1,23 @@
# Builder image that compiles xpdf's pdftotext; the resulting /opt/pdftotext
# binary is copied into the Haystack base images via `COPY --from=deepset/xpdf`.
FROM ubuntu:latest

RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    curl \
    gcc \
    git \
    libtesseract-dev \
    poppler-utils \
    tesseract-ocr && \
    rm -rf /var/lib/apt/lists/*

ARG xpdf_version

# Download, build and install pdftotext, then remove the sources and tarball.
# FIX: the previous version used `cd .. \` without `&&`, which joined the
# `rm -rf` arguments onto the `cd` command line, so cleanup never happened.
RUN curl -O https://dl.xpdfreader.com/xpdf-${xpdf_version}.tar.gz && \
    tar -xvf xpdf-${xpdf_version}.tar.gz && \
    cd xpdf-${xpdf_version} && \
    cmake . && \
    make && \
    cp xpdf/pdftotext /opt && \
    cd .. && \
    rm -rf xpdf-${xpdf_version} xpdf-${xpdf_version}.tar.gz

View File

@ -0,0 +1,12 @@
# Xpdf release to build; override at bake time with XPDF_VERSION=... .
variable "XPDF_VERSION" {
default = "4.04"
}
# Multi-arch helper image providing the pdftotext binary (see Dockerfile.xpdf).
target "xpdf" {
dockerfile = "Dockerfile.xpdf"
tags = ["deepset/xpdf:latest"]
args = {
xpdf_version = "${XPDF_VERSION}"
}
platforms = ["linux/amd64", "linux/arm64"]
}

View File

@ -45,7 +45,7 @@ target "base-cpu" {
build_image = "python:3.10-slim"
base_image = "python:3.10-slim"
haystack_version = "${HAYSTACK_VERSION}"
haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores,crawler,preprocessing,pdf,ocr,onnx,beir]"
haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores,crawler,preprocessing,ocr,onnx,beir]"
}
platforms = ["linux/amd64", "linux/arm64"]
}
@ -59,7 +59,7 @@ target "base-gpu" {
build_image = "pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime"
base_image = "pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime"
haystack_version = "${HAYSTACK_VERSION}"
haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores-gpu,crawler,preprocessing,pdf,ocr,onnx-gpu]"
haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores-gpu,crawler,preprocessing,ocr,onnx-gpu]"
}
platforms = ["linux/amd64", "linux/arm64"]
}

View File

@ -20,6 +20,9 @@ ImageToTextConverter = safe_import(
PDFToTextOCRConverter = safe_import(
"haystack.nodes.file_converter.pdf_ocr", "PDFToTextOCRConverter", "ocr"
) # Has optional dependencies
PDFToTextConverter = safe_import(
"haystack.nodes.file_converter.pdf", "PDFToTextConverter", "pdf"
) # Has optional dependencies
# Try to use PyMuPDF, if not available fall back to xpdf
try:
    from haystack.nodes.file_converter.pdf import PDFToTextConverter
except ImportError:
    # pdf_xpdf shells out to the external `pdftotext` binary instead of using
    # PyMuPDF. mypy flags the conditional redefinition of the same name,
    # hence the targeted ignore.
    from haystack.nodes.file_converter.pdf_xpdf import PDFToTextConverter  # type: ignore

View File

@ -0,0 +1,195 @@
import logging
import subprocess
from pathlib import Path
from typing import Any, Dict, List, Optional
from haystack.nodes.file_converter.base import BaseConverter
from haystack.schema import Document
logger = logging.getLogger(__name__)
class PDFToTextConverter(BaseConverter):
    """
    Extract text from PDF files by shelling out to the external ``pdftotext``
    command line tool (part of the xpdf and poppler-utils suites,
    https://www.xpdfreader.com/pdftotext-man.html). Serves as a fallback
    converter when PyMuPDF is not installed.
    """

    def __init__(
        self,
        remove_numeric_tables: bool = False,
        valid_languages: Optional[List[str]] = None,
        id_hash_keys: Optional[List[str]] = None,
        encoding: Optional[str] = "UTF-8",
        keep_physical_layout: bool = False,
    ):
        """
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for searching answers.
                                      The rows containing strings are thus retained in this option.
        :param valid_languages: Validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add tests for encoding errors. If the extracted text is
                                not one of the valid languages, it is likely an encoding error resulting
                                in garbled text.
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
                             attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
                             not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
                             In this case the id will be generated by using the content and the defined metadata.
        :param encoding: Encoding that will be passed as `-enc` parameter to `pdftotext`.
                         Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts, Cyrillic ...).
                         (See list of available encodings, such as "Latin1", by running `pdftotext -listenc` in the terminal)
        :param keep_physical_layout: This option will maintain original physical layout on the extracted text.
                                     It works by passing the `-layout` parameter to `pdftotext`. When disabled, PDF is read in the stream order.
        """
        super().__init__(
            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys
        )

        # Probe for the binary at construction time so a missing installation
        # fails fast with actionable instructions rather than at convert time.
        # Output is discarded: `pdftotext -v` prints its version banner to stderr.
        try:
            subprocess.run(
                ["pdftotext", "-v"],
                shell=False,
                check=False,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        except FileNotFoundError:
            raise FileNotFoundError(
                """pdftotext is not installed. It is part of xpdf or poppler-utils software suite.
                Installation on Linux:
                wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz &&
                tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin
                Installation on MacOS:
                brew install xpdf
                You can find more details here: https://www.xpdfreader.com
                """
            )

        self.encoding = encoding
        self.keep_physical_layout = keep_physical_layout

    def convert(
        self,
        file_path: Path,
        meta: Optional[Dict[str, Any]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = None,
        id_hash_keys: Optional[List[str]] = None,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
    ) -> List[Document]:
        """
        Extract text from a .pdf file using the pdftotext tool (https://www.xpdfreader.com/pdftotext-man.html).

        Whether the original physical layout is preserved is controlled by the
        `keep_physical_layout` flag passed to the constructor (there is no
        per-call override).

        :param file_path: Path to the .pdf file you want to convert
        :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
                     Can be any custom keys and values.
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for searching answers.
                                      The rows containing strings are thus retained in this option.
        :param valid_languages: Validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add tests for encoding errors. If the extracted text is
                                not one of the valid languages, it is likely an encoding error resulting
                                in garbled text.
        :param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
                         (See list of available encodings by running `pdftotext -listenc` in the terminal)
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
                             attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
                             not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
                             In this case the id will be generated by using the content and the defined metadata.
        :param start_page: The page number where to start the conversion
        :param end_page: The page number where to end the conversion.
        :return: A single-element list containing one Document with the
                 extracted text; pages are joined with form-feed (\\f) characters.
        """
        # Per-call arguments override the instance-level defaults set in __init__.
        if remove_numeric_tables is None:
            remove_numeric_tables = self.remove_numeric_tables
        if valid_languages is None:
            valid_languages = self.valid_languages
        if id_hash_keys is None:
            id_hash_keys = self.id_hash_keys
        keep_physical_layout = self.keep_physical_layout

        pages = self._read_pdf(
            file_path, layout=keep_physical_layout, encoding=encoding, start_page=start_page, end_page=end_page
        )

        # pdftotext provides an option to retain the original physical layout of a
        # PDF page, toggled by the layout param:
        #   layout=True
        #     + table structures get retained better
        #     - multi-column pages (e.g. research papers) get extracted with text
        #       from multiple columns on the same line
        #   layout=False
        #     + keeps strings in content stream order, so multi-column layouts work well
        #     - cells of tables get split across lines
        # As a "safe" default, layout is turned off.
        cleaned_pages = []
        for page in pages:
            lines = page.splitlines()
            cleaned_lines = []
            for line in lines:
                if remove_numeric_tables:
                    words = line.split()
                    digits = [word for word in words if any(i.isdigit() for i in word)]
                    # remove lines having > 40% of words as digits AND not ending with a period(.)
                    if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
                        logger.debug("Removing line '%s' from %s", line, file_path)
                        continue
                cleaned_lines.append(line)
            page = "\n".join(cleaned_lines)
            cleaned_pages.append(page)

        if valid_languages:
            document_text = "".join(cleaned_pages)
            if not self.validate_language(document_text, valid_languages):
                logger.warning(
                    "The language for %s is not one of %s. The file may not have "
                    "been decoded in the correct text format.",
                    file_path,
                    valid_languages,
                )

        text = "\f".join(cleaned_pages)
        document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys)
        return [document]

    def _read_pdf(
        self,
        file_path: Path,
        layout: bool,
        encoding: Optional[str] = None,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
    ) -> List[str]:
        """
        Extract pages from the pdf file at file_path.

        :param file_path: path of the pdf file
        :param layout: whether to retain the original physical layout for a page. If disabled, PDF pages are read in
                       the content stream order.
        :param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
                         (See list of available encodings by running `pdftotext -listenc` in the terminal)
        :param start_page: The page number where to start the conversion
        :param end_page: The page number where to end the conversion.
        :return: One string per extracted page (skipped leading pages are
                 represented as empty strings so page numbering stays correct).
        """
        if not encoding:
            encoding = self.encoding

        start_page = start_page or 1

        command = ["pdftotext", "-enc", str(encoding), "-layout" if layout else "-raw", "-f", str(start_page)]
        if end_page is not None:
            command.extend(["-l", str(end_page)])
        command.extend([str(file_path), "-"])

        # List-form argv with shell=False: file_path cannot be shell-injected.
        output = subprocess.run(command, stdout=subprocess.PIPE, shell=False, check=False)
        if output.returncode != 0:
            # Don't raise: pdftotext returns nonzero for e.g. partially damaged
            # PDFs while still emitting usable text. Surface the failure instead
            # of silently returning possibly-truncated output.
            logger.warning(
                "pdftotext exited with code %s while converting %s; extracted text may be incomplete.",
                output.returncode,
                file_path,
            )
        document = output.stdout.decode(errors="ignore")
        document = "\f" * (start_page - 1) + document  # tracking skipped pages for correct page numbering
        pages = document.split("\f")
        pages = pages[:-1]  # the last page in the split is always empty.
        return pages