diff --git a/.github/workflows/xpdf_release.yml b/.github/workflows/xpdf_release.yml new file mode 100644 index 000000000..88f6df7b6 --- /dev/null +++ b/.github/workflows/xpdf_release.yml @@ -0,0 +1,40 @@ +# Builds and pushes the Xpdf image whenever its Dockerfile or bake file changes on main. +name: Xpdf Docker image release + +on: + push: + branches: + - main + paths: + - docker/docker-bake-xpdf.hcl + - docker/Dockerfile.xpdf + +jobs: + publish-xpdf-image: + runs-on: ubuntu-latest + env: + DOCKER_REPO_NAME: deepset/xpdf + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKER_HUB_USER }} + password: ${{ secrets.DOCKER_HUB_TOKEN }} + + - name: Build and publish Xpdf image + uses: docker/bake-action@v2 + with: + files: "docker-bake-xpdf.hcl" + workdir: docker + targets: xpdf + push: true diff --git a/docker/Dockerfile.base b/docker/Dockerfile.base index 5b9ff6ca9..25f647bf9 100644 --- a/docker/Dockerfile.base +++ b/docker/Dockerfile.base @@ -28,6 +28,10 @@ RUN pip install --upgrade pip && \ FROM $base_image AS final COPY --from=build-image /opt/venv /opt/venv +COPY --from=deepset/xpdf:latest /opt/pdftotext /usr/local/bin + +# pdftotext requires fontconfig runtime +RUN apt-get update && apt-get install -y libfontconfig && rm -rf /var/lib/apt/lists/* ENV PATH="/opt/venv/bin:$PATH" diff --git a/docker/Dockerfile.xpdf b/docker/Dockerfile.xpdf new file mode 100644 index 000000000..16361cd8d --- /dev/null +++ b/docker/Dockerfile.xpdf @@ -0,0 +1,23 @@ +FROM ubuntu:latest + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + gcc \ + git \ + libtesseract-dev \ + poppler-utils \ + tesseract-ocr + +ARG xpdf_version +RUN curl -O https://dl.xpdfreader.com/xpdf-${xpdf_version}.tar.gz && \ + tar -xvf xpdf-${xpdf_version}.tar.gz && \ + cd xpdf-${xpdf_version} 
&& \ + cmake . && \ + make && \ + cp xpdf/pdftotext /opt && \ + cd .. && \ + rm -rf xpdf-${xpdf_version} xpdf-${xpdf_version}.tar.gz diff --git a/docker/docker-bake-xpdf.hcl b/docker/docker-bake-xpdf.hcl new file mode 100644 index 000000000..caab2dcfd --- /dev/null +++ b/docker/docker-bake-xpdf.hcl @@ -0,0 +1,12 @@ +variable "XPDF_VERSION" { + default = "4.04" +} + +target "xpdf" { + dockerfile = "Dockerfile.xpdf" + tags = ["deepset/xpdf:latest"] + args = { + xpdf_version = "${XPDF_VERSION}" + } + platforms = ["linux/amd64", "linux/arm64"] +} diff --git a/docker/docker-bake.hcl b/docker/docker-bake.hcl index ee7586170..1566275f1 100644 --- a/docker/docker-bake.hcl +++ b/docker/docker-bake.hcl @@ -45,7 +45,7 @@ target "base-cpu" { build_image = "python:3.10-slim" base_image = "python:3.10-slim" haystack_version = "${HAYSTACK_VERSION}" - haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores,crawler,preprocessing,pdf,ocr,onnx,beir]" + haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores,crawler,preprocessing,ocr,onnx,beir]" } platforms = ["linux/amd64", "linux/arm64"] } @@ -59,7 +59,7 @@ target "base-gpu" { build_image = "pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime" base_image = "pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime" haystack_version = "${HAYSTACK_VERSION}" - haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores-gpu,crawler,preprocessing,pdf,ocr,onnx-gpu]" + haystack_extras = notequal("",HAYSTACK_EXTRAS) ? 
"${HAYSTACK_EXTRAS}" : "[docstores-gpu,crawler,preprocessing,ocr,onnx-gpu]" } platforms = ["linux/amd64", "linux/arm64"] } diff --git a/haystack/nodes/file_converter/__init__.py b/haystack/nodes/file_converter/__init__.py index 4514a144a..07232bd42 100644 --- a/haystack/nodes/file_converter/__init__.py +++ b/haystack/nodes/file_converter/__init__.py @@ -20,6 +20,9 @@ ImageToTextConverter = safe_import( PDFToTextOCRConverter = safe_import( "haystack.nodes.file_converter.pdf_ocr", "PDFToTextOCRConverter", "ocr" ) # Has optional dependencies -PDFToTextConverter = safe_import( - "haystack.nodes.file_converter.pdf", "PDFToTextConverter", "pdf" -) # Has optional dependencies + +# Try to use PyMuPDF, if not available fall back to xpdf +try: + from haystack.nodes.file_converter.pdf import PDFToTextConverter +except ImportError: + from haystack.nodes.file_converter.pdf_xpdf import PDFToTextConverter # type: ignore diff --git a/haystack/nodes/file_converter/pdf_xpdf.py b/haystack/nodes/file_converter/pdf_xpdf.py new file mode 100644 index 000000000..5d8a32f14 --- /dev/null +++ b/haystack/nodes/file_converter/pdf_xpdf.py @@ -0,0 +1,200 @@ +import logging +import subprocess +from pathlib import Path +from typing import Any, Dict, List, Optional + +from haystack.nodes.file_converter.base import BaseConverter +from haystack.schema import Document + +logger = logging.getLogger(__name__) + + +class PDFToTextConverter(BaseConverter): + """Convert PDF files to text by calling the `pdftotext` command-line tool.""" + + def __init__( + self, + remove_numeric_tables: bool = False, + valid_languages: Optional[List[str]] = None, + id_hash_keys: Optional[List[str]] = None, + encoding: Optional[str] = "UTF-8", + keep_physical_layout: bool = False, + ): + """ + :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables. + The tabular structures in documents might be noise for the reader model if it + does not have table parsing capability for finding answers. 
However, tables + may also have long strings that could be possible candidates for searching answers. + The rows containing strings are thus retained in this option. + :param valid_languages: validate languages from a list of languages specified in the ISO 639-1 + (https://en.wikipedia.org/wiki/ISO_639-1) format. + This option can be used to add a test for encoding errors. If the extracted text is + not one of the valid languages, then it is likely an encoding error resulting + in garbled text. + :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's + attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are + not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). + In this case the id will be generated by using the content and the defined metadata. + :param encoding: Encoding that will be passed as `-enc` parameter to `pdftotext`. + Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts, Cyrillic ...). + (See list of available encodings, such as "Latin1", by running `pdftotext -listenc` in the terminal) + :param keep_physical_layout: This option will maintain original physical layout on the extracted text. + It works by passing the `-layout` parameter to `pdftotext`. When disabled, PDF is read in the stream order. + """ + super().__init__( + remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys + ) + try: + subprocess.run(["pdftotext", "-v"], shell=False, check=False) + except FileNotFoundError: + raise FileNotFoundError( + """pdftotext is not installed. It is part of xpdf or poppler-utils software suite. 
+ + Installation on Linux: + wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz && + tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin + + Installation on MacOS: + brew install xpdf + + You can find more details here: https://www.xpdfreader.com + """ + ) + + self.encoding = encoding + self.keep_physical_layout = keep_physical_layout + + def convert( + self, + file_path: Path, + meta: Optional[Dict[str, Any]] = None, + remove_numeric_tables: Optional[bool] = None, + valid_languages: Optional[List[str]] = None, + encoding: Optional[str] = None, + id_hash_keys: Optional[List[str]] = None, + start_page: Optional[int] = None, + end_page: Optional[int] = None, + ) -> List[Document]: + """ + Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html) + The `keep_physical_layout` value given to the constructor decides whether `pdftotext` runs with `-layout` or `-raw`. + + :param file_path: Path to the .pdf file you want to convert + :param meta: Optional dictionary with metadata that shall be attached to all resulting documents. + Can be any custom keys and values. + :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables. + The tabular structures in documents might be noise for the reader model if it + does not have table parsing capability for finding answers. However, tables + may also have long strings that could be possible candidates for searching answers. + The rows containing strings are thus retained in this option. + :param valid_languages: validate languages from a list of languages specified in the ISO 639-1 + (https://en.wikipedia.org/wiki/ISO_639-1) format. + This option can be used to add a test for encoding errors. If the extracted text is + not one of the valid languages, then it is likely an encoding error resulting + in garbled text. + :param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`. 
+ (See list of available encodings by running `pdftotext -listenc` in the terminal) + :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's + attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are + not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). + In this case the id will be generated by using the content and the defined metadata. + :param start_page: The page number where to start the conversion + :param end_page: The page number where to end the conversion. + """ + if remove_numeric_tables is None: + remove_numeric_tables = self.remove_numeric_tables + if valid_languages is None: + valid_languages = self.valid_languages + if id_hash_keys is None: + id_hash_keys = self.id_hash_keys + + keep_physical_layout = self.keep_physical_layout + + pages = self._read_pdf( + file_path, layout=keep_physical_layout, encoding=encoding, start_page=start_page, end_page=end_page + ) + + cleaned_pages = [] + for page in pages: + # pdftotext tool provides an option to retain the original physical layout of a PDF page. This behaviour + # can be toggled by using the layout param. + # layout=True + # + table structures get retained better + # - multi-column pages(eg, research papers) gets extracted with text from multiple columns on same line + # layout=False + # + keeps strings in content stream order, hence multi column layout works well + # - cells of tables gets split across line + # + # Here, as a "safe" default, layout is turned off. 
+ lines = page.splitlines() + cleaned_lines = [] + for line in lines: + words = line.split() + digits = [word for word in words if any(i.isdigit() for i in word)] + + # remove lines having > 40% of words as digits AND not ending with a period(.) + if remove_numeric_tables: + if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."): + logger.debug("Removing line '%s' from %s", line, file_path) + continue + cleaned_lines.append(line) + + page = "\n".join(cleaned_lines) + cleaned_pages.append(page) + + if valid_languages: + document_text = "".join(cleaned_pages) + if not self.validate_language(document_text, valid_languages): + logger.warning( + "The language for %s is not one of %s. The file may not have " + "been decoded in the correct text format.", + file_path, + valid_languages, + ) + + text = "\f".join(cleaned_pages) + document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys) + return [document] + + def _read_pdf( + self, + file_path: Path, + layout: bool, + encoding: Optional[str] = None, + start_page: Optional[int] = None, + end_page: Optional[int] = None, + ) -> List[str]: + """ + Extract pages from the pdf file at file_path. + + :param file_path: path of the pdf file + :param layout: whether to retain the original physical layout for a page. If disabled, PDF pages are read in + the content stream order. + :param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`. + (See list of available encodings by running `pdftotext -listenc` in the terminal) + :param start_page: The page number where to start the conversion + :param end_page: The page number where to end the conversion. 
+ """ + if not encoding: + encoding = self.encoding + + start_page = start_page or 1 + + command = ["pdftotext", "-enc", str(encoding), "-layout" if layout else "-raw", "-f", str(start_page)] + + if end_page is not None: + command.extend(["-l", str(end_page)]) + + command.extend([str(file_path), "-"]) + + output = subprocess.run(command, stdout=subprocess.PIPE, shell=False, check=False) + # pdftotext writes its output in `encoding`; fall back to UTF-8 when Python has no codec of that name + try: + document = output.stdout.decode(str(encoding), errors="ignore") + except LookupError: + document = output.stdout.decode(errors="ignore") + document = "\f" * (start_page - 1) + document # tracking skipped pages for correct page numbering + pages = document.split("\f") + pages = pages[:-1] # the last page in the split is always empty. + + return pages