Mirror of https://github.com/deepset-ai/haystack.git
fix: provide a fallback for PyMuPDF (#4564)
* add a fallback xpdf alternative to PyMuPDF
* add xpdf to the base images
* to be reverted
* silence mypy on conditional error
* do not install pdf extras in base images
* bring back the xpdf build strategy
* remove leftovers from old build
* fix indentation
* Apply suggestions from code review

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>

* revert test workflow

---------

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
This commit is contained in:
parent 57415ef8ab
commit 322652c306
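The mechanics of the fallback are in the haystack/nodes/file_converter changes below: the package first tries the PyMuPDF-based converter and, on ImportError, imports the new xpdf-based one under the same name. As a hedged sketch (not part of the commit), you can check at runtime which backend was picked up:

from haystack.nodes import PDFToTextConverter

# Prints "haystack.nodes.file_converter.pdf" when PyMuPDF is available,
# or "haystack.nodes.file_converter.pdf_xpdf" when the xpdf fallback was imported.
print(PDFToTextConverter.__module__)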
.github/workflows/xpdf_release.yml (new file, 39 lines)
@@ -0,0 +1,39 @@
name: Xpdf Docker image release

on:
  push:
    branches:
      - main
    paths:
      - docker/docker-bake-xpdf.hcl
      - docker/Dockerfile.xpdf

jobs:
  publish-xpdf-image:
    runs-on: ubuntu-latest
    env:
      DOCKER_REPO_NAME: deepset/xpdf

    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2

      - name: Login to DockerHub
        uses: docker/login-action@v1
        with:
          username: ${{ secrets.DOCKER_HUB_USER }}
          password: ${{ secrets.DOCKER_HUB_TOKEN }}

      - name: Build and publish Xpdf image
        uses: docker/bake-action@v2
        with:
          files: "docker-bake-xpdf.hcl"
          workdir: docker
          targets: xpdf
          push: true
@@ -28,6 +28,10 @@ RUN pip install --upgrade pip && \

 FROM $base_image AS final

 COPY --from=build-image /opt/venv /opt/venv
+COPY --from=deepset/xpdf:latest /opt/pdftotext /usr/local/bin
+
+# pdftotext requires the fontconfig runtime
+RUN apt-get update && apt-get install -y libfontconfig && rm -rf /var/lib/apt/lists/*

 ENV PATH="/opt/venv/bin:$PATH"
docker/Dockerfile.xpdf (new file, 23 lines)
@@ -0,0 +1,23 @@
FROM ubuntu:latest

RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    curl \
    gcc \
    git \
    libtesseract-dev \
    poppler-utils \
    tesseract-ocr

ARG xpdf_version
RUN curl -O https://dl.xpdfreader.com/xpdf-${xpdf_version}.tar.gz && \
    tar -xvf xpdf-${xpdf_version}.tar.gz && \
    cd xpdf-${xpdf_version} && \
    cmake . && \
    make && \
    cp xpdf/pdftotext /opt && \
    cd .. && \
    rm -rf xpdf-${xpdf_version}
docker/docker-bake-xpdf.hcl (new file, 12 lines)
@@ -0,0 +1,12 @@
variable "XPDF_VERSION" {
  default = "4.04"
}

target "xpdf" {
  dockerfile = "Dockerfile.xpdf"
  tags = ["deepset/xpdf:latest"]
  args = {
    xpdf_version = "${XPDF_VERSION}"
  }
  platforms = ["linux/amd64", "linux/arm64"]
}
@@ -45,7 +45,7 @@ target "base-cpu" {
     build_image = "python:3.10-slim"
     base_image = "python:3.10-slim"
     haystack_version = "${HAYSTACK_VERSION}"
-    haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores,crawler,preprocessing,pdf,ocr,onnx,beir]"
+    haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores,crawler,preprocessing,ocr,onnx,beir]"
   }
   platforms = ["linux/amd64", "linux/arm64"]
 }
@@ -59,7 +59,7 @@ target "base-gpu" {
     build_image = "pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime"
     base_image = "pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime"
     haystack_version = "${HAYSTACK_VERSION}"
-    haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores-gpu,crawler,preprocessing,pdf,ocr,onnx-gpu]"
+    haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores-gpu,crawler,preprocessing,ocr,onnx-gpu]"
   }
   platforms = ["linux/amd64", "linux/arm64"]
 }
@@ -20,6 +20,9 @@ ImageToTextConverter = safe_import(
 PDFToTextOCRConverter = safe_import(
     "haystack.nodes.file_converter.pdf_ocr", "PDFToTextOCRConverter", "ocr"
 )  # Has optional dependencies
-PDFToTextConverter = safe_import(
-    "haystack.nodes.file_converter.pdf", "PDFToTextConverter", "pdf"
-)  # Has optional dependencies
+
+# Try to use PyMuPDF, if not available fall back to xpdf
+try:
+    from haystack.nodes.file_converter.pdf import PDFToTextConverter
+except ImportError:
+    from haystack.nodes.file_converter.pdf_xpdf import PDFToTextConverter  # type: ignore
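Because both backends expose the same class name, callers do not need to change. A minimal usage sketch, not part of the diff (`sample.pdf` is an illustrative file name):

from pathlib import Path

from haystack.nodes import PDFToTextConverter

converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
# The same call works whether the PyMuPDF or the xpdf implementation was imported.
documents = converter.convert(file_path=Path("sample.pdf"), meta={"name": "sample.pdf"})
print(documents[0].content[:300])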
haystack/nodes/file_converter/pdf_xpdf.py (new file, 195 lines)
@@ -0,0 +1,195 @@
import logging
import subprocess
from pathlib import Path
from typing import Any, Dict, List, Optional

from haystack.nodes.file_converter.base import BaseConverter
from haystack.schema import Document

logger = logging.getLogger(__name__)


class PDFToTextConverter(BaseConverter):
    def __init__(
        self,
        remove_numeric_tables: bool = False,
        valid_languages: Optional[List[str]] = None,
        id_hash_keys: Optional[List[str]] = None,
        encoding: Optional[str] = "UTF-8",
        keep_physical_layout: bool = False,
    ):
        """
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
            The tabular structures in documents might be noise for the reader model if it
            does not have table parsing capability for finding answers. However, tables
            may also have long strings that could be possible candidates for searching answers.
            The rows containing strings are thus retained in this option.
        :param valid_languages: Validate languages from a list of languages specified in the ISO 639-1
            (https://en.wikipedia.org/wiki/ISO_639-1) format.
            This option can be used to add a test for encoding errors. If the extracted text is
            not one of the valid languages, it is likely an encoding error resulting
            in garbled text.
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
        :param encoding: Encoding that will be passed as `-enc` parameter to `pdftotext`.
            Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts, Cyrillic ...).
            (See the list of available encodings, such as "Latin1", by running `pdftotext -listenc` in the terminal.)
        :param keep_physical_layout: This option will maintain the original physical layout of the extracted text.
            It works by passing the `-layout` parameter to `pdftotext`. When disabled, the PDF is read in the stream order.
        """
        super().__init__(
            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys
        )
        try:
            subprocess.run(["pdftotext", "-v"], shell=False, check=False)
        except FileNotFoundError:
            raise FileNotFoundError(
                """pdftotext is not installed. It is part of the xpdf or poppler-utils software suite.

                Installation on Linux:
                wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz &&
                tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin

                Installation on MacOS:
                brew install xpdf

                You can find more details here: https://www.xpdfreader.com
                """
            )

        self.encoding = encoding
        self.keep_physical_layout = keep_physical_layout

    def convert(
        self,
        file_path: Path,
        meta: Optional[Dict[str, Any]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = None,
        id_hash_keys: Optional[List[str]] = None,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
    ) -> List[Document]:
        """
        Extract text from a .pdf file using the pdftotext tool (https://www.xpdfreader.com/pdftotext-man.html).

        :param file_path: Path to the .pdf file you want to convert.
        :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
            Can be any custom keys and values.
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
            The tabular structures in documents might be noise for the reader model if it
            does not have table parsing capability for finding answers. However, tables
            may also have long strings that could be possible candidates for searching answers.
            The rows containing strings are thus retained in this option.
        :param valid_languages: Validate languages from a list of languages specified in the ISO 639-1
            (https://en.wikipedia.org/wiki/ISO_639-1) format.
            This option can be used to add a test for encoding errors. If the extracted text is
            not one of the valid languages, it is likely an encoding error resulting
            in garbled text.
        :param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
            (See the list of available encodings by running `pdftotext -listenc` in the terminal.)
        :param keep_physical_layout: This option will maintain the original physical layout of the extracted text.
            It works by passing the `-layout` parameter to `pdftotext`. When disabled, the PDF is read in the stream order.
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
        :param start_page: The page number where to start the conversion.
        :param end_page: The page number where to end the conversion.
        """
        if remove_numeric_tables is None:
            remove_numeric_tables = self.remove_numeric_tables
        if valid_languages is None:
            valid_languages = self.valid_languages
        if id_hash_keys is None:
            id_hash_keys = self.id_hash_keys

        keep_physical_layout = self.keep_physical_layout

        pages = self._read_pdf(
            file_path, layout=keep_physical_layout, encoding=encoding, start_page=start_page, end_page=end_page
        )

        cleaned_pages = []
        for page in pages:
            # The pdftotext tool provides an option to retain the original physical layout of a PDF page. This
            # behaviour can be toggled by using the layout param.
            #   layout=True
            #       + table structures get retained better
            #       - multi-column pages (e.g. research papers) get extracted with text from multiple columns on the same line
            #   layout=False
            #       + keeps strings in content stream order, hence multi-column layouts work well
            #       - cells of tables get split across lines
            #
            # Here, as a "safe" default, layout is turned off.
            lines = page.splitlines()
            cleaned_lines = []
            for line in lines:
                words = line.split()
                digits = [word for word in words if any(i.isdigit() for i in word)]

                # remove lines having > 40% of words as digits AND not ending with a period (.)
                if remove_numeric_tables:
                    if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
                        logger.debug("Removing line '%s' from %s", line, file_path)
                        continue
                cleaned_lines.append(line)

            page = "\n".join(cleaned_lines)
            cleaned_pages.append(page)

        if valid_languages:
            document_text = "".join(cleaned_pages)
            if not self.validate_language(document_text, valid_languages):
                logger.warning(
                    "The language for %s is not one of %s. The file may not have "
                    "been decoded in the correct text format.",
                    file_path,
                    valid_languages,
                )

        text = "\f".join(cleaned_pages)
        document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys)
        return [document]

    def _read_pdf(
        self,
        file_path: Path,
        layout: bool,
        encoding: Optional[str] = None,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
    ) -> List[str]:
        """
        Extract pages from the pdf file at file_path.

        :param file_path: path of the pdf file
        :param layout: whether to retain the original physical layout for a page. If disabled, PDF pages are read in
            the content stream order.
        :param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
            (See the list of available encodings by running `pdftotext -listenc` in the terminal.)
        :param start_page: The page number where to start the conversion.
        :param end_page: The page number where to end the conversion.
        """
        if not encoding:
            encoding = self.encoding

        start_page = start_page or 1

        command = ["pdftotext", "-enc", str(encoding), "-layout" if layout else "-raw", "-f", str(start_page)]

        if end_page is not None:
            command.extend(["-l", str(end_page)])

        command.extend([str(file_path), "-"])

        output = subprocess.run(command, stdout=subprocess.PIPE, shell=False, check=False)
        document = output.stdout.decode(errors="ignore")
        document = "\f" * (start_page - 1) + document  # tracking skipped pages for correct page numbering
        pages = document.split("\f")
        pages = pages[:-1]  # the last page in the split is always empty

        return pages
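Not part of the commit, but a usage sketch of the xpdf-based converter with its specific options, assuming `pdftotext` is installed and a local `sample.pdf` exists (both names are illustrative):

from pathlib import Path

from haystack.nodes.file_converter.pdf_xpdf import PDFToTextConverter

# keep_physical_layout=True passes -layout to pdftotext, preserving table structure
# at the cost of multi-column reading order.
converter = PDFToTextConverter(keep_physical_layout=True, encoding="UTF-8")

# Convert only pages 2-5; skipped leading pages are padded with form feeds in _read_pdf,
# so downstream page numbering stays consistent.
docs = converter.convert(file_path=Path("sample.pdf"), start_page=2, end_page=5)
print(docs[0].content[:300])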