mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-13 07:47:26 +00:00
fix: provide a fallback for PyMuPDF (#4564)
* add a fallback xpdf alternative to PyMuPDF * add xpdpf to the base images * to be reverted * silence mypy on conditional error * do not install pdf extras in base images * bring back the xpdf build strategy * remove leftovers from old build * fix indentation * Apply suggestions from code review Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * revert test workflow --------- Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
This commit is contained in:
parent
57415ef8ab
commit
322652c306
39
.github/workflows/xpdf_release.yml
vendored
Normal file
39
.github/workflows/xpdf_release.yml
vendored
Normal file
@ -0,0 +1,39 @@
|
||||
# Builds and publishes the Xpdf helper image whenever its Dockerfile or bake
# definition changes on the main branch.
name: Xpdf Docker image release

on:
  push:
    branches:
      - main
    paths:
      - docker/docker-bake-xpdf.hcl
      - docker/Dockerfile.xpdf

jobs:
  publish-xpdf-image:
    runs-on: ubuntu-latest
    env:
      DOCKER_REPO_NAME: deepset/xpdf

    steps:
      - name: Checkout
        uses: actions/checkout@v3

      # QEMU is needed because the bake target builds for more than one platform.
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2

      # Kept on the same major version as the other docker/* actions; v1 runs
      # on the retired Node 12 runtime. The inputs are unchanged in v2.
      - name: Login to DockerHub
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKER_HUB_USER }}
          password: ${{ secrets.DOCKER_HUB_TOKEN }}

      - name: Build and publish Xpdf image
        uses: docker/bake-action@v2
        with:
          files: "docker-bake-xpdf.hcl"
          workdir: docker
          targets: xpdf
          push: true
|
||||
@ -28,6 +28,10 @@ RUN pip install --upgrade pip && \
|
||||
FROM $base_image AS final
|
||||
|
||||
COPY --from=build-image /opt/venv /opt/venv
|
||||
COPY --from=deepset/xpdf:latest /opt/pdftotext /usr/local/bin
|
||||
|
||||
# pdftotext requires fontconfig runtime
|
||||
RUN apt-get update && apt-get install -y libfontconfig && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
|
||||
23
docker/Dockerfile.xpdf
Normal file
23
docker/Dockerfile.xpdf
Normal file
@ -0,0 +1,23 @@
|
||||
# Builder image: compiles the `pdftotext` binary from the Xpdf sources and
# leaves it in /opt so that other images can copy it from there.
FROM ubuntu:latest

RUN apt-get update && \
    apt-get install -y --no-install-recommends \
       build-essential \
       ca-certificates \
       cmake \
       curl \
       gcc \
       git \
       libtesseract-dev \
       poppler-utils \
       tesseract-ocr

# Xpdf release to build, supplied as a build argument.
ARG xpdf_version

# Every step is chained with `&&`. Without it after `cd ..`, the `rm` words
# would be passed to `cd` as extra arguments and the cleanup would never run.
# The downloaded tarball is removed together with the extracted source tree.
RUN curl -O https://dl.xpdfreader.com/xpdf-${xpdf_version}.tar.gz && \
    tar -xvf xpdf-${xpdf_version}.tar.gz && \
    cd xpdf-${xpdf_version} && \
    cmake . && \
    make && \
    cp xpdf/pdftotext /opt && \
    cd .. && \
    rm -rf xpdf-${xpdf_version} xpdf-${xpdf_version}.tar.gz
|
||||
12
docker/docker-bake-xpdf.hcl
Normal file
12
docker/docker-bake-xpdf.hcl
Normal file
@ -0,0 +1,12 @@
|
||||
# Xpdf release that is handed to the Dockerfile as the `xpdf_version` build argument.
variable "XPDF_VERSION" {
  default = "4.04"
}

# Multi-platform build of the image defined in Dockerfile.xpdf.
target "xpdf" {
  dockerfile = "Dockerfile.xpdf"
  platforms  = ["linux/amd64", "linux/arm64"]
  tags       = ["deepset/xpdf:latest"]
  args = {
    xpdf_version = "${XPDF_VERSION}"
  }
}
|
||||
@ -45,7 +45,7 @@ target "base-cpu" {
|
||||
build_image = "python:3.10-slim"
|
||||
base_image = "python:3.10-slim"
|
||||
haystack_version = "${HAYSTACK_VERSION}"
|
||||
haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores,crawler,preprocessing,pdf,ocr,onnx,beir]"
|
||||
haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores,crawler,preprocessing,ocr,onnx,beir]"
|
||||
}
|
||||
platforms = ["linux/amd64", "linux/arm64"]
|
||||
}
|
||||
@ -59,7 +59,7 @@ target "base-gpu" {
|
||||
build_image = "pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime"
|
||||
base_image = "pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime"
|
||||
haystack_version = "${HAYSTACK_VERSION}"
|
||||
haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores-gpu,crawler,preprocessing,pdf,ocr,onnx-gpu]"
|
||||
haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores-gpu,crawler,preprocessing,ocr,onnx-gpu]"
|
||||
}
|
||||
platforms = ["linux/amd64", "linux/arm64"]
|
||||
}
|
||||
|
||||
@ -20,6 +20,9 @@ ImageToTextConverter = safe_import(
|
||||
PDFToTextOCRConverter = safe_import(
|
||||
"haystack.nodes.file_converter.pdf_ocr", "PDFToTextOCRConverter", "ocr"
|
||||
) # Has optional dependencies
|
||||
PDFToTextConverter = safe_import(
|
||||
"haystack.nodes.file_converter.pdf", "PDFToTextConverter", "pdf"
|
||||
) # Has optional dependencies
|
||||
|
||||
# Try to use PyMuPDF, if not available fall back to xpdf
|
||||
try:
|
||||
from haystack.nodes.file_converter.pdf import PDFToTextConverter
|
||||
except ImportError:
|
||||
from haystack.nodes.file_converter.pdf_xpdf import PDFToTextConverter # type: ignore
|
||||
|
||||
195
haystack/nodes/file_converter/pdf_xpdf.py
Normal file
195
haystack/nodes/file_converter/pdf_xpdf.py
Normal file
@ -0,0 +1,195 @@
|
||||
import logging
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from haystack.nodes.file_converter.base import BaseConverter
|
||||
from haystack.schema import Document
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PDFToTextConverter(BaseConverter):
    """
    Convert PDF files to Documents by calling the `pdftotext` command line tool.

    The tool must be available on the PATH; it ships with the xpdf and poppler-utils suites.
    """

    def __init__(
        self,
        remove_numeric_tables: bool = False,
        valid_languages: Optional[List[str]] = None,
        id_hash_keys: Optional[List[str]] = None,
        encoding: Optional[str] = "UTF-8",
        keep_physical_layout: bool = False,
    ):
        """
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for searching
                                      answers. The rows containing strings are thus retained in this option.
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add test for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be encoding error resulting
                                in garbled text.
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
        :param encoding: Encoding that will be passed as `-enc` parameter to `pdftotext`.
                         Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts, Cyrillic ...).
                         (See list of available encodings, such as "Latin1", by running `pdftotext -listenc` in the terminal)
        :param keep_physical_layout: This option will maintain original physical layout on the extracted text.
            It works by passing the `-layout` parameter to `pdftotext`. When disabled, PDF is read in the stream order.
        :raises FileNotFoundError: If the `pdftotext` executable cannot be found.
        """
        super().__init__(
            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys
        )
        try:
            # Only the presence of the binary matters here. The version banner is discarded so that
            # creating a converter does not write to the console.
            subprocess.run(
                ["pdftotext", "-v"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, shell=False, check=False
            )
        except FileNotFoundError as err:
            raise FileNotFoundError(
                """pdftotext is not installed. It is part of xpdf or poppler-utils software suite.

                   Installation on Linux:
                   wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz &&
                   tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin

                   Installation on MacOS:
                   brew install xpdf

                   You can find more details here: https://www.xpdfreader.com
                """
            ) from err

        self.encoding = encoding
        self.keep_physical_layout = keep_physical_layout

    def convert(
        self,
        file_path: Path,
        meta: Optional[Dict[str, Any]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = None,
        id_hash_keys: Optional[List[str]] = None,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
    ) -> List[Document]:
        """
        Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)

        Whether the physical layout is kept is not a parameter of this method: it is taken from the
        `keep_physical_layout` value given to the constructor.

        :param file_path: Path to the .pdf file you want to convert
        :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
                     Can be any custom keys and values.
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for searching
                                      answers. The rows containing strings are thus retained in this option.
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add test for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be encoding error resulting
                                in garbled text.
        :param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
                         (See list of available encodings by running `pdftotext -listenc` in the terminal)
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
        :param start_page: The page number where to start the conversion
        :param end_page: The page number where to end the conversion.
        :return: A list holding a single Document whose content is the text of all pages joined by form feeds.
        """
        if remove_numeric_tables is None:
            remove_numeric_tables = self.remove_numeric_tables
        if valid_languages is None:
            valid_languages = self.valid_languages
        if id_hash_keys is None:
            id_hash_keys = self.id_hash_keys

        # pdftotext tool provides an option to retain the original physical layout of a PDF page. This behaviour
        # can be toggled by using the layout param.
        #  layout=True
        #      + table structures get retained better
        #      - multi-column pages(eg, research papers) gets extracted with text from multiple columns on same line
        #  layout=False
        #      + keeps strings in content stream order, hence multi column layout works well
        #      - cells of tables gets split across line
        #
        #  Here, as a "safe" default, layout is turned off.
        pages = self._read_pdf(
            file_path,
            layout=self.keep_physical_layout,
            encoding=encoding,
            start_page=start_page,
            end_page=end_page,
        )

        cleaned_pages = []
        for page in pages:
            lines = page.splitlines()
            # The per-line tokenisation is only needed for the numeric-table heuristic.
            if remove_numeric_tables:
                kept_lines = []
                for line in lines:
                    words = line.split()
                    digits = [word for word in words if any(char.isdigit() for char in word)]

                    # remove lines having > 40% of words as digits AND not ending with a period(.)
                    if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
                        logger.debug("Removing line '%s' from %s", line, file_path)
                        continue
                    kept_lines.append(line)
                lines = kept_lines

            cleaned_pages.append("\n".join(lines))

        if valid_languages:
            document_text = "".join(cleaned_pages)
            if not self.validate_language(document_text, valid_languages):
                logger.warning(
                    "The language for %s is not one of %s. The file may not have "
                    "been decoded in the correct text format.",
                    file_path,
                    valid_languages,
                )

        text = "\f".join(cleaned_pages)
        document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys)
        return [document]

    def _read_pdf(
        self,
        file_path: Path,
        layout: bool,
        encoding: Optional[str] = None,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
    ) -> List[str]:
        """
        Extract pages from the pdf file at file_path.

        :param file_path: path of the pdf file
        :param layout: whether to retain the original physical layout for a page. If disabled, PDF pages are read in
                       the content stream order.
        :param encoding: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
                         (See list of available encodings by running `pdftotext -listenc` in the terminal)
        :param start_page: The page number where to start the conversion
        :param end_page: The page number where to end the conversion.
        :return: One string per page. Pages skipped through `start_page` are returned as empty strings
                 so that list positions keep matching page numbers.
        """
        # Never hand the literal string "None" to `-enc` when no encoding was configured at all.
        encoding = encoding or self.encoding or "UTF-8"
        start_page = start_page or 1

        command = ["pdftotext", "-enc", str(encoding), "-layout" if layout else "-raw", "-f", str(start_page)]
        if end_page is not None:
            command.extend(["-l", str(end_page)])
        command.extend([str(file_path), "-"])

        output = subprocess.run(command, stdout=subprocess.PIPE, shell=False, check=False)
        if output.returncode != 0:
            # Conversion stays best-effort, but a failure must not pass unnoticed.
            logger.warning(
                "pdftotext exited with status %s while converting %s. The extracted text may be empty or incomplete.",
                output.returncode,
                file_path,
            )

        # Decode with the encoding pdftotext was asked to produce. Decoding everything as UTF-8
        # silently drops all non-ASCII characters of e.g. Latin1 output.
        try:
            document = output.stdout.decode(str(encoding), errors="ignore")
        except LookupError:
            # pdftotext encoding names that Python does not know keep the previous UTF-8 behaviour.
            document = output.stdout.decode(errors="ignore")

        document = "\f" * (start_page - 1) + document  # tracking skipped pages for correct page numbering
        pages = document.split("\f")
        pages = pages[:-1]  # the last page in the split is always empty.

        return pages
|
||||
Loading…
x
Reference in New Issue
Block a user