feat: add converter based on pdfminer (#7607)

* Initial commit pdfminer converter

* Revert back naming of argument all_text per pdfminer documentation

* Add the component decorator

* Add release notes

* Reformat code with black

* Remove LTPage and comments

* Update dependencies in pyproject.toml

* Added some tests and incorporated reference doc in docstring

* Added some tests and incorporated reference doc in docstring
This commit is contained in:
Mo 2024-05-02 03:36:54 -05:00 committed by GitHub
parent 2509eeea7e
commit 2e35f13085
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 292 additions and 0 deletions

View File

@ -3,6 +3,7 @@ from haystack.components.converters.html import HTMLToDocument
from haystack.components.converters.markdown import MarkdownToDocument
from haystack.components.converters.openapi_functions import OpenAPIServiceToFunctions
from haystack.components.converters.output_adapter import OutputAdapter
from haystack.components.converters.pdfminer import PDFMinerToDocument
from haystack.components.converters.pypdf import PyPDFToDocument
from haystack.components.converters.tika import TikaDocumentConverter
from haystack.components.converters.txt import TextFileToDocument
@ -12,6 +13,7 @@ __all__ = [
"TikaDocumentConverter",
"AzureOCRDocumentConverter",
"PyPDFToDocument",
"PDFMinerToDocument",
"HTMLToDocument",
"MarkdownToDocument",
"OpenAPIServiceToFunctions",

View File

@ -0,0 +1,160 @@
import io
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from haystack import Document, component, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
from haystack.dataclasses import ByteStream
from haystack.lazy_imports import LazyImport
with LazyImport("Run 'pip install pdfminer.six'") as pdfminer_import:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LAParams, LTTextContainer
logger = logging.getLogger(__name__)
@component
class PDFMinerToDocument:
    """
    Converts PDF files to Documents.

    Uses the `pdfminer.six` layout analysis to extract text from PDF files.
    See https://pdfminersix.readthedocs.io/en/latest/ for details on the underlying library.

    Usage example:
    ```python
    from datetime import datetime

    from haystack.components.converters.pdfminer import PDFMinerToDocument

    converter = PDFMinerToDocument()
    results = converter.run(sources=["sample.pdf"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
    # 'This is a text from the PDF file.'
    ```
    """

    def __init__(
        self,
        line_overlap: float = 0.5,
        char_margin: float = 2.0,
        line_margin: float = 0.5,
        word_margin: float = 0.1,
        boxes_flow: Optional[float] = 0.5,
        detect_vertical: bool = True,
        all_texts: bool = False,
    ) -> None:
        """
        Create a PDFMinerToDocument component.

        All arguments are forwarded unchanged to `pdfminer.layout.LAParams`.

        :param line_overlap:
            This parameter determines whether two characters are considered to be on
            the same line based on the amount of overlap between them.
            The overlap is calculated relative to the minimum height of both characters.
        :param char_margin:
            Determines whether two characters are part of the same line based on the distance between them.
            If the distance is less than the margin specified, the characters are considered to be on the same line.
            The margin is calculated relative to the width of the character.
        :param line_margin:
            This parameter determines whether two lines are part of the same paragraph based on
            the distance between them. If the distance is less than the margin specified,
            the lines are considered to be part of the same paragraph.
            The margin is calculated relative to the height of a line.
        :param word_margin:
            Determines whether two characters on the same line are part of the same word
            based on the distance between them. If the distance is greater than the margin specified,
            an intermediate space will be added between them to make the text more readable.
            The margin is calculated relative to the width of the character.
        :param boxes_flow:
            This parameter determines the importance of horizontal and vertical position when
            determining the order of text boxes. A value between -1.0 and +1.0 can be set,
            with -1.0 indicating that only horizontal position matters and +1.0 indicating
            that only vertical position matters. Setting the value to 'None' will disable advanced
            layout analysis, and text boxes will be ordered based on the position of their bottom left corner.
        :param detect_vertical:
            This parameter determines whether vertical text should be considered during layout analysis.
        :param all_texts:
            If layout analysis should be performed on text in figures.
        """
        # Fail early with an installation hint if pdfminer.six is not available.
        pdfminer_import.check()

        self.layout_params = LAParams(
            line_overlap=line_overlap,
            char_margin=char_margin,
            line_margin=line_margin,
            word_margin=word_margin,
            boxes_flow=boxes_flow,
            detect_vertical=detect_vertical,
            all_texts=all_texts,
        )

    def __converter(self, extractor) -> Document:
        """
        Extracts text from PDF pages then converts the text into a Document.

        :param extractor:
            Python generator that yields PDF pages.

        :returns:
            PDF text converted to Haystack Document. The text of consecutive pages
            is separated by a form feed character.
        """
        pages = []
        for page in extractor:
            # Keep text containers only; other layout elements are ignored.
            page_text = "".join(
                container.get_text() for container in page if isinstance(container, LTTextContainer)
            )
            pages.append(page_text)

        # Add a page delimiter
        return Document(content="\f".join(pages))

    # The declared output name must match the key of the dictionary returned by `run`.
    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ):
        """
        Converts PDF files to Documents.

        Sources that cannot be read or converted are skipped with a warning instead of raising.

        :param sources:
            List of PDF file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the Documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced Documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
            When a key is present in both, the value passed in `meta` takes precedence.

        :returns:
            A dictionary with the following keys:
            - `documents`: Created Documents
        """
        documents = []
        meta_list = normalize_metadata(meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as e:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                continue

            try:
                # extract_pages is lazy: parsing happens while the converter iterates,
                # so both calls have to stay inside the same try block.
                pdf_reader = extract_pages(io.BytesIO(bytestream.data), laparams=self.layout_params)
                document = self.__converter(pdf_reader)
            except Exception as e:
                logger.warning(
                    "Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
                )
                continue

            # User supplied metadata overrides the metadata carried by the ByteStream.
            document.meta = {**bytestream.meta, **metadata}
            documents.append(document)

        return {"documents": documents}

View File

@ -109,6 +109,7 @@ extra-dependencies = [
# Converters
"pypdf", # PyPDFConverter
"pdfminer.six", # PDFMinerToDocument
"markdown-it-py", # MarkdownToDocument
"mdit_plain", # MarkdownToDocument
"tika", # TikaDocumentConverter

View File

@ -0,0 +1,4 @@
---
enhancements:
- |
Add the `PDFMinerToDocument` converter, which uses pdfminer.six to give users the ability to customize text extraction from PDF files. It is particularly useful for PDFs with unusual layouts, such as those containing multiple text columns. For instance, users can tune the layout-analysis parameters so that the extracted text retains the reading order.

View File

@ -0,0 +1,125 @@
import logging
import pytest
from haystack.dataclasses import ByteStream
from haystack.components.converters.pdfminer import PDFMinerToDocument
class TestPDFMinerToDocument:
    """Tests for the `PDFMinerToDocument` converter component."""

    def test_run(self, test_files_path):
        """
        A single PDF source produces a single Document containing the expected text.
        """
        pdf_path = test_files_path / "pdf" / "sample_pdf_1.pdf"
        component_under_test = PDFMinerToDocument()

        documents = component_under_test.run(sources=[pdf_path])["documents"]

        assert len(documents) == 1
        for document in documents:
            assert "the page 3 is empty" in document.content
            assert "Page 4 of Sample PDF" in document.content

    def test_init_params_custom(self, test_files_path):
        """
        Custom init arguments end up on the layout parameters of the component.
        """
        layout = PDFMinerToDocument(char_margin=0.5, all_texts=True).layout_params

        assert layout.char_margin == 0.5
        assert layout.all_texts is True

    def test_run_wrong_file_type(self, test_files_path, caplog):
        """
        A source that is not a PDF is reported with a warning and yields no Documents.
        """
        component_under_test = PDFMinerToDocument()
        wav_path = test_files_path / "audio" / "answer.wav"

        with caplog.at_level(logging.WARNING):
            output = component_under_test.run(sources=[wav_path])
            assert "Is this really a PDF?" in caplog.text

        assert not output["documents"]

    def test_arg_is_none(self, test_files_path):
        """
        A `None` init argument is stored as-is on the layout parameters.
        """
        layout = PDFMinerToDocument(char_margin=None).layout_params

        assert layout.char_margin is None

    def test_run_doc_metadata(self, test_files_path):
        """
        Metadata supplied by the user is attached to the produced Document.
        """
        pdf_path = test_files_path / "pdf" / "sample_pdf_2.pdf"
        user_meta = [{"file_name": "sample_pdf_2.pdf"}]

        output = PDFMinerToDocument().run(sources=[pdf_path], meta=user_meta)
        documents = output["documents"]

        assert len(documents) == 1
        only_document = documents[0]
        assert "Ward Cunningham" in only_document.content
        assert only_document.meta["file_name"] == "sample_pdf_2.pdf"

    def test_incorrect_meta(self, test_files_path):
        """
        A metadata list longer than the list of sources raises a ValueError.
        """
        pdf_path = test_files_path / "pdf" / "sample_pdf_3.pdf"
        too_many_entries = [{"file_name": "sample_pdf_3.pdf"}, {"file_name": "sample_pdf_2.pdf"}]
        component_under_test = PDFMinerToDocument()

        with pytest.raises(ValueError, match="The length of the metadata list must match the number of sources."):
            component_under_test.run(sources=[pdf_path], meta=too_many_entries)

    def test_run_bytestream_metadata(self, test_files_path):
        """
        Metadata carried by a ByteStream source is copied to the produced Document.
        """
        with open(test_files_path / "pdf" / "sample_pdf_2.pdf", "rb") as pdf_file:
            raw_pdf = pdf_file.read()
        source = ByteStream(raw_pdf, meta={"content_type": "text/pdf", "url": "test_url"})

        documents = PDFMinerToDocument().run(sources=[source])["documents"]

        assert len(documents) == 1
        only_document = documents[0]
        assert "Ward Cunningham" in only_document.content
        assert only_document.meta == {"content_type": "text/pdf", "url": "test_url"}

    def test_run_bytestream_doc_overlapping_metadata(self, test_files_path):
        """
        ByteStream metadata and user metadata are merged into the produced Document.

        Both sources define the "url" key; the value supplied by the user is expected
        to overwrite the one carried by the ByteStream.
        """
        with open(test_files_path / "pdf" / "sample_pdf_2.pdf", "rb") as pdf_file:
            raw_pdf = pdf_file.read()

        # The ByteStream already carries a "url" entry ...
        source = ByteStream(raw_pdf, meta={"content_type": "text/pdf", "url": "test_url_correct"})
        # ... which the user supplied "url" entry is expected to replace.
        user_meta = [{"file_name": "sample_pdf_2.pdf", "url": "test_url_new"}]

        output = PDFMinerToDocument().run(sources=[source], meta=user_meta)
        documents = output["documents"]

        assert len(documents) == 1
        only_document = documents[0]
        assert "Ward Cunningham" in only_document.content
        assert only_document.meta == {
            "file_name": "sample_pdf_2.pdf",
            "content_type": "text/pdf",
            "url": "test_url_new",
        }

    def test_run_error_handling(self, caplog):
        """
        A source that does not exist is logged as a warning and yields no Documents.
        """
        component_under_test = PDFMinerToDocument()
        missing_sources = ["non_existing_file.pdf"]

        with caplog.at_level(logging.WARNING):
            output = component_under_test.run(sources=missing_sources)
            assert "Could not read non_existing_file.pdf" in caplog.text
            assert output["documents"] == []