feat: add converter based on pdfminer (#7607)

* Initial commit pdfminer converter * Revert back naming of argument all_text per pdfminer documentation * Add the component decorator * Add release notes * Reformat code with black * Remove LTPage and comments * Update dependencies in pyproject.toml * Added some tests and incorporated reference doc in docstring * Added some tests and incorporated reference doc in docstring
2025-12-25 05:58:57 +00:00 · 2024-05-02 03:36:54 -05:00 · 2024-05-02 03:36:54 -05:00 · 2e35f13085
commit 2e35f13085
parent 2509eeea7e
5 changed files with 292 additions and 0 deletions
--- a/haystack/components/converters/__init__.py
+++ b/haystack/components/converters/__init__.py
@ -3,6 +3,7 @@ from haystack.components.converters.html import HTMLToDocument
 from haystack.components.converters.markdown import MarkdownToDocument
 from haystack.components.converters.openapi_functions import OpenAPIServiceToFunctions
 from haystack.components.converters.output_adapter import OutputAdapter
+from haystack.components.converters.pdfminer import PDFMinerToDocument
 from haystack.components.converters.pypdf import PyPDFToDocument
 from haystack.components.converters.tika import TikaDocumentConverter
 from haystack.components.converters.txt import TextFileToDocument
@ -12,6 +13,7 @@ __all__ = [
    "TikaDocumentConverter",
    "AzureOCRDocumentConverter",
    "PyPDFToDocument",
+    "PDFMinerToDocument",
    "HTMLToDocument",
    "MarkdownToDocument",
    "OpenAPIServiceToFunctions",
--- a/haystack/components/converters/pdfminer.py
+++ b/haystack/components/converters/pdfminer.py
@ -0,0 +1,160 @@
+import io
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
+from haystack import Document, component, logging
+from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
+from haystack.dataclasses import ByteStream
+from haystack.lazy_imports import LazyImport
+
+with LazyImport("Run 'pip install pdfminer.six'") as pdfminer_import:
+    from pdfminer.high_level import extract_pages
+    from pdfminer.layout import LAParams, LTTextContainer
+
+logger = logging.getLogger(__name__)
+
+
+@component
+class PDFMinerToDocument:
+    """
+    Converts PDF files to Documents.
+
+    Uses `pdfminer` compatible converters to convert PDF files to Documents. https://pdfminersix.readthedocs.io/en/latest/
+
+    Usage example:
+    ```python
+    from datetime import datetime
+
+    from haystack.components.converters.pdfminer import PDFMinerToDocument
+
+    converter = PDFMinerToDocument()
+    results = converter.run(sources=["sample.pdf"], meta={"date_added": datetime.now().isoformat()})
+    documents = results["documents"]
+    print(documents[0].content)
+    # 'This is a text from the PDF file.'
+    ```
+    """
+
+    def __init__(
+        self,
+        line_overlap: float = 0.5,
+        char_margin: float = 2.0,
+        line_margin: float = 0.5,
+        word_margin: float = 0.1,
+        boxes_flow: Optional[float] = 0.5,
+        detect_vertical: bool = True,
+        all_texts: bool = False,
+    ) -> None:
+        """
+        Create a PDFMinerToDocument component.
+
+        All arguments are forwarded unchanged to pdfminer's `LAParams` and stored in `self.layout_params`.
+
+        :param line_overlap:
+            This parameter determines whether two characters are considered to be on
+            the same line based on the amount of overlap between them.
+            The overlap is calculated relative to the minimum height of both characters.
+        :param char_margin:
+            Determines whether two characters are part of the same line based on the distance between them.
+            If the distance is less than the margin specified, the characters are considered to be on the same line.
+            The margin is calculated relative to the width of the character.
+        :param line_margin:
+            This parameter determines whether two lines are part of the same paragraph based on
+            the distance between them. If the distance is less than the margin specified,
+            the lines are considered to be part of the same paragraph.
+            The margin is calculated relative to the height of a line.
+        :param word_margin:
+            Determines whether two characters on the same line are part of the same word
+            based on the distance between them. If the distance is greater than the margin specified,
+            an intermediate space will be added between them to make the text more readable.
+            The margin is calculated relative to the width of the character.
+        :param boxes_flow:
+            This parameter determines the importance of horizontal and vertical position when
+            determining the order of text boxes. A value between -1.0 and +1.0 can be set,
+            with -1.0 indicating that only horizontal position matters and +1.0 indicating
+            that only vertical position matters. Setting the value to 'None' will disable advanced
+            layout analysis, and text boxes will be ordered based on the position of their bottom left corner.
+        :param detect_vertical:
+            This parameter determines whether vertical text should be considered during layout analysis.
+        :param all_texts:
+            If layout analysis should be performed on text in figures.
+        """
+
+        # Verify that the lazily imported pdfminer.six dependency is available before using it.
+        pdfminer_import.check()
+
+        self.layout_params = LAParams(
+            line_overlap=line_overlap,
+            char_margin=char_margin,
+            line_margin=line_margin,
+            word_margin=word_margin,
+            boxes_flow=boxes_flow,
+            detect_vertical=detect_vertical,
+            all_texts=all_texts,
+        )
+
+    def __converter(self, extractor) -> Document:
+        """
+        Extracts text from PDF pages then converts the text into a single Document.
+
+        :param extractor:
+            Python generator that yields PDF pages.
+
+        :returns:
+            PDF text converted to Haystack Document. The text of consecutive pages is
+            separated by a form feed character ("\\f").
+        """
+        pages = []
+        for page in extractor:
+            # Keep text containers only; every other layout object on the page is skipped.
+            # Joining once per page avoids repeated string concatenation.
+            page_text = "".join(
+                container.get_text() for container in page if isinstance(container, LTTextContainer)
+            )
+            pages.append(page_text)
+
+        # Add a page delimiter
+        return Document(content="\f".join(pages))
+
+    # The declared output name must match the key of the dictionary returned by `run`.
+    @component.output_types(documents=List[Document])
+    def run(
+        self,
+        sources: List[Union[str, Path, ByteStream]],
+        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+    ):
+        """
+        Converts PDF files to Documents.
+
+        Sources that cannot be read or converted are skipped and a warning is logged,
+        so the output may contain fewer Documents than there are sources.
+
+        :param sources:
+            List of PDF file paths or ByteStream objects.
+        :param meta:
+            Optional metadata to attach to the Documents.
+            This value can be either a list of dictionaries or a single dictionary.
+            If it's a single dictionary, its content is added to the metadata of all produced Documents.
+            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
+            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
+            When a key is present in both, the value supplied in `meta` takes precedence.
+
+        :returns:
+            A dictionary with the following keys:
+            - `documents`: Created Documents
+        """
+        documents = []
+
+        meta_list = normalize_metadata(meta, sources_count=len(sources))
+
+        for source, metadata in zip(sources, meta_list):
+            try:
+                bytestream = get_bytestream_from_source(source)
+            except Exception as e:
+                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
+                continue
+            try:
+                # `extract_pages` is consumed inside `__converter`, so parsing errors surface within this `try`.
+                pdf_reader = extract_pages(io.BytesIO(bytestream.data), laparams=self.layout_params)
+                document = self.__converter(pdf_reader)
+            except Exception as e:
+                logger.warning(
+                    "Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
+                )
+                continue
+
+            # User-supplied metadata comes last so it overrides the ByteStream metadata on key clashes.
+            document.meta = {**bytestream.meta, **metadata}
+            documents.append(document)
+
+        return {"documents": documents}
--- a/pyproject.toml
+++ b/pyproject.toml
@ -109,6 +109,7 @@ extra-dependencies = [

  # Converters
  "pypdf",  # PyPDFConverter
+  "pdfminer.six", # PDFMinerToDocument
  "markdown-it-py",  # MarkdownToDocument
  "mdit_plain",  # MarkdownToDocument
  "tika",  # TikaDocumentConverter
--- a/releasenotes/notes/add-pdfminer-converter-f08f68e38ef82f4a.yaml
+++ b/releasenotes/notes/add-pdfminer-converter-f08f68e38ef82f4a.yaml
@ -0,0 +1,4 @@
+---
+enhancements:
+  - |
+   Add `PDFMinerToDocument`, a converter based on pdfminer.six that gives users the ability to customize text extraction from PDF files. It is particularly useful for PDFs with unusual layouts, such as those containing multiple text columns. For instance, users can configure the layout-analysis parameters of the component to retain the reading order.
--- a/test/components/converters/test_pdfminer_to_document.py
+++ b/test/components/converters/test_pdfminer_to_document.py
@ -0,0 +1,125 @@
+import logging
+
+import pytest
+
+from haystack.dataclasses import ByteStream
+from haystack.components.converters.pdfminer import PDFMinerToDocument
+
+
+class TestPDFMinerToDocument:
+    def test_run(self, test_files_path):
+        """
+        A sample PDF is converted into exactly one Document holding text from its pages.
+        """
+        pdf_path = test_files_path / "pdf" / "sample_pdf_1.pdf"
+        component_under_test = PDFMinerToDocument()
+        output = component_under_test.run(sources=[pdf_path])
+        documents = output["documents"]
+
+        assert len(documents) == 1
+        for document in documents:
+            assert "the page 3 is empty" in document.content
+            assert "Page 4 of Sample PDF" in document.content
+
+    def test_init_params_custom(self, test_files_path):
+        """
+        Custom init arguments end up on the layout parameters of the component.
+        """
+        component_under_test = PDFMinerToDocument(char_margin=0.5, all_texts=True)
+        layout = component_under_test.layout_params
+        assert layout.char_margin == 0.5
+        assert layout.all_texts is True
+
+    def test_run_wrong_file_type(self, test_files_path, caplog):
+        """
+        A non-PDF input is skipped with a warning and yields no Documents.
+        """
+        wav_path = test_files_path / "audio" / "answer.wav"
+        component_under_test = PDFMinerToDocument()
+
+        with caplog.at_level(logging.WARNING):
+            output = component_under_test.run(sources=[wav_path])
+            assert "Is this really a PDF?" in caplog.text
+
+        assert not output["documents"]
+
+    def test_arg_is_none(self, test_files_path):
+        """
+        A `None` init argument is passed through to the layout parameters as is.
+        """
+        component_under_test = PDFMinerToDocument(char_margin=None)
+        assert component_under_test.layout_params.char_margin is None
+
+    def test_run_doc_metadata(self, test_files_path):
+        """
+        Metadata supplied by the user is attached to the produced Document.
+        """
+        pdf_path = test_files_path / "pdf" / "sample_pdf_2.pdf"
+        user_meta = [{"file_name": "sample_pdf_2.pdf"}]
+        output = PDFMinerToDocument().run(sources=[pdf_path], meta=user_meta)
+        documents = output["documents"]
+
+        assert len(documents) == 1
+        first = documents[0]
+        assert "Ward Cunningham" in first.content
+        assert first.meta["file_name"] == "sample_pdf_2.pdf"
+
+    def test_incorrect_meta(self, test_files_path):
+        """
+        A metadata list whose length differs from the number of sources raises a ValueError.
+        """
+        pdf_path = test_files_path / "pdf" / "sample_pdf_3.pdf"
+        mismatched_meta = [{"file_name": "sample_pdf_3.pdf"}, {"file_name": "sample_pdf_2.pdf"}]
+        component_under_test = PDFMinerToDocument()
+        with pytest.raises(ValueError, match="The length of the metadata list must match the number of sources."):
+            component_under_test.run(sources=[pdf_path], meta=mismatched_meta)
+
+    def test_run_bytestream_metadata(self, test_files_path):
+        """
+        Metadata carried by a ByteStream source is copied onto the produced Document.
+        """
+        with open(test_files_path / "pdf" / "sample_pdf_2.pdf", "rb") as pdf_file:
+            raw_pdf = pdf_file.read()
+        source_stream = ByteStream(raw_pdf, meta={"content_type": "text/pdf", "url": "test_url"})
+
+        output = PDFMinerToDocument().run(sources=[source_stream])
+        documents = output["documents"]
+
+        assert len(documents) == 1
+        first = documents[0]
+        assert "Ward Cunningham" in first.content
+        assert first.meta == {"content_type": "text/pdf", "url": "test_url"}
+
+    def test_run_bytestream_doc_overlapping_metadata(self, test_files_path):
+        """
+        ByteStream metadata and user metadata are merged into the produced Document.
+
+        Both sources of metadata define the "url" key.
+
+        On such an overlap the value supplied by the user must win over the ByteStream value.
+        """
+        with open(test_files_path / "pdf" / "sample_pdf_2.pdf", "rb") as pdf_file:
+            raw_pdf = pdf_file.read()
+        # The ByteStream already carries a "url" entry in its metadata
+        source_stream = ByteStream(raw_pdf, meta={"content_type": "text/pdf", "url": "test_url_correct"})
+
+        # The user-supplied "url" is expected to replace the ByteStream one
+        user_meta = [{"file_name": "sample_pdf_2.pdf", "url": "test_url_new"}]
+        output = PDFMinerToDocument().run(sources=[source_stream], meta=user_meta)
+        documents = output["documents"]
+
+        assert len(documents) == 1
+        first = documents[0]
+        assert "Ward Cunningham" in first.content
+        assert first.meta == {"file_name": "sample_pdf_2.pdf", "content_type": "text/pdf", "url": "test_url_new"}
+
+    def test_run_error_handling(self, caplog):
+        """
+        A source that does not exist is skipped with a warning instead of raising.
+        """
+        component_under_test = PDFMinerToDocument()
+        missing_sources = ["non_existing_file.pdf"]
+        with caplog.at_level(logging.WARNING):
+            output = component_under_test.run(sources=missing_sources)
+            assert "Could not read non_existing_file.pdf" in caplog.text
+            assert output["documents"] == []