feat: adding function to detect unmapped CID characters in PDFMinerToDocument (#8992)

* adding function to detect unmapped CID characters * adding release notes * adding test for logs
2025-12-16 09:38:07 +00:00 · 2025-03-06 16:44:06 +01:00 · 2025-03-06 16:44:06 +01:00 · c037052581
commit c037052581
parent 4c9d08add5
3 changed files with 102 additions and 0 deletions
--- a/haystack/components/converters/pdfminer.py
+++ b/haystack/components/converters/pdfminer.py
@ -4,6 +4,7 @@
 import io
 import os
 import re
 from pathlib import Path
 from typing import Any, Dict, Iterator, List, Optional, Union
@ -18,6 +19,8 @@ with LazyImport("Run 'pip install pdfminer.six'") as pdfminer_import:
 logger = logging.getLogger(__name__)
 CID_PATTERN = r"\(cid:\d+\)"  # regex pattern to detect CID characters
@component
 class PDFMinerToDocument:
@ -97,6 +100,7 @@ class PDFMinerToDocument:
            all_texts=all_texts,
        )
        self.store_full_path = store_full_path
        self.cid_pattern = re.compile(CID_PATTERN)
    @staticmethod
    def _converter(lt_page_objs: Iterator) -> str:
@ -126,6 +130,32 @@ class PDFMinerToDocument:
        return delimited_pages
    def detect_undecoded_cid_characters(self, text: str) -> Dict[str, Any]:
        """
        Look for undecoded CID sequences, i.e. characters that haven't been properly decoded from their CID format.

        This is useful to detect if the text extractor is not able to extract the text correctly, e.g. if the PDF uses
        non-standard fonts.

        A PDF font may include a ToUnicode map (mapping from character code to Unicode) to support operations like
        searching strings or copy & paste in a PDF viewer. This map immediately provides the mapping the text extractor
        needs. If that map is not available the text extractor cannot decode the CID characters and will return them
        as is.

        see: https://pdfminersix.readthedocs.io/en/latest/faq.html#why-are-there-cid-x-values-in-the-textual-output

        :param text: The text to check for undecoded CID characters.
        :returns:
            A dictionary with the following keys:
            - `total_chars`: the length of `text`.
            - `cid_chars`: the number of characters of `text` that belong to `(cid:N)` placeholders.
            - `percentage`: `cid_chars` as a percentage of `total_chars`, a float rounded to two decimals
              (`0.0` when `text` is empty).
        """
        total_chars = len(text)
        # Count the characters occupied by the placeholders, not the number of placeholders, so the value is
        # directly comparable with `total_chars`.
        cid_chars = sum(len(match.group(0)) for match in self.cid_pattern.finditer(text))
        # Guard against division by zero on empty text and keep the result a float on every path.
        percentage = round(cid_chars / total_chars * 100, 2) if total_chars else 0.0
        return {"total_chars": total_chars, "cid_chars": cid_chars, "percentage": percentage}
    @component.output_types(documents=List[Document])
    def run(
        self,
@ -178,6 +208,19 @@ class PDFMinerToDocument:
            if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
                merged_metadata["file_path"] = os.path.basename(file_path)
            analysis = self.detect_undecoded_cid_characters(text)
            if analysis["percentage"] > 0:
                logger.warning(
                    "Detected {cid_chars} undecoded CID characters in {total_chars} characters"
                    " ({percentage}%) in {source}.",
                    cid_chars=analysis["cid_chars"],
                    total_chars=analysis["total_chars"],
                    percentage=analysis["percentage"],
                    source=source,
                )
            document = Document(content=text, meta=merged_metadata)
            documents.append(document)
--- a/releasenotes/notes/adding-CID-detection-PDFMinerToDocument-0195a929d64cd502.yaml
+++ b/releasenotes/notes/adding-CID-detection-PDFMinerToDocument-0195a929d64cd502.yaml
@ -0,0 +1,5 @@
 ---
 enhancements:
  - |
    `PDFMinerToDocument` now detects undecoded CID characters (e.g. `(cid:123)`) in the text extracted from a PDF and logs a warning
    reporting how much of the text they make up, helping users identify potential text extraction quality issues when processing PDFs with non-standard fonts.
--- a/test/components/converters/test_pdfminer_to_document.py
+++ b/test/components/converters/test_pdfminer_to_document.py
@ -2,6 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 import logging
 from unittest.mock import patch
 import pytest
@ -185,3 +186,56 @@ class TestPDFMinerToDocument:
            "structure, allowing structure to emerge according to the \nneeds of the users.[1] \n\n"
        )
        assert docs["documents"][6].content == expected
    def test_detect_undecoded_cid_characters(self):
        """
        Check the statistics returned by `detect_undecoded_cid_characters` for several kinds of input text.
        """
        converter = PDFMinerToDocument()

        plain = "This is a normal text without any CID characters."
        mixed = "Some text with (cid:123) and (cid:456) characters"
        only_cids = "(cid:123)(cid:456)(cid:789)"

        # Each entry: (input text, expected "cid_chars", expected "percentage")
        scenarios = [
            # No CID characters at all
            (plain, 0, 0),
            # Two CID placeholders embedded in regular text: 18 characters total
            (mixed, len("(cid:123)") + len("(cid:456)"), round((18 / len(mixed)) * 100, 2)),
            # Nothing but consecutive CID placeholders
            (only_cids, len("(cid:123)(cid:456)(cid:789)"), 100.0),
            # Empty text
            ("", 0, 0),
        ]

        for sample, expected_cid_chars, expected_percentage in scenarios:
            stats = converter.detect_undecoded_cid_characters(sample)
            assert stats["total_chars"] == len(sample)
            assert stats["cid_chars"] == expected_cid_chars
            assert stats["percentage"] == expected_percentage
    def test_pdfminer_logs_warning_for_cid_characters(self, caplog, monkeypatch):
        """
        Ensure `run` emits a warning when the extracted text contains undecoded CID characters.
        """
        # Page extraction and conversion are both stubbed out below, so the payload need not be a real PDF.
        stream = ByteStream(data=b"fake", meta={"file_path": "test.pdf"})

        def fake_extract_pages(*args, **kwargs):
            return ["mocked page"]

        def fake_converter(*args, **kwargs):
            return "This is text with (cid:123) and (cid:456) characters"

        pages_patch = patch("haystack.components.converters.pdfminer.extract_pages", side_effect=fake_extract_pages)
        converter_patch = patch.object(PDFMinerToDocument, "_converter", side_effect=fake_converter)

        # Context managers are entered left to right: both patches first, then the log capture.
        with pages_patch, converter_patch, caplog.at_level(logging.WARNING):
            PDFMinerToDocument().run(sources=[stream])
            assert "Detected 18 undecoded CID characters in 52 characters (34.62%)" in caplog.text