feat: adding function to detect unmapped CID characters in PDFMinerToDocument (#8992)

* adding function to detect unmapped CID characters * adding release notes * adding test for logs
2025-12-13 15:57:24 +00:00 · 2025-03-06 16:44:06 +01:00 · 2025-03-06 16:44:06 +01:00 · c037052581
commit c037052581
parent 4c9d08add5
3 changed files with 102 additions and 0 deletions
--- a/haystack/components/converters/pdfminer.py
+++ b/haystack/components/converters/pdfminer.py
@ -4,6 +4,7 @@

 import io
 import os
+import re
 from pathlib import Path
 from typing import Any, Dict, Iterator, List, Optional, Union

@ -18,6 +19,8 @@ with LazyImport("Run 'pip install pdfminer.six'") as pdfminer_import:

 logger = logging.getLogger(__name__)

+# Regex source matching one undecoded CID placeholder in extracted text, e.g. "(cid:123)".
+CID_PATTERN = r"\(cid:\d+\)"
+

@component
 class PDFMinerToDocument:
@ -97,6 +100,7 @@ class PDFMinerToDocument:
            all_texts=all_texts,
        )
        self.store_full_path = store_full_path
+        self.cid_pattern = re.compile(CID_PATTERN)

    @staticmethod
    def _converter(lt_page_objs: Iterator) -> str:
@ -126,6 +130,32 @@ class PDFMinerToDocument:

        return delimited_pages

+    def detect_undecoded_cid_characters(self, text: str) -> Dict[str, Any]:
+        """
+        Look for character sequences of CID, i.e.: characters that haven't been properly decoded from their CID format.
+
+        This is useful to detect if the text extractor is not able to extract the text correctly, e.g. if the PDF uses
+        non-standard fonts.
+
+        A PDF font may include a ToUnicode map (mapping from character code to Unicode) to support operations like
+        searching strings or copy & paste in a PDF viewer. This map immediately provides the mapping the text extractor
+        needs. If that map is not available the text extractor cannot decode the CID characters and will return them
+        as is.
+
+        see: https://pdfminersix.readthedocs.io/en/latest/faq.html#why-are-there-cid-x-values-in-the-textual-output
+
+        :param text: The text to check for undecoded CID characters
+        :returns:
+            A dictionary with the following keys:
+            - `total_chars`: the length of `text`
+            - `cid_chars`: the number of characters of `text` that are part of `(cid:N)` sequences
+            - `percentage`: `cid_chars` relative to `total_chars`, in percent, rounded to two decimals
+              (`0.0` for empty text)
+        """
+        total_chars = len(text)
+        # Each match is the full "(cid:N)" string, so summing the match lengths counts the affected characters.
+        cid_chars = sum(len(match) for match in self.cid_pattern.findall(text))
+        # Empty text would divide by zero; report 0.0 so the value is a float in every case.
+        percentage = cid_chars / total_chars * 100 if total_chars else 0.0
+
+        return {"total_chars": total_chars, "cid_chars": cid_chars, "percentage": round(percentage, 2)}
+
    @component.output_types(documents=List[Document])
    def run(
        self,
@ -178,6 +208,19 @@ class PDFMinerToDocument:

            if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
                merged_metadata["file_path"] = os.path.basename(file_path)
+
+            analysis = self.detect_undecoded_cid_characters(text)
+
+            if analysis["percentage"] > 0:
+                logger.warning(
+                    "Detected {cid_chars} undecoded CID characters in {total_chars} characters"
+                    " ({percentage}%) in {source}.",
+                    cid_chars=analysis["cid_chars"],
+                    total_chars=analysis["total_chars"],
+                    percentage=analysis["percentage"],
+                    source=source,
+                )
+
            document = Document(content=text, meta=merged_metadata)
            documents.append(document)

--- a/releasenotes/notes/adding-CID-detection-PDFMinerToDocument-0195a929d64cd502.yaml
+++ b/releasenotes/notes/adding-CID-detection-PDFMinerToDocument-0195a929d64cd502.yaml
@ -0,0 +1,5 @@
+---
+enhancements:
+  - |
+    Added `PDFMinerToDocument` functionality to detect and report undecoded CID characters in PDF text extraction, helping users identify potential
+    text extraction quality issues when processing PDFs with non-standard fonts.
--- a/test/components/converters/test_pdfminer_to_document.py
+++ b/test/components/converters/test_pdfminer_to_document.py
@ -2,6 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 import logging
+from unittest.mock import patch

 import pytest

@ -185,3 +186,56 @@ class TestPDFMinerToDocument:
            "structure, allowing structure to emerge according to the \nneeds of the users.[1] \n\n"
        )
        assert docs["documents"][6].content == expected
+
+    def test_detect_undecoded_cid_characters(self):
+        """
+        Verify the CID statistics reported for plain, mixed, CID-only and empty texts.
+        """
+        converter = PDFMinerToDocument()
+
+        # Each case pairs an input text with the number of its characters that belong to "(cid:N)" sequences.
+        cases = [
+            # no CID characters
+            ("This is a normal text without any CID characters.", 0),
+            # CID characters mixed with regular text (18 characters total)
+            ("Some text with (cid:123) and (cid:456) characters", len("(cid:123)") + len("(cid:456)")),
+            # multiple consecutive CID characters only
+            ("(cid:123)(cid:456)(cid:789)", len("(cid:123)(cid:456)(cid:789)")),
+            # empty text
+            ("", 0),
+        ]
+
+        for text, expected_cid_chars in cases:
+            result = converter.detect_undecoded_cid_characters(text)
+            expected_percentage = round((expected_cid_chars / len(text)) * 100, 2) if text else 0
+
+            assert result["total_chars"] == len(text)
+            assert result["cid_chars"] == expected_cid_chars
+            assert result["percentage"] == expected_percentage
+
+    def test_pdfminer_logs_warning_for_cid_characters(self, caplog, monkeypatch):
+        """
+        Check that a warning is logged when the converted text contains undecoded CID characters.
+        """
+        extracted_text = "This is text with (cid:123) and (cid:456) characters"
+        source = ByteStream(data=b"fake", meta={"file_path": "test.pdf"})
+
+        # Stub page extraction and page-to-text conversion so the test does not depend on a real PDF.
+        extract_pages_patch = patch(
+            "haystack.components.converters.pdfminer.extract_pages",
+            side_effect=lambda *args, **kwargs: ["mocked page"],
+        )
+        converter_patch = patch.object(
+            PDFMinerToDocument, "_converter", side_effect=lambda *args, **kwargs: extracted_text
+        )
+
+        with extract_pages_patch, converter_patch, caplog.at_level(logging.WARNING):
+            PDFMinerToDocument().run(sources=[source])
+            assert "Detected 18 undecoded CID characters in 52 characters (34.62%)" in caplog.text