feat: adding function to detect unmapped CID characters in PDFMinerToDocument (#8992)

* adding function to detect unmapped CID characters

* adding release notes

* adding test for logs
This commit is contained in:
David S. Batista 2025-03-06 16:44:06 +01:00 committed by GitHub
parent 4c9d08add5
commit c037052581
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 102 additions and 0 deletions

View File

@ -4,6 +4,7 @@
import io
import os
import re
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional, Union
@ -18,6 +19,8 @@ with LazyImport("Run 'pip install pdfminer.six'") as pdfminer_import:
logger = logging.getLogger(__name__)
# Regex matching the literal "(cid:<digits>)" placeholders that remain in the extracted text when a
# character could not be decoded from its CID.
CID_PATTERN = r"\(cid:\d+\)"
@component
class PDFMinerToDocument:
@ -97,6 +100,7 @@ class PDFMinerToDocument:
all_texts=all_texts,
)
self.store_full_path = store_full_path
self.cid_pattern = re.compile(CID_PATTERN)
@staticmethod
def _converter(lt_page_objs: Iterator) -> str:
@ -126,6 +130,32 @@ class PDFMinerToDocument:
return delimited_pages
def detect_undecoded_cid_characters(self, text: str) -> Dict[str, Any]:
    """
    Measure how much of a text is made up of undecoded CID sequences such as `(cid:123)`.

    Such sequences are characters that haven't been properly decoded from their CID format. Finding them is
    useful to detect that the text extractor was not able to extract the text correctly, e.g. if the PDF uses
    non-standard fonts.

    A PDF font may include a ToUnicode map (mapping from character code to Unicode) to support operations like
    searching strings or copy & paste in a PDF viewer. This map immediately provides the mapping the text extractor
    needs. If that map is not available the text extractor cannot decode the CID characters and will return them
    as is.

    see: https://pdfminersix.readthedocs.io/en/latest/faq.html#why-are-there-cid-x-values-in-the-textual-output

    :param text: The text to check for undecoded CID characters
    :returns:
        A dictionary with the keys `total_chars` (length of `text`), `cid_chars` (number of characters that
        belong to CID sequences) and `percentage` (share of CID characters in the text, rounded to two
        decimals; 0 for an empty text).
    """
    total_chars = len(text)

    # Every match is a full "(cid:N)" sequence; its whole length counts as undecoded characters.
    cid_chars = 0
    for sequence in re.findall(self.cid_pattern, text):
        cid_chars += len(sequence)

    # Guard against dividing by zero when the text is empty.
    if total_chars > 0:
        percentage = cid_chars / total_chars * 100
    else:
        percentage = 0

    return {"total_chars": total_chars, "cid_chars": cid_chars, "percentage": round(percentage, 2)}
@component.output_types(documents=List[Document])
def run(
self,
@ -178,6 +208,19 @@ class PDFMinerToDocument:
if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
analysis = self.detect_undecoded_cid_characters(text)
if analysis["percentage"] > 0:
logger.warning(
"Detected {cid_chars} undecoded CID characters in {total_chars} characters"
" ({percentage}%) in {source}.",
cid_chars=analysis["cid_chars"],
total_chars=analysis["total_chars"],
percentage=analysis["percentage"],
source=source,
)
document = Document(content=text, meta=merged_metadata)
documents.append(document)

View File

@ -0,0 +1,5 @@
---
enhancements:
- |
`PDFMinerToDocument` now detects and reports undecoded CID characters in the text it extracts from PDFs, helping users identify potential
text extraction quality issues when processing PDFs with non-standard fonts.

View File

@ -2,6 +2,7 @@
#
# SPDX-License-Identifier: Apache-2.0
import logging
from unittest.mock import patch
import pytest
@ -185,3 +186,56 @@ class TestPDFMinerToDocument:
"structure, allowing structure to emerge according to the \nneeds of the users.[1] \n\n"
)
assert docs["documents"][6].content == expected
def test_detect_undecoded_cid_characters(self):
    """
    Check the CID statistics reported for plain, mixed, CID-only and empty text.
    """
    detector = PDFMinerToDocument()

    # Plain text: nothing should be counted as CID
    plain = "This is a normal text without any CID characters."
    stats = detector.detect_undecoded_cid_characters(plain)
    assert stats["total_chars"] == len(plain)
    assert stats["cid_chars"] == 0
    assert stats["percentage"] == 0

    # Two CID sequences embedded in regular text
    mixed = "Some text with (cid:123) and (cid:456) characters"
    stats = detector.detect_undecoded_cid_characters(mixed)
    assert stats["total_chars"] == len(mixed)
    assert stats["cid_chars"] == len("(cid:123)") + len("(cid:456)")  # 18 characters total
    assert stats["percentage"] == round((18 / len(mixed)) * 100, 2)

    # Back-to-back CID sequences and nothing else
    only_cids = "(cid:123)(cid:456)(cid:789)"
    stats = detector.detect_undecoded_cid_characters(only_cids)
    assert stats["total_chars"] == len(only_cids)
    assert stats["cid_chars"] == len("(cid:123)(cid:456)(cid:789)")
    assert stats["percentage"] == 100.0

    # Empty input must not divide by zero
    empty = ""
    stats = detector.detect_undecoded_cid_characters(empty)
    assert stats["total_chars"] == 0
    assert stats["cid_chars"] == 0
    assert stats["percentage"] == 0
def test_pdfminer_logs_warning_for_cid_characters(self, caplog, monkeypatch):
    """
    Check that running the converter on text containing undecoded CID characters logs a warning.
    """
    stream = ByteStream(data=b"fake", meta={"file_path": "test.pdf"})

    # Replace page extraction and page-to-text conversion so no real PDF parsing happens;
    # the converted text carries two CID sequences (18 of its 52 characters).
    pages_patch = patch("haystack.components.converters.pdfminer.extract_pages", return_value=["mocked page"])
    text_patch = patch.object(
        PDFMinerToDocument, "_converter", return_value="This is text with (cid:123) and (cid:456) characters"
    )

    with pages_patch, text_patch, caplog.at_level(logging.WARNING):
        PDFMinerToDocument().run(sources=[stream])
        assert "Detected 18 undecoded CID characters in 52 characters (34.62%)" in caplog.text