diff --git a/haystack/components/converters/pdfminer.py b/haystack/components/converters/pdfminer.py index 6c8fc6cdc..1ba6137b5 100644 --- a/haystack/components/converters/pdfminer.py +++ b/haystack/components/converters/pdfminer.py @@ -4,6 +4,7 @@ import io import os +import re from pathlib import Path from typing import Any, Dict, Iterator, List, Optional, Union @@ -18,6 +19,8 @@ with LazyImport("Run 'pip install pdfminer.six'") as pdfminer_import: logger = logging.getLogger(__name__) +CID_PATTERN = r"\(cid:\d+\)" # regex pattern to detect CID characters + @component class PDFMinerToDocument: @@ -97,6 +100,7 @@ class PDFMinerToDocument: all_texts=all_texts, ) self.store_full_path = store_full_path + self.cid_pattern = re.compile(CID_PATTERN) @staticmethod def _converter(lt_page_objs: Iterator) -> str: @@ -126,6 +130,32 @@ class PDFMinerToDocument: return delimited_pages + def detect_undecoded_cid_characters(self, text: str) -> Dict[str, Any]: + """ + Look for undecoded CID sequences, i.e. characters that haven't been properly decoded from their CID format. + + This is useful to detect if the text extractor is not able to extract the text correctly, e.g. if the PDF uses + non-standard fonts. + + A PDF font may include a ToUnicode map (mapping from character code to Unicode) to support operations like + searching strings or copy & paste in a PDF viewer. This map immediately provides the mapping the text extractor + needs. If that map is not available, the text extractor cannot decode the CID characters and will return them + as is. 
+ + See: https://pdfminersix.readthedocs.io/en/latest/faq.html#why-are-there-cid-x-values-in-the-textual-output + + :param text: The text to check for undecoded CID characters + :returns: + A dictionary with the keys `total_chars`, `cid_chars` and `percentage` + """ + + matches = re.findall(self.cid_pattern, text) + total_chars = len(text) + cid_chars = sum(len(match) for match in matches) + percentage = (cid_chars / total_chars * 100) if total_chars > 0 else 0 + + return {"total_chars": total_chars, "cid_chars": cid_chars, "percentage": round(percentage, 2)} + @component.output_types(documents=List[Document]) def run( self, @@ -178,6 +208,19 @@ class PDFMinerToDocument: if not self.store_full_path and (file_path := bytestream.meta.get("file_path")): merged_metadata["file_path"] = os.path.basename(file_path) + + analysis = self.detect_undecoded_cid_characters(text) + + if analysis["percentage"] > 0: + logger.warning( + "Detected {cid_chars} undecoded CID characters in {total_chars} characters" + " ({percentage}%) in {source}.", + cid_chars=analysis["cid_chars"], + total_chars=analysis["total_chars"], + percentage=analysis["percentage"], + source=source, + ) + document = Document(content=text, meta=merged_metadata) documents.append(document) diff --git a/releasenotes/notes/adding-CID-detection-PDFMinerToDocument-0195a929d64cd502.yaml b/releasenotes/notes/adding-CID-detection-PDFMinerToDocument-0195a929d64cd502.yaml new file mode 100644 index 000000000..2855d16bc --- /dev/null +++ b/releasenotes/notes/adding-CID-detection-PDFMinerToDocument-0195a929d64cd502.yaml @@ -0,0 +1,5 @@ +--- +enhancements: + - | + `PDFMinerToDocument` now detects undecoded CID characters in the extracted PDF text and logs a warning, helping users identify potential + text extraction quality issues when processing PDFs with non-standard fonts. 
diff --git a/test/components/converters/test_pdfminer_to_document.py b/test/components/converters/test_pdfminer_to_document.py index 4691a2a1a..00d751efa 100644 --- a/test/components/converters/test_pdfminer_to_document.py +++ b/test/components/converters/test_pdfminer_to_document.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 import logging +from unittest.mock import patch import pytest @@ -185,3 +186,56 @@ class TestPDFMinerToDocument: "structure, allowing structure to emerge according to the \nneeds of the users.[1] \n\n" ) assert docs["documents"][6].content == expected + + def test_detect_undecoded_cid_characters(self): + """ + Test if the component correctly detects and reports undecoded CID characters in text. + """ + converter = PDFMinerToDocument() + + # Test text with no CID characters + text = "This is a normal text without any CID characters." + result = converter.detect_undecoded_cid_characters(text) + assert result["total_chars"] == len(text) + assert result["cid_chars"] == 0 + assert result["percentage"] == 0 + + # Test text with CID characters + text = "Some text with (cid:123) and (cid:456) characters" + result = converter.detect_undecoded_cid_characters(text) + assert result["total_chars"] == len(text) + assert result["cid_chars"] == len("(cid:123)") + len("(cid:456)") # 18 characters total + assert result["percentage"] == round((18 / len(text)) * 100, 2) + + # Test text with multiple consecutive CID characters + text = "(cid:123)(cid:456)(cid:789)" + result = converter.detect_undecoded_cid_characters(text) + assert result["total_chars"] == len(text) + assert result["cid_chars"] == len("(cid:123)(cid:456)(cid:789)") + assert result["percentage"] == 100.0 + + # Test empty text + text = "" + result = converter.detect_undecoded_cid_characters(text) + assert result["total_chars"] == 0 + assert result["cid_chars"] == 0 + assert result["percentage"] == 0 + + def test_pdfminer_logs_warning_for_cid_characters(self, caplog, monkeypatch): + """ 
+ Test if the component correctly logs a warning when undecoded CID characters are detected. + """ + test_data = ByteStream(data=b"fake", meta={"file_path": "test.pdf"}) + + def mock_converter(*args, **kwargs): + return "This is text with (cid:123) and (cid:456) characters" + + def mock_extract_pages(*args, **kwargs): + return ["mocked page"] + + with patch("haystack.components.converters.pdfminer.extract_pages", side_effect=mock_extract_pages): + with patch.object(PDFMinerToDocument, "_converter", side_effect=mock_converter): + with caplog.at_level(logging.WARNING): + converter = PDFMinerToDocument() + converter.run(sources=[test_data]) + assert "Detected 18 undecoded CID characters in 52 characters (34.62%)" in caplog.text