mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-13 15:57:24 +00:00
feat: adding function to detect unmapped CID characters in PDFMinerToDocument (#8992)
* adding function to detect unmapped CID characters * adding release notes * adding test for logs
This commit is contained in:
parent
4c9d08add5
commit
c037052581
@ -4,6 +4,7 @@
|
||||
|
||||
import io
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterator, List, Optional, Union
|
||||
|
||||
@ -18,6 +19,8 @@ with LazyImport("Run 'pip install pdfminer.six'") as pdfminer_import:
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
CID_PATTERN = r"\(cid:\d+\)" # regex pattern to detect CID characters
|
||||
|
||||
|
||||
@component
|
||||
class PDFMinerToDocument:
|
||||
@ -97,6 +100,7 @@ class PDFMinerToDocument:
|
||||
all_texts=all_texts,
|
||||
)
|
||||
self.store_full_path = store_full_path
|
||||
self.cid_pattern = re.compile(CID_PATTERN)
|
||||
|
||||
@staticmethod
|
||||
def _converter(lt_page_objs: Iterator) -> str:
|
||||
@ -126,6 +130,32 @@ class PDFMinerToDocument:
|
||||
|
||||
return delimited_pages
|
||||
|
||||
def detect_undecoded_cid_characters(self, text: str) -> Dict[str, Any]:
    """
    Look for character sequences of CID, i.e.: characters that haven't been properly decoded from their CID format.

    This is useful to detect if the text extractor is not able to extract the text correctly, e.g. if the PDF uses
    non-standard fonts.

    A PDF font may include a ToUnicode map (mapping from character code to Unicode) to support operations like
    searching strings or copy & paste in a PDF viewer. This map immediately provides the mapping the text extractor
    needs. If that map is not available the text extractor cannot decode the CID characters and will return them
    as is.

    see: https://pdfminersix.readthedocs.io/en/latest/faq.html#why-are-there-cid-x-values-in-the-textual-output

    :param text: The text to check for undecoded CID characters.
    :returns:
        A dictionary containing detection results, with the keys:
        - `total_chars`: the length of `text`.
        - `cid_chars`: the combined length of all CID markers matched by `self.cid_pattern` in `text`.
        - `percentage`: the share of `text` occupied by those markers, in percent, rounded to two decimals
          (`0.0` when `text` is empty).
    """
    total_chars = len(text)

    # Every match is the complete marker string (e.g. "(cid:123)"), so summing the match lengths counts the
    # characters occupied by markers, not the number of markers.
    cid_chars = sum(len(marker) for marker in self.cid_pattern.findall(text))

    # Empty input would divide by zero; report 0.0 so the value is a float on every path.
    percentage = cid_chars / total_chars * 100 if total_chars else 0.0

    return {"total_chars": total_chars, "cid_chars": cid_chars, "percentage": round(percentage, 2)}
|
||||
|
||||
@component.output_types(documents=List[Document])
|
||||
def run(
|
||||
self,
|
||||
@ -178,6 +208,19 @@ class PDFMinerToDocument:
|
||||
|
||||
if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
|
||||
merged_metadata["file_path"] = os.path.basename(file_path)
|
||||
|
||||
analysis = self.detect_undecoded_cid_characters(text)
|
||||
|
||||
if analysis["percentage"] > 0:
|
||||
logger.warning(
|
||||
"Detected {cid_chars} undecoded CID characters in {total_chars} characters"
|
||||
" ({percentage}%) in {source}.",
|
||||
cid_chars=analysis["cid_chars"],
|
||||
total_chars=analysis["total_chars"],
|
||||
percentage=analysis["percentage"],
|
||||
source=source,
|
||||
)
|
||||
|
||||
document = Document(content=text, meta=merged_metadata)
|
||||
documents.append(document)
|
||||
|
||||
|
||||
@ -0,0 +1,5 @@
|
||||
---
|
||||
enhancements:
|
||||
- |
|
||||
Added `PDFMinerToDocument` functionality to detect and report undecoded CID characters in PDF text extraction, helping users identify potential
|
||||
text extraction quality issues when processing PDFs with non-standard fonts.
|
||||
@ -2,6 +2,7 @@
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import logging
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
@ -185,3 +186,56 @@ class TestPDFMinerToDocument:
|
||||
"structure, allowing structure to emerge according to the \nneeds of the users.[1] \n\n"
|
||||
)
|
||||
assert docs["documents"][6].content == expected
|
||||
|
||||
def test_detect_undecoded_cid_characters(self):
    """
    Check the statistics returned by `detect_undecoded_cid_characters` for a range of inputs.
    """
    converter = PDFMinerToDocument()

    plain_text = "This is a normal text without any CID characters."
    mixed_text = "Some text with (cid:123) and (cid:456) characters"
    consecutive_cids = "(cid:123)(cid:456)(cid:789)"

    # Each case: (input text, expected "cid_chars", expected "percentage").
    cases = [
        # no CID characters at all
        (plain_text, 0, 0),
        # two CID markers embedded in regular text: 18 marker characters in total
        (mixed_text, len("(cid:123)") + len("(cid:456)"), round((18 / len(mixed_text)) * 100, 2)),
        # nothing but consecutive CID markers
        (consecutive_cids, len("(cid:123)(cid:456)(cid:789)"), 100.0),
        # empty text
        ("", 0, 0),
    ]

    for text, expected_cid_chars, expected_percentage in cases:
        result = converter.detect_undecoded_cid_characters(text)
        assert result["total_chars"] == len(text)
        assert result["cid_chars"] == expected_cid_chars
        assert result["percentage"] == expected_percentage
|
||||
|
||||
def test_pdfminer_logs_warning_for_cid_characters(self, caplog, monkeypatch):
    """
    Check that running the converter on text containing undecoded CID characters logs a warning.
    """
    fake_pdf = ByteStream(data=b"fake", meta={"file_path": "test.pdf"})

    # Replace page extraction and page-to-text conversion so that no real PDF parsing takes place and the
    # text handed to the CID check is fixed.
    extract_pages_patch = patch(
        "haystack.components.converters.pdfminer.extract_pages",
        side_effect=lambda *args, **kwargs: ["mocked page"],
    )
    converter_patch = patch.object(
        PDFMinerToDocument,
        "_converter",
        side_effect=lambda *args, **kwargs: "This is text with (cid:123) and (cid:456) characters",
    )

    with extract_pages_patch, converter_patch, caplog.at_level(logging.WARNING):
        component_under_test = PDFMinerToDocument()
        component_under_test.run(sources=[fake_pdf])
        assert "Detected 18 undecoded CID characters in 52 characters (34.62%)" in caplog.text
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user