mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-16 09:38:07 +00:00
feat: adding function to detect unmapped CID characters in PDFMinerToDocument (#8992)
* adding function to detect unmapped CID characters * adding release notes * adding test for logs
This commit is contained in:
parent
4c9d08add5
commit
c037052581
@ -4,6 +4,7 @@
|
|||||||
|
|
||||||
import io
|
import io
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, Iterator, List, Optional, Union
|
from typing import Any, Dict, Iterator, List, Optional, Union
|
||||||
|
|
||||||
@ -18,6 +19,8 @@ with LazyImport("Run 'pip install pdfminer.six'") as pdfminer_import:
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Regex source matching a literal "(cid:<digits>)" token, e.g. "(cid:123)"; used to spot undecoded CID characters.
CID_PATTERN = r"\(cid:\d+\)"
|
||||||
|
|
||||||
|
|
||||||
@component
|
@component
|
||||||
class PDFMinerToDocument:
|
class PDFMinerToDocument:
|
||||||
@ -97,6 +100,7 @@ class PDFMinerToDocument:
|
|||||||
all_texts=all_texts,
|
all_texts=all_texts,
|
||||||
)
|
)
|
||||||
self.store_full_path = store_full_path
|
self.store_full_path = store_full_path
|
||||||
|
self.cid_pattern = re.compile(CID_PATTERN)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _converter(lt_page_objs: Iterator) -> str:
|
def _converter(lt_page_objs: Iterator) -> str:
|
||||||
@ -126,6 +130,32 @@ class PDFMinerToDocument:
|
|||||||
|
|
||||||
return delimited_pages
|
return delimited_pages
|
||||||
|
|
||||||
|
def detect_undecoded_cid_characters(self, text: str) -> Dict[str, Any]:
    """
    Look for undecoded CID sequences, i.e. characters that were not mapped from their CID to Unicode.

    This is useful to detect if the text extractor is not able to extract the text correctly, e.g. if the PDF uses
    non-standard fonts.

    A PDF font may include a ToUnicode map (mapping from character code to Unicode) to support operations like
    searching strings or copy & paste in a PDF viewer. This map immediately provides the mapping the text extractor
    needs. If that map is not available the text extractor cannot decode the CID characters and will return them
    as is, in the form `(cid:<number>)`.

    see: https://pdfminersix.readthedocs.io/en/latest/faq.html#why-are-there-cid-x-values-in-the-textual-output

    :param text: The text to check for undecoded CID characters.
    :returns:
        A dictionary containing detection results:
        - `total_chars`: length of `text`.
        - `cid_chars`: number of characters of `text` that belong to `(cid:<number>)` sequences.
        - `percentage`: share of `cid_chars` in `total_chars` as a float rounded to two decimals
          (`0.0` for empty text).
    """
    # The pattern has no capture groups, so findall yields the full text of every match.
    matches = self.cid_pattern.findall(text)
    total_chars = len(text)
    cid_chars = sum(len(match) for match in matches)
    # Guard against division by zero on empty text; keep the value a float in every branch.
    percentage = (cid_chars / total_chars * 100) if total_chars > 0 else 0.0

    return {"total_chars": total_chars, "cid_chars": cid_chars, "percentage": round(percentage, 2)}
|
||||||
|
|
||||||
@component.output_types(documents=List[Document])
|
@component.output_types(documents=List[Document])
|
||||||
def run(
|
def run(
|
||||||
self,
|
self,
|
||||||
@ -178,6 +208,19 @@ class PDFMinerToDocument:
|
|||||||
|
|
||||||
if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
|
if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
|
||||||
merged_metadata["file_path"] = os.path.basename(file_path)
|
merged_metadata["file_path"] = os.path.basename(file_path)
|
||||||
|
|
||||||
|
analysis = self.detect_undecoded_cid_characters(text)
|
||||||
|
|
||||||
|
if analysis["percentage"] > 0:
|
||||||
|
logger.warning(
|
||||||
|
"Detected {cid_chars} undecoded CID characters in {total_chars} characters"
|
||||||
|
" ({percentage}%) in {source}.",
|
||||||
|
cid_chars=analysis["cid_chars"],
|
||||||
|
total_chars=analysis["total_chars"],
|
||||||
|
percentage=analysis["percentage"],
|
||||||
|
source=source,
|
||||||
|
)
|
||||||
|
|
||||||
document = Document(content=text, meta=merged_metadata)
|
document = Document(content=text, meta=merged_metadata)
|
||||||
documents.append(document)
|
documents.append(document)
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,5 @@
|
|||||||
|
---
|
||||||
|
enhancements:
|
||||||
|
- |
|
||||||
|
Added `PDFMinerToDocument` functionality to detect and report undecoded CID characters in PDF text extraction, helping users identify potential
|
||||||
|
text extraction quality issues when processing PDFs with non-standard fonts.
|
||||||
@ -2,6 +2,7 @@
|
|||||||
#
|
#
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
import logging
|
import logging
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
@ -185,3 +186,56 @@ class TestPDFMinerToDocument:
|
|||||||
"structure, allowing structure to emerge according to the \nneeds of the users.[1] \n\n"
|
"structure, allowing structure to emerge according to the \nneeds of the users.[1] \n\n"
|
||||||
)
|
)
|
||||||
assert docs["documents"][6].content == expected
|
assert docs["documents"][6].content == expected
|
||||||
|
|
||||||
|
def test_detect_undecoded_cid_characters(self):
    """
    Check the CID statistics reported for plain, mixed, CID-only and empty texts.
    """
    converter = PDFMinerToDocument()

    mixed_text = "Some text with (cid:123) and (cid:456) characters"
    consecutive_text = "(cid:123)(cid:456)(cid:789)"

    # Each case: (input text, expected CID character count, expected percentage)
    cases = [
        # no CID characters at all
        ("This is a normal text without any CID characters.", 0, 0),
        # two CID sequences embedded in regular text: 18 characters total
        (mixed_text, len("(cid:123)") + len("(cid:456)"), round((18 / len(mixed_text)) * 100, 2)),
        # nothing but consecutive CID sequences
        (consecutive_text, len("(cid:123)(cid:456)(cid:789)"), 100.0),
        # empty text
        ("", 0, 0),
    ]

    for text, expected_cid_chars, expected_percentage in cases:
        result = converter.detect_undecoded_cid_characters(text)
        assert result["total_chars"] == len(text)
        assert result["cid_chars"] == expected_cid_chars
        assert result["percentage"] == expected_percentage
|
||||||
|
|
||||||
|
def test_pdfminer_logs_warning_for_cid_characters(self, caplog, monkeypatch):
    """
    Check that running the converter on text containing undecoded CID characters logs a warning.
    """
    source = ByteStream(data=b"fake", meta={"file_path": "test.pdf"})

    # Replace page extraction and page-to-text conversion so no real PDF parsing takes place.
    pages_patch = patch(
        "haystack.components.converters.pdfminer.extract_pages",
        side_effect=lambda *args, **kwargs: ["mocked page"],
    )
    converter_patch = patch.object(
        PDFMinerToDocument,
        "_converter",
        side_effect=lambda *args, **kwargs: "This is text with (cid:123) and (cid:456) characters",
    )

    with pages_patch, converter_patch, caplog.at_level(logging.WARNING):
        converter = PDFMinerToDocument()
        converter.run(sources=[source])

    assert "Detected 18 undecoded CID characters in 52 characters (34.62%)" in caplog.text
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user