mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-29 07:59:27 +00:00
feat: Add store_full_path to converters (3/3) (#8585)
* Add store_full_path params
This commit is contained in:
parent
de7099e560
commit
4c8eb54049
@ -4,6 +4,8 @@
|
||||
|
||||
import copy
|
||||
import hashlib
|
||||
import os
|
||||
import warnings
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Literal, Optional, Union
|
||||
@ -49,7 +51,7 @@ class AzureOCRDocumentConverter:
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
def __init__( # pylint: disable=too-many-positional-arguments
|
||||
self,
|
||||
endpoint: str,
|
||||
api_key: Secret = Secret.from_env_var("AZURE_AI_API_KEY"),
|
||||
@ -59,6 +61,7 @@ class AzureOCRDocumentConverter:
|
||||
merge_multiple_column_headers: bool = True,
|
||||
page_layout: Literal["natural", "single_column"] = "natural",
|
||||
threshold_y: Optional[float] = 0.05,
|
||||
store_full_path: bool = True,
|
||||
):
|
||||
"""
|
||||
Creates an AzureOCRDocumentConverter component.
|
||||
@ -83,6 +86,9 @@ class AzureOCRDocumentConverter:
|
||||
The threshold, in inches, to determine if two recognized PDF elements are grouped into a
|
||||
single line. This is crucial for section headers or numbers which may be spatially separated
|
||||
from the remaining text on the horizontal axis.
|
||||
:param store_full_path:
|
||||
If True, the full path of the file is stored in the metadata of the document.
|
||||
If False, only the file name is stored.
|
||||
"""
|
||||
azure_import.check()
|
||||
|
||||
@ -97,6 +103,7 @@ class AzureOCRDocumentConverter:
|
||||
self.merge_multiple_column_headers = merge_multiple_column_headers
|
||||
self.page_layout = page_layout
|
||||
self.threshold_y = threshold_y
|
||||
self.store_full_path = store_full_path
|
||||
if self.page_layout == "single_column" and self.threshold_y is None:
|
||||
self.threshold_y = 0.05
|
||||
|
||||
@ -136,6 +143,15 @@ class AzureOCRDocumentConverter:
|
||||
azure_output.append(result.to_dict())
|
||||
|
||||
merged_metadata = {**bytestream.meta, **metadata}
|
||||
warnings.warn(
|
||||
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
|
||||
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
|
||||
"storing only file names to improve privacy.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
|
||||
if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
|
||||
merged_metadata["file_path"] = os.path.basename(file_path)
|
||||
docs = self._convert_tables_and_text(result=result, meta=merged_metadata)
|
||||
documents.extend(docs)
|
||||
|
||||
@ -158,6 +174,7 @@ class AzureOCRDocumentConverter:
|
||||
merge_multiple_column_headers=self.merge_multiple_column_headers,
|
||||
page_layout=self.page_layout,
|
||||
threshold_y=self.threshold_y,
|
||||
store_full_path=self.store_full_path,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import io
|
||||
import os
|
||||
import warnings
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
@ -99,6 +100,7 @@ class PyPDFToDocument:
|
||||
layout_mode_scale_weight: float = 1.25,
|
||||
layout_mode_strip_rotated: bool = True,
|
||||
layout_mode_font_height_weight: float = 1.0,
|
||||
store_full_path: bool = True,
|
||||
):
|
||||
"""
|
||||
Create a PyPDFToDocument component.
|
||||
@ -131,6 +133,9 @@ class PyPDFToDocument:
|
||||
:param layout_mode_font_height_weight:
|
||||
Multiplier for font height when calculating blank line height.
|
||||
Ignored if `extraction_mode` is `PyPDFExtractionMode.PLAIN`.
|
||||
:param store_full_path:
|
||||
If True, the full path of the file is stored in the metadata of the document.
|
||||
If False, only the file name is stored.
|
||||
"""
|
||||
pypdf_import.check()
|
||||
|
||||
@ -142,6 +147,7 @@ class PyPDFToDocument:
|
||||
warnings.warn(msg, DeprecationWarning)
|
||||
|
||||
self.converter = converter
|
||||
self.store_full_path = store_full_path
|
||||
|
||||
if isinstance(extraction_mode, str):
|
||||
extraction_mode = PyPDFExtractionMode.from_str(extraction_mode)
|
||||
@ -170,6 +176,7 @@ class PyPDFToDocument:
|
||||
layout_mode_scale_weight=self.layout_mode_scale_weight,
|
||||
layout_mode_strip_rotated=self.layout_mode_strip_rotated,
|
||||
layout_mode_font_height_weight=self.layout_mode_font_height_weight,
|
||||
store_full_path=self.store_full_path,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
@ -255,6 +262,14 @@ class PyPDFToDocument:
|
||||
)
|
||||
|
||||
merged_metadata = {**bytestream.meta, **metadata}
|
||||
warnings.warn(
|
||||
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
|
||||
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
|
||||
"storing only file names to improve privacy.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
|
||||
merged_metadata["file_path"] = os.path.basename(file_path)
|
||||
document.meta = merged_metadata
|
||||
documents.append(document)
|
||||
|
||||
|
||||
@ -0,0 +1,8 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Added a new `store_full_path` parameter to the `__init__` methods of `PyPDFToDocument` and `AzureOCRDocumentConverter`. The default value is `True`, which stores the full file path in the metadata of the output documents. When set to `False`, only the file name is stored.
|
||||
|
||||
deprecations:
|
||||
- |
|
||||
The default value of the `store_full_path` parameter in `PyPDFToDocument` and `AzureOCRDocumentConverter` will change to `False` in Haystack 2.9.0 to enhance privacy.
|
||||
@ -105,6 +105,7 @@ class TestAzureOCRDocumentConverter:
|
||||
"page_layout": "natural",
|
||||
"preceding_context_len": 3,
|
||||
"threshold_y": 0.05,
|
||||
"store_full_path": True,
|
||||
},
|
||||
}
|
||||
|
||||
@ -278,6 +279,9 @@ class TestAzureOCRDocumentConverter:
|
||||
@pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_ENDPOINT", None), reason="Azure endpoint not available")
|
||||
@pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_API_KEY", None), reason="Azure credentials not available")
|
||||
def test_run_with_docx_file(self, test_files_path):
|
||||
"""
|
||||
Test if the component runs correctly with a docx file.
|
||||
"""
|
||||
component = AzureOCRDocumentConverter(
|
||||
endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"], api_key=Secret.from_env_var("CORE_AZURE_CS_API_KEY")
|
||||
)
|
||||
@ -288,6 +292,21 @@ class TestAzureOCRDocumentConverter:
|
||||
assert "Now we are in Page 2" in documents[0].content
|
||||
assert "Page 3 was empty this is page 4" in documents[0].content
|
||||
|
||||
@pytest.mark.integration
|
||||
@pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_ENDPOINT", None), reason="Azure endpoint not available")
|
||||
@pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_API_KEY", None), reason="Azure credentials not available")
|
||||
def test_run_with_store_full_path_false(self, test_files_path):
|
||||
component = AzureOCRDocumentConverter(
|
||||
endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"],
|
||||
api_key=Secret.from_env_var("CORE_AZURE_CS_API_KEY"),
|
||||
store_full_path=False,
|
||||
)
|
||||
output = component.run(sources=[test_files_path / "docx" / "sample_docx.docx"])
|
||||
documents = output["documents"]
|
||||
assert len(documents) == 1
|
||||
assert "Sample Docx File" in documents[0].content
|
||||
assert documents[0].meta["file_path"] == "sample_docx.docx"
|
||||
|
||||
@patch("haystack.utils.auth.EnvVarSecret.resolve_value")
|
||||
def test_hashing_dataframe(self, mock_resolve_value):
|
||||
mock_resolve_value.return_value = "test_api_key"
|
||||
|
||||
@ -80,11 +80,12 @@ class TestPyPDFToDocument:
|
||||
"layout_mode_scale_weight": 1.25,
|
||||
"layout_mode_strip_rotated": True,
|
||||
"layout_mode_font_height_weight": 1.0,
|
||||
"store_full_path": True,
|
||||
},
|
||||
}
|
||||
|
||||
def test_to_dict_custom_converter(self):
|
||||
pypdf_component = PyPDFToDocument(converter=CustomConverter())
|
||||
pypdf_component = PyPDFToDocument(converter=CustomConverter(), store_full_path=False)
|
||||
data = pypdf_component.to_dict()
|
||||
assert data == {
|
||||
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
|
||||
@ -100,6 +101,7 @@ class TestPyPDFToDocument:
|
||||
"layout_mode_scale_weight": 1.25,
|
||||
"layout_mode_strip_rotated": True,
|
||||
"layout_mode_font_height_weight": 1.0,
|
||||
"store_full_path": False,
|
||||
},
|
||||
}
|
||||
|
||||
@ -214,6 +216,25 @@ class TestPyPDFToDocument:
|
||||
assert output["documents"][0].meta["language"] == "it"
|
||||
assert output["documents"][1].meta["language"] == "it"
|
||||
|
||||
def test_run_with_store_full_path_false(self, test_files_path):
|
||||
"""
|
||||
Test if the component runs correctly with store_full_path=False
|
||||
"""
|
||||
sources = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
|
||||
converter = PyPDFToDocument(store_full_path=True)
|
||||
results = converter.run(sources=sources)
|
||||
docs = results["documents"]
|
||||
|
||||
assert len(docs) == 1
|
||||
assert docs[0].meta["file_path"] == str(sources[0])
|
||||
|
||||
converter = PyPDFToDocument(store_full_path=False)
|
||||
results = converter.run(sources=sources)
|
||||
docs = results["documents"]
|
||||
|
||||
assert len(docs) == 1
|
||||
assert docs[0].meta["file_path"] == "sample_pdf_1.pdf"
|
||||
|
||||
def test_run_error_handling(self, test_files_path, pypdf_component, caplog):
|
||||
"""
|
||||
Test if the component correctly handles errors.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user