From 1168f6365d0e3eab543dec356fddefac7ea7d321 Mon Sep 17 00:00:00 2001 From: tstadel <60758086+tstadel@users.noreply.github.com> Date: Fri, 24 Jun 2022 09:55:09 +0200 Subject: [PATCH] Fix using id_hash_keys as pipeline params (#2717) * Fix using id_hash_keys as pipeline params * Update Documentation & Code Style * add tests Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/_src/api/api/file_converter.md | 6 +++++- docs/_src/api/api/preprocessor.md | 2 +- haystack/nodes/file_converter/base.py | 20 ++++++++++++++++---- haystack/nodes/preprocessor/base.py | 5 +++++ test/nodes/test_file_converter.py | 16 ++++++++++++++++ test/nodes/test_preprocessor.py | 14 ++++++++++++++ 6 files changed, 57 insertions(+), 6 deletions(-) diff --git a/docs/_src/api/api/file_converter.md b/docs/_src/api/api/file_converter.md index f6ef56fc5..a4ac939d6 100644 --- a/docs/_src/api/api/file_converter.md +++ b/docs/_src/api/api/file_converter.md @@ -86,7 +86,7 @@ Validate if the language of the text is one of valid languages. #### BaseConverter.run ```python -def run(file_paths: Union[Path, List[Path]], meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None, remove_numeric_tables: Optional[bool] = None, known_ligatures: Dict[str, str] = KNOWN_LIGATURES, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8") +def run(file_paths: Union[Path, List[Path]], meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None, remove_numeric_tables: Optional[bool] = None, known_ligatures: Dict[str, str] = KNOWN_LIGATURES, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None) ``` Extract text from a file. @@ -114,6 +114,10 @@ This option can be used to add test for encoding errors. If the extracted text i not one of the valid languages, then it might likely be encoding error resulting in garbled text. - `encoding`: Select the file encoding (default is `UTF-8`) +- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's +attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are +not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). +In this case the id will be generated by using the content and the defined metadata. diff --git a/docs/_src/api/api/preprocessor.md b/docs/_src/api/api/preprocessor.md index 852099be9..160b3e51a 100644 --- a/docs/_src/api/api/preprocessor.md +++ b/docs/_src/api/api/preprocessor.md @@ -16,7 +16,7 @@ class BasePreProcessor(BaseComponent) ```python @abstractmethod -def process(documents: Union[dict, Document, List[Union[dict, Document]]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, remove_substrings: List[str] = [], split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True) -> List[Document] +def process(documents: Union[dict, Document, List[Union[dict, Document]]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, remove_substrings: List[str] = [], split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True, id_hash_keys: Optional[List[str]] = None) -> List[Document] ``` Perform document cleaning and splitting. Takes a single Document or a List of Documents as input and returns a diff --git a/haystack/nodes/file_converter/base.py b/haystack/nodes/file_converter/base.py index feaf66475..cd5e96378 100644 --- a/haystack/nodes/file_converter/base.py +++ b/haystack/nodes/file_converter/base.py @@ -137,6 +137,7 @@ class BaseConverter(BaseComponent): known_ligatures: Dict[str, str] = KNOWN_LIGATURES, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", + id_hash_keys: Optional[List[str]] = None, ): """ Extract text from a file. @@ -162,6 +163,10 @@ class BaseConverter(BaseComponent): not one of the valid languages, then it might likely be encoding error resulting in garbled text. :param encoding: Select the file encoding (default is `UTF-8`) + :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's + attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are + not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). + In this case the id will be generated by using the content and the defined metadata. """ if isinstance(file_paths, Path): @@ -178,6 +183,7 @@ class BaseConverter(BaseComponent): remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, encoding=encoding, + id_hash_keys=id_hash_keys, ): documents.append(doc) @@ -192,14 +198,20 @@ class BaseConverter(BaseComponent): def run_batch( # type: ignore self, - file_paths: Union[Path, List[Path]], # type: ignore - meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None, # type: ignore - remove_numeric_tables: Optional[bool] = None, # type: ignore - valid_languages: Optional[List[str]] = None, # type: ignore + file_paths: Union[Path, List[Path]], + meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None, + remove_numeric_tables: Optional[bool] = None, + known_ligatures: Dict[str, str] = KNOWN_LIGATURES, + valid_languages: Optional[List[str]] = None, + encoding: Optional[str] = "UTF-8", + id_hash_keys: Optional[List[str]] = None, ): return self.run( file_paths=file_paths, meta=meta, remove_numeric_tables=remove_numeric_tables, + known_ligatures=known_ligatures, valid_languages=valid_languages, + encoding=encoding, + id_hash_keys=id_hash_keys, ) diff --git a/haystack/nodes/preprocessor/base.py b/haystack/nodes/preprocessor/base.py index 774d4a66d..61bd7667e 100644 --- a/haystack/nodes/preprocessor/base.py +++ b/haystack/nodes/preprocessor/base.py @@ -20,6 +20,7 @@ class BasePreProcessor(BaseComponent): split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True, + id_hash_keys: Optional[List[str]] = None, ) -> List[Document]: """ Perform document cleaning and splitting. Takes a single Document or a List of Documents as input and returns a @@ -59,6 +60,7 @@ class BasePreProcessor(BaseComponent): split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None, + id_hash_keys: Optional[List[str]] = None, ): processed_documents = self.process( documents=documents, @@ -69,6 +71,7 @@ class BasePreProcessor(BaseComponent): split_length=split_length, split_overlap=split_overlap, split_respect_sentence_boundary=split_respect_sentence_boundary, + id_hash_keys=id_hash_keys, ) result = {"documents": processed_documents} return result, "output_1" @@ -83,6 +86,7 @@ class BasePreProcessor(BaseComponent): split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None, + id_hash_keys: Optional[List[str]] = None, ): return self.run( documents=documents, @@ -93,4 +97,5 @@ class BasePreProcessor(BaseComponent): split_length=split_length, split_overlap=split_overlap, split_respect_sentence_boundary=split_respect_sentence_boundary, + id_hash_keys=id_hash_keys, ) diff --git a/test/nodes/test_file_converter.py b/test/nodes/test_file_converter.py index 0ea44fc22..6f0d0c9b4 100644 --- a/test/nodes/test_file_converter.py +++ b/test/nodes/test_file_converter.py @@ -13,6 +13,7 @@ from haystack.nodes import ( TikaConverter, AzureConverter, ParsrConverter, + TextConverter, ) from ..conftest import SAMPLES_PATH @@ -172,3 +173,18 @@ def test_parsr_converter(): assert docs[1].content_type == "text" assert docs[1].content.startswith("A sample PDF file") assert docs[1].content.endswith("Page 4 of Sample PDF\n… the page 3 is empty.") + + +def test_id_hash_keys_from_pipeline_params(): + doc_path = SAMPLES_PATH / "docs" / "doc_1.txt" + meta_1 = {"key": "a"} + meta_2 = {"key": "b"} + meta = [meta_1, meta_2] + + converter = TextConverter() + output, _ = converter.run(file_paths=[doc_path, doc_path], meta=meta, id_hash_keys=["content", "meta"]) + documents = output["documents"] + unique_ids = set(d.id for d in documents) + + assert len(documents) == 2 + assert len(unique_ids) == 2 diff --git a/test/nodes/test_preprocessor.py b/test/nodes/test_preprocessor.py index 942d7ec41..685900249 100644 --- a/test/nodes/test_preprocessor.py +++ b/test/nodes/test_preprocessor.py @@ -113,3 +113,17 @@ def test_remove_substrings(): assert "🪲" not in documents[0].content assert "whitespace" in documents[0].content assert "✨" in documents[0].content + + +def test_id_hash_keys_from_pipeline_params(): + document_1 = Document(content="This is a document.", meta={"key": "a"}) + document_2 = Document(content="This is a document.", meta={"key": "b"}) + assert document_1.id == document_2.id + + preprocessor = PreProcessor(split_length=2, split_respect_sentence_boundary=False) + output, _ = preprocessor.run(documents=[document_1, document_2], id_hash_keys=["content", "meta"]) + documents = output["documents"] + unique_ids = set(d.id for d in documents) + + assert len(documents) == 4 + assert len(unique_ids) == 4