From a51ca19fe43e89e1e050f8a6708c025052354601 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Tue, 1 Aug 2023 11:34:52 +0200 Subject: [PATCH] feat: Add `TextFileToDocument` component (v2) (#5467) * Add TextfileToDocument component * Add docstrings * Add unit tests * Add release note file * Make use of progress bar * Add TextfileToDocument to __init__.py * Use lazy % formatting in logging functions * Remove f from non-f-string * Add TextfileToDocument to __init__.py * Use correct dependency extra * Compare file path against path object * PR feedback * PR feedback * Update haystack/preview/components/file_converters/txt.py Co-authored-by: Daria Fokina * Update docstrings * Add error handling * Add unit test * Reintroduce falsely removed caplog --------- Co-authored-by: Daria Fokina --- haystack/preview/components/__init__.py | 1 + .../components/file_converters/__init__.py | 1 + .../preview/components/file_converters/txt.py | 245 +++++++++++++++++ ...tfile-to-document-v2-341987623765ec95.yaml | 4 + .../components/file_converters/__init__.py | 0 .../test_textfile_to_document.py | 256 ++++++++++++++++++ test/preview/test_files/txt/doc_1.txt | 2 + test/preview/test_files/txt/doc_2.txt | 3 + 8 files changed, 512 insertions(+) create mode 100644 haystack/preview/components/file_converters/__init__.py create mode 100644 haystack/preview/components/file_converters/txt.py create mode 100644 releasenotes/notes/textfile-to-document-v2-341987623765ec95.yaml create mode 100644 test/preview/components/file_converters/__init__.py create mode 100644 test/preview/components/file_converters/test_textfile_to_document.py create mode 100644 test/preview/test_files/txt/doc_1.txt create mode 100644 test/preview/test_files/txt/doc_2.txt diff --git a/haystack/preview/components/__init__.py b/haystack/preview/components/__init__.py index 889df06cc..aef8809c9 100644 --- a/haystack/preview/components/__init__.py +++ b/haystack/preview/components/__init__.py @@ -1,2 +1,3 @@ from haystack.preview.components.audio.whisper_local import LocalWhisperTranscriber from haystack.preview.components.audio.whisper_remote import RemoteWhisperTranscriber +from haystack.preview.components.file_converters import TextFileToDocument diff --git a/haystack/preview/components/file_converters/__init__.py b/haystack/preview/components/file_converters/__init__.py new file mode 100644 index 000000000..8663922cd --- /dev/null +++ b/haystack/preview/components/file_converters/__init__.py @@ -0,0 +1 @@ +from haystack.preview.components.file_converters.txt import TextFileToDocument diff --git a/haystack/preview/components/file_converters/txt.py b/haystack/preview/components/file_converters/txt.py new file mode 100644 index 000000000..f2d05984f --- /dev/null +++ b/haystack/preview/components/file_converters/txt.py @@ -0,0 +1,245 @@ +import logging +from pathlib import Path +from typing import Optional, List, Union, Dict + +from canals.errors import PipelineRuntimeError +from tqdm import tqdm + +from haystack import Document +from haystack.lazy_imports import LazyImport +from haystack.preview import component + +with LazyImport("Run 'pip install farm-haystack[preprocessing]'") as langdetect_import: + import langdetect + + +logger = logging.getLogger(__name__) + + +@component +class TextFileToDocument: + """ + A component for converting a text file to a Document. + """ + + @component.input + def input(self): + class Input: + """ + Input data for the TextFileToDocument component. + + :param paths: A list of paths to text files. 
+            :param meta: Optional metadata to attach to the Documents. If a list is provided, the length of the list
+                         must match the number of paths.
+                         Default: `None`
+            :param encoding: The encoding of the text files. Default: `"utf-8"`
+            :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
+                                          The tabular structures in documents might be noise for reader models if they
+                                          don't have table parsing capability for finding answers. However, tables
+                                          may also have long strings that could be possible candidates for answers.
+                                          Rows containing strings are therefore retained when this option is enabled.
+                                          Default: `False`
+            :param valid_languages: Validate languages from a list of languages specified in the [ISO 639-1 format](https://en.wikipedia.org/wiki/ISO_639-1).
+                                    This option can be used to add a test for encoding errors. If the extracted text is
+                                    not one of the valid languages, then there might be an encoding error resulting
+                                    in garbled text.
+                                    Default: `None`
+            :param id_hash_keys: Generate the Document ID from a custom list of strings that refer to the Document's
+                                 attributes. If you want to ensure you don't have duplicate Documents in your
+                                 DocumentStore but texts are not unique, you can modify the metadata and pass e.g.
+                                 `"meta"` to this field (for example `["content", "meta"]`).
+                                 In this case the ID will be generated by using the content and the defined metadata.
+                                 Default: `None`
+            :param progress_bar: Whether to show a progress bar for the conversion process.
+                                 Default: `True`
+            """
+
+            paths: List[Union[str, Path]]
+            meta: Optional[Union[Dict, List[Dict]]]
+            encoding: Optional[str]
+            remove_numeric_tables: Optional[bool]
+            valid_languages: Optional[List[str]]
+            id_hash_keys: Optional[List[str]]
+            progress_bar: Optional[bool]
+
+        return Input
+
+    @component.output
+    def output(self):
+        class Output:
+            """
+            Output data from the TextFileToDocument component.
+
+            :param documents: The converted documents.
+            """
+
+            documents: List[Document]
+
+        return Output
+
+    def __init__(
+        self,
+        encoding: str = "utf-8",
+        remove_numeric_tables: bool = False,
+        numeric_row_threshold: float = 0.4,
+        valid_languages: Optional[List[str]] = None,
+        id_hash_keys: Optional[List[str]] = None,
+        progress_bar: bool = True,
+    ):
+        """
+        Create a TextFileToDocument component.
+
+        :param encoding: The encoding of the text files. Default: `"utf-8"`
+        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
+                                      The tabular structures in documents might be noise for reader models if they
+                                      don't have table parsing capability for finding answers. However, tables
+                                      may also have long strings that could be possible candidates for answers.
+                                      Rows containing strings are therefore retained when this option is enabled.
+                                      Default: `False`
+        :param numeric_row_threshold: Applicable if `remove_numeric_tables` is set to `True`. This is the threshold to
+                                      determine if a line in the provided text file is a numeric table row or not.
+                                      The value is the ratio of numeric words to the total number of words in a line.
+        :param valid_languages: Validate languages from a list of languages specified in the [ISO 639-1 format](https://en.wikipedia.org/wiki/ISO_639-1).
+                                This option can be used to add a test for encoding errors. If the extracted text is
+                                not one of the valid languages, then there might be an encoding error resulting
+                                in garbled text.
+ Default: `None` + :param id_hash_keys: Generate the Document ID from a custom list of strings that refer to the Document's + attributes. If you want to ensure you don't have duplicate Documents in your DocumentStore + but texts are not unique, you can modify the metadata and pass e.g. `"meta"` to this field + (for example `["content", "meta"]`). In this case the ID will be generated by using the + content and the defined metadata. Default: `None` + :param progress_bar: Whether to show a progress bar for the conversion process. + Default: `True` + """ + langdetect_import.check() + + self.defaults = { + "encoding": encoding, + "remove_numeric_tables": remove_numeric_tables, + "valid_languages": valid_languages, + "id_hash_keys": id_hash_keys, + "progress_bar": progress_bar, + } + self.numeric_row_threshold = numeric_row_threshold + + def run(self, data): + """ + Convert text files to Documents. + + :param data: Input data for the TextFileToDocument component. + """ + file_paths = data.paths + metas = TextFileToDocument._prepare_metadata(data.meta, file_paths) + + documents = [] + for path, meta in tqdm( + zip(file_paths, metas), total=len(file_paths), desc="Converting text files", disable=not data.progress_bar + ): + try: + text = self._read_and_clean_file( + path=path, encoding=data.encoding, remove_numeric_tables=data.remove_numeric_tables + ) + except Exception as e: + logger.warning("Could not read file %s. Skipping it. Error message: %s", path, e) + continue + + if data.valid_languages is not None and not TextFileToDocument._validate_language( + text, data.valid_languages + ): + logger.warning( + "Text from file %s is not in one of the valid languages: %s. " + "The file may have been decoded incorrectly.", + path, + data.valid_languages, + ) + + document = Document(content=text, meta=meta, id_hash_keys=data.id_hash_keys) + documents.append(document) + + return self.output(documents=documents) + + @staticmethod + def _prepare_metadata(meta: Optional[Union[Dict, List[Dict]]], file_paths: List[Union[str, Path]]) -> List[Dict]: + """ + Prepare the metadata for the Documents. + + :param meta: The metadata for the Documents. + :param file_paths: The paths to the text files. + """ + if meta is None: + return [{"file_path": str(path)} for path in file_paths] + + if isinstance(meta, dict): + meta = [meta] * len(file_paths) + + if len(meta) != len(file_paths): + raise PipelineRuntimeError( + f"The number of meta entries must match the number of paths if meta is a list. " + f"Number of paths: {len(file_paths)}, number of meta entries: {len(meta)}." + ) + + return [{**m, "file_path": m.get("file_path", str(path))} for m, path in zip(meta, file_paths)] + + def _read_and_clean_file(self, path: Union[str, Path], encoding: str, remove_numeric_tables: bool) -> str: + """ + Read and clean the text file. + + :param path: The path to the text file. + :param encoding: The encoding of the text file. + :param remove_numeric_tables: Whether to remove numeric tables. + + :return: The text of the file cleaned from numeric tables if `remove_numeric_tables` is `True`. 
+        """
+        if not Path(path).exists():
+            raise PipelineRuntimeError(f"File at path {path} does not exist.")
+
+        with open(path, encoding=encoding) as file:
+            text = file.read()
+            pages = text.split("\f")
+            cleaned_pages = [self._clean_page(page, remove_numeric_tables) for page in pages]
+            return "\f".join(cleaned_pages)
+
+    def _clean_page(self, page: str, remove_numeric_tables: bool) -> str:
+        """
+        Clean a page of text from numeric tables if `remove_numeric_tables` is `True`.
+
+        :param page: The content of a page of a text file.
+        :param remove_numeric_tables: Whether to remove numeric tables.
+
+        :return: The text from the page cleaned from numeric tables if `remove_numeric_tables` is `True`.
+        """
+        cleaned_lines = page.splitlines()
+        if remove_numeric_tables:
+            cleaned_lines = [line for line in cleaned_lines if not self._is_numeric_row(line)]
+
+        return "\n".join(cleaned_lines)
+
+    def _is_numeric_row(self, line: str) -> bool:
+        """
+        Check if a line of a text file is a numeric row. A line is considered a numeric row if the ratio of numeric
+        words exceeds `numeric_row_threshold` (0.4 by default) and the line does not end with a period.
+
+        :param line: The content of a line of a text file.
+        """
+        words = line.split()
+        digits = [word for word in words if any(char.isdigit() for char in word)]
+        return bool(words) and len(digits) / len(words) > self.numeric_row_threshold and not line.strip().endswith(".")
+
+    @staticmethod
+    def _validate_language(text: str, valid_languages: List[str]) -> bool:
+        """
+        Validate if the detected language of the text is one of the valid languages.
+
+        :param text: The text to validate.
+        :param valid_languages: A list of valid languages.
+        """
+        if not valid_languages:
+            return True
+
+        try:
+            lang = langdetect.detect(text)
+        except langdetect.lang_detect_exception.LangDetectException:
+            lang = None
+
+        return lang in valid_languages
diff --git a/releasenotes/notes/textfile-to-document-v2-341987623765ec95.yaml b/releasenotes/notes/textfile-to-document-v2-341987623765ec95.yaml
new file mode 100644
index 000000000..d8f4496bb
--- /dev/null
+++ b/releasenotes/notes/textfile-to-document-v2-341987623765ec95.yaml
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Add new TextFileToDocument component to Haystack v2 preview so that text files can be converted to Haystack Documents.
diff --git a/test/preview/components/file_converters/__init__.py b/test/preview/components/file_converters/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/preview/components/file_converters/test_textfile_to_document.py b/test/preview/components/file_converters/test_textfile_to_document.py
new file mode 100644
index 000000000..e2783ed32
--- /dev/null
+++ b/test/preview/components/file_converters/test_textfile_to_document.py
@@ -0,0 +1,256 @@
+import logging
+from unittest.mock import patch
+
+import pytest
+from pathlib import Path
+
+from canals.errors import PipelineRuntimeError
+from langdetect import LangDetectException
+
+from haystack.preview.components.file_converters.txt import TextFileToDocument
+from test.preview.components.base import BaseTestComponent
+from test.conftest import preview_samples_path
+
+
+class TestTextfileToDocument(BaseTestComponent):
+    @pytest.mark.unit
+    def test_run(self, preview_samples_path):
+        """
+        Test if the component runs correctly.
+ """ + file_paths = [preview_samples_path / "txt" / "doc_1.txt", preview_samples_path / "txt" / "doc_2.txt"] + converter = TextFileToDocument() + output = converter.run(data=converter.input(paths=file_paths)) + docs = output.documents + assert len(docs) == 2 + assert docs[0].content == "Some text for testing.\nTwo lines in here." + assert docs[1].content == "This is a test line.\n123 456 789\n987 654 321." + assert docs[0].meta["file_path"] == str(file_paths[0]) + assert docs[1].meta["file_path"] == str(file_paths[1]) + + @pytest.mark.unit + def test_run_warning_for_invalid_language(self, preview_samples_path, caplog): + file_path = preview_samples_path / "txt" / "doc_1.txt" + converter = TextFileToDocument() + with patch("haystack.preview.components.file_converters.txt.langdetect.detect", return_value="en"): + with caplog.at_level(logging.WARNING): + output = converter.run(data=converter.input(paths=[file_path], valid_languages=["de"])) + assert ( + f"Text from file {file_path} is not in one of the valid languages: ['de']. " + f"The file may have been decoded incorrectly." in caplog.text + ) + + docs = output.documents + assert len(docs) == 1 + assert docs[0].content == "Some text for testing.\nTwo lines in here." + + @pytest.mark.unit + def test_run_error_handling(self, preview_samples_path, caplog): + """ + Test if the component correctly handles errors. + """ + file_paths = [preview_samples_path / "txt" / "doc_1.txt", "non_existing_file.txt"] + converter = TextFileToDocument() + with caplog.at_level(logging.WARNING): + output = converter.run(data=converter.input(paths=file_paths)) + assert ( + "Could not read file non_existing_file.txt. Skipping it. Error message: File at path non_existing_file.txt does not exist." + in caplog.text + ) + docs = output.documents + assert len(docs) == 1 + assert docs[0].meta["file_path"] == str(file_paths[0]) + + @pytest.mark.unit + def test_prepare_metadata_no_metadata(self): + """ + Test if the metadata is correctly prepared when no custom metadata is provided. + """ + converter = TextFileToDocument() + meta = converter._prepare_metadata( + meta=None, file_paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt")] + ) + assert len(meta) == 2 + assert meta[0]["file_path"] == "data/sample_path_1.txt" + assert meta[1]["file_path"] == str(Path("data/sample_path_2.txt")) + + @pytest.mark.unit + def test_prepare_metadata_single_dict(self): + """ + Test if the metadata is correctly prepared when a single dict is provided. + """ + converter = TextFileToDocument() + meta = converter._prepare_metadata( + meta={"name": "test"}, file_paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt")] + ) + assert len(meta) == 2 + assert meta[0]["file_path"] == "data/sample_path_1.txt" + assert meta[1]["file_path"] == str(Path("data/sample_path_2.txt")) + assert meta[0]["name"] == "test" + assert meta[1]["name"] == "test" + + @pytest.mark.unit + def test_prepare_metadata_list_of_dicts(self): + """ + Test if the metadata is correctly prepared when a list of dicts is provided. 
+ """ + converter = TextFileToDocument() + meta = converter._prepare_metadata( + meta=[{"name": "test1"}, {"name": "test2"}], + file_paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt")], + ) + assert len(meta) == 2 + assert meta[0]["file_path"] == "data/sample_path_1.txt" + assert meta[1]["file_path"] == str(Path("data/sample_path_2.txt")) + assert meta[0]["name"] == "test1" + assert meta[1]["name"] == "test2" + + @pytest.mark.unit + def test_prepare_metadata_unmatching_list_len(self): + """ + Test if an error is raised when the number of metadata dicts is not equal to the number of + file paths. + """ + converter = TextFileToDocument() + with pytest.raises( + PipelineRuntimeError, match="The number of meta entries must match the number of paths if meta is a list." + ): + converter._prepare_metadata( + meta=[{"name": "test1"}, {"name": "test2"}], + file_paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt"), "data/sample_path_3.txt"], + ) + + @pytest.mark.unit + def test_read_and_clean_file(self, preview_samples_path): + """ + Test if the file is correctly read. + """ + file_path = preview_samples_path / "txt" / "doc_1.txt" + converter = TextFileToDocument() + text = converter._read_and_clean_file(path=file_path, encoding="utf-8", remove_numeric_tables=False) + assert text == "Some text for testing.\nTwo lines in here." + + @pytest.mark.unit + def test_read_and_clean_file_non_existing_file(self): + """ + Test if an error is raised when the file does not exist. + """ + converter = TextFileToDocument() + file_path = "non_existing_file.txt" + with pytest.raises(PipelineRuntimeError, match=f"File at path {file_path} does not exist."): + converter._read_and_clean_file(path=file_path, encoding="utf-8", remove_numeric_tables=False) + + @pytest.mark.unit + def test_read_and_clean_file_remove_numeric_tables(self, preview_samples_path): + """ + Test if the file is correctly read and numeric tables are removed. + """ + file_path = preview_samples_path / "txt" / "doc_2.txt" + converter = TextFileToDocument() + text = converter._read_and_clean_file(path=file_path, encoding="utf-8", remove_numeric_tables=True) + assert text == "This is a test line.\n987 654 321." + + @pytest.mark.unit + def test_clean_page_without_remove_numeric_tables(self): + """ + Test if the page is not changed when remove_numeric_tables is False. + """ + converter = TextFileToDocument() + page = "This is a test line.\n123 456 789" + cleaned_page = converter._clean_page(page=page, remove_numeric_tables=False) + assert cleaned_page == page + + @pytest.mark.unit + def test_clean_page_with_remove_numeric_tables(self): + """ + Test if the page is correctly cleaned when remove_numeric_tables is True. + """ + converter = TextFileToDocument() + page = "This is a test line.\n123 456 789" + cleaned_page = converter._clean_page(page=page, remove_numeric_tables=True) + assert cleaned_page == "This is a test line." + + @pytest.mark.unit + def test_is_numeric_row_only_numbers(self): + """ + Test if the line is correctly identified as a numeric row when it only contains numbers. + """ + converter = TextFileToDocument() + line = "123 456 789" + assert converter._is_numeric_row(line=line) + + @pytest.mark.unit + def test_is_numeric_row_only_text(self): + """ + Test if the line is correctly identified as a non-numeric row when it only contains text. + """ + converter = TextFileToDocument() + line = "This is a test line." 
+        assert not converter._is_numeric_row(line=line)
+
+    @pytest.mark.unit
+    def test_is_numeric_row_only_numbers_with_period(self):
+        """
+        Test if the line is correctly identified as a non-numeric row when it only contains numbers and a period at
+        the end.
+        """
+        converter = TextFileToDocument()
+        line = "123 456 789."
+        assert not converter._is_numeric_row(line=line)
+
+    @pytest.mark.unit
+    def test_is_numeric_row_more_numbers_than_text(self):
+        """
+        Test if the line is correctly identified as a numeric row when more than 40% of its words are numeric.
+        """
+        converter = TextFileToDocument()
+        line = "123 456 789 This is a test"
+        assert converter._is_numeric_row(line=line)
+
+    @pytest.mark.unit
+    def test_is_numeric_row_less_numbers_than_text(self):
+        """
+        Test if the line is correctly identified as a non-numeric row when less than 40% of its words are numeric.
+        """
+        converter = TextFileToDocument()
+        line = "123 456 789 This is a test line"
+        assert not converter._is_numeric_row(line=line)
+
+    @pytest.mark.unit
+    def test_is_numeric_row_words_consist_of_numbers_and_text(self):
+        """
+        Test if the line is correctly identified as a numeric row when the words consist of numbers and text.
+        """
+        converter = TextFileToDocument()
+        line = "123eur 456usd"
+        assert converter._is_numeric_row(line=line)
+
+    @pytest.mark.unit
+    def test_validate_language(self):
+        """
+        Test if the language is correctly validated.
+        """
+        converter = TextFileToDocument()
+        with patch("haystack.preview.components.file_converters.txt.langdetect.detect", return_value="en"):
+            assert converter._validate_language(text="This is an english text.", valid_languages=["en"])
+            assert not converter._validate_language(text="This is an english text.", valid_languages=["de"])
+
+    @pytest.mark.unit
+    def test_validate_language_no_languages_specified(self):
+        """
+        Test if _validate_language returns True when no languages are specified.
+        """
+        converter = TextFileToDocument()
+        assert converter._validate_language(text="This is an english test.", valid_languages=[])
+
+    @pytest.mark.unit
+    def test_validate_language_lang_detect_exception(self):
+        """
+        Test if _validate_language returns False when langdetect throws an exception.
+        """
+        converter = TextFileToDocument()
+        with patch(
+            "haystack.preview.components.file_converters.txt.langdetect.detect",
+            side_effect=LangDetectException(code=0, message="Test"),
+        ):
+            assert not converter._validate_language(text="This is an english text.", valid_languages=["en"])
diff --git a/test/preview/test_files/txt/doc_1.txt b/test/preview/test_files/txt/doc_1.txt
new file mode 100644
index 000000000..412189080
--- /dev/null
+++ b/test/preview/test_files/txt/doc_1.txt
@@ -0,0 +1,2 @@
+Some text for testing.
+Two lines in here.
diff --git a/test/preview/test_files/txt/doc_2.txt b/test/preview/test_files/txt/doc_2.txt
new file mode 100644
index 000000000..6f950eedc
--- /dev/null
+++ b/test/preview/test_files/txt/doc_2.txt
@@ -0,0 +1,3 @@
+This is a test line.
+123 456 789
+987 654 321.
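Usage sketch: the snippet below drives the new component stand-alone the same way `test_run` does, by building the input with `converter.input(...)` and passing it to `converter.run(...)`. The file paths are placeholders; everything else mirrors the API introduced in this patch.

```python
from haystack.preview.components.file_converters import TextFileToDocument

# Build the converter with its defaults (utf-8 encoding, numeric table rows kept).
converter = TextFileToDocument()

# Convert two text files into Haystack Documents in one call.
output = converter.run(data=converter.input(paths=["doc_1.txt", "doc_2.txt"]))

for document in output.documents:
    # The source path is recorded in each Document's metadata.
    print(document.meta["file_path"], document.content)
```

The values passed to `__init__` land in `self.defaults`, which the v2 preview pipeline machinery is meant to use to fill in any run-time inputs that are not supplied explicitly.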