Standardize TextFileToDocument (#6232)

* simplify textfiletodocument

* fix error handling and tests

* stray print

* reno

* streams->sources

* reno

* feedback

* test

* fix tests
ZanSara 2023-11-17 14:39:39 +00:00 committed by GitHub
parent c26a932423
commit e888852aec
4 changed files with 76 additions and 402 deletions


@@ -1,15 +1,9 @@
import logging
from pathlib import Path
from typing import Optional, List, Union, Dict
from typing import List, Union
from canals.errors import PipelineRuntimeError
from tqdm import tqdm
from haystack.preview.lazy_imports import LazyImport
from haystack.preview import Document, component
with LazyImport("Run 'pip install langdetect'") as langdetect_import:
import langdetect
from haystack.preview.dataclasses import ByteStream
logger = logging.getLogger(__name__)
@@ -21,189 +15,42 @@ class TextFileToDocument:
A component for converting a text file to a Document.
"""
def __init__(
self,
encoding: str = "utf-8",
remove_numeric_tables: bool = False,
numeric_row_threshold: float = 0.4,
valid_languages: Optional[List[str]] = None,
progress_bar: bool = True,
):
def __init__(self, encoding: str = "utf-8"):
"""
Create a TextFileToDocument component.
:param encoding: The encoding of the text files. Default: `"utf-8"`
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
The tabular structures in documents might be noise for reader models if they don't have table parsing
capability for finding answers. However, tables may also have long strings that could be possible candidates
for answers. The rows containing strings are thus retained in this option. Default: `False`
:param numeric_row_threshold: Applicable if `remove_numeric_tables` is set to `True`. This is the threshold to
determine if a line in the provided text file is a numeric table row or not. The value is the ratio of
numeric words to the total number of words in a line. Default: `0.4`
:param valid_languages: Validate languages from a list of languages specified in the
[ISO 639-1 format](https://en.wikipedia.org/wiki/ISO_639-1). This option can be used to add a test for
encoding errors. If the extracted text is not one of the valid languages, then there might be an encoding
error resulting in garbled text. Default: `None`
:param progress_bar: Whether to show a progress bar for the conversion process. Default: `True`
:param encoding: The default encoding of the text files. Default: `"utf-8"`.
Note that if the encoding is specified in the metadata of a ByteStream,
it will override this default.
"""
langdetect_import.check()
self.encoding = encoding
self.remove_numeric_tables = remove_numeric_tables
self.numeric_row_threshold = numeric_row_threshold
self.valid_languages = valid_languages or []
self.progress_bar = progress_bar
@component.output_types(documents=List[Document])
def run(
self,
paths: List[Union[str, Path]],
metadata: Optional[Union[Dict, List[Dict]]] = None,
encoding: Optional[str] = None,
remove_numeric_tables: Optional[bool] = None,
numeric_row_threshold: Optional[float] = None,
valid_languages: Optional[List[str]] = None,
progress_bar: Optional[bool] = None,
):
def run(self, sources: List[Union[str, Path, ByteStream]]):
"""
Convert text files to Documents.
:param paths: A list of paths to text files.
:param metadata: Optional metadata to attach to the Documents. If a list is provided, the length of the list
must match the number of paths. Default: `None`
:param encoding: The encoding of the text files. Default: `"utf-8"`
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
The tabular structures in documents might be noise for reader models if they don't have table parsing
capability for finding answers. However, tables may also have long strings that could be possible candidates
for answers. The rows containing strings are thus retained in this option. Default: `False`
:param numeric_row_threshold: Applicable if `remove_numeric_tables` is set to `True`. This is the threshold to
determine if a line in the provided text file is a numeric table row or not. The value is the ratio of
numeric words to the total number of words in a line. Default: `0.4`
:param valid_languages: Validate languages from a list of languages specified in the
[ISO 639-1 format](https://en.wikipedia.org/wiki/ISO_639-1). This option can be used to add a test for
encoding errors. If the extracted text is not one of the valid languages, then there might be an encoding
error resulting in garbled text. Default: `None`
:param progress_bar: Whether to show a progress bar for the conversion process. Default: `True`
:param sources: A list of paths to text files or ByteStream objects.
Note that if an encoding is specified in the metadata of a ByteStream,
it will override the component's default.
:return: A dictionary containing the converted documents.
"""
if encoding is None:
encoding = self.encoding
if remove_numeric_tables is None:
remove_numeric_tables = self.remove_numeric_tables
if numeric_row_threshold is None:
numeric_row_threshold = self.numeric_row_threshold
if valid_languages is None:
valid_languages = self.valid_languages
if progress_bar is None:
progress_bar = self.progress_bar
metas = TextFileToDocument._prepare_metadata(metadata, paths)
documents = []
for path, meta in tqdm(
zip(paths, metas), total=len(paths), desc="Converting text files", disable=not progress_bar
):
for source in sources:
if isinstance(source, (Path, str)):
try:
path = source
source = ByteStream.from_file_path(Path(source))
source.metadata["file_path"] = str(path)
except Exception as e:
logger.warning("Could not convert file %s. Skipping it. Error message: %s", source, e)
continue
try:
text = self._read_and_clean_file(
path=path, encoding=encoding, remove_numeric_tables=remove_numeric_tables
)
encoding = source.metadata.get("encoding", self.encoding)
document = Document(content=source.data.decode(encoding))
document.meta = source.metadata
documents.append(document)
except Exception as e:
logger.warning("Could not read file %s. Skipping it. Error message: %s", path, e)
continue
if valid_languages is not None and not TextFileToDocument._validate_language(text, valid_languages):
logger.warning(
"Text from file %s is not in one of the valid languages: %s. "
"The file may have been decoded incorrectly.",
path,
valid_languages,
)
document = Document(content=text, meta=meta)
documents.append(document)
logger.warning("Could not convert file %s. Skipping it. Error message: %s", source, e)
return {"documents": documents}
@staticmethod
def _prepare_metadata(metadata: Optional[Union[Dict, List[Dict]]], paths: List[Union[str, Path]]) -> List[Dict]:
"""
Prepare the metadata for the Documents.
:param metadata: The metadata for the Documents.
:param paths: The paths to the text files.
"""
if metadata is None:
return [{"file_path": str(path)} for path in paths]
if isinstance(metadata, dict):
metadata = [metadata] * len(paths)
if len(metadata) != len(paths):
raise PipelineRuntimeError(
f"The number of metadata entries must match the number of paths if metadata is a list. "
f"Number of paths: {len(paths)}, number of metadata entries: {len(metadata)}."
)
return [{**m, "file_path": m.get("file_path", str(path))} for m, path in zip(metadata, paths)]
def _read_and_clean_file(self, path: Union[str, Path], encoding: str, remove_numeric_tables: bool) -> str:
"""
Read and clean the text file.
:param path: The path to the text file.
:param encoding: The encoding of the text file.
:param remove_numeric_tables: Whether to remove numeric tables.
:return: The text of the file cleaned from numeric tables if `remove_numeric_tables` is `True`.
"""
if not Path(path).exists():
raise PipelineRuntimeError(f"File at path {path} does not exist.")
with open(path, encoding=encoding) as file:
text = file.read()
pages = text.split("\f")
cleaned_pages = [self._clean_page(page, remove_numeric_tables) for page in pages]
return "\f".join(cleaned_pages)
def _clean_page(self, page: str, remove_numeric_tables: bool) -> str:
"""
Clean a page of text from numeric tables if `remove_numeric_tables` is `True`.
:param page: The content of a page of a text file.
:param remove_numeric_tables: Whether to remove numeric tables.
:return: The text from the page cleaned from numeric tables if `remove_numeric_tables` is `True`.
"""
cleaned_lines = page.splitlines()
if remove_numeric_tables:
cleaned_lines = [line for line in cleaned_lines if not self._is_numeric_row(line)]
return "\n".join(cleaned_lines)
def _is_numeric_row(self, line: str) -> bool:
"""
Check if a line of a text file is a numeric row. A line is considered a numeric row if it contains more
than 40% digits and does not end with a period.
:param line: The content of a line of a text file.
"""
words = line.split()
digits = [word for word in words if any(char.isdigit() for char in word)]
return len(digits) / len(words) > self.numeric_row_threshold and not line.strip().endswith(".")
@staticmethod
def _validate_language(text: str, valid_languages: List[str]) -> bool:
"""
Validate if the detected language of the text is one of the valid languages.
:param text: The text to validate.
:param valid_languages: A list of valid languages.
"""
if not valid_languages:
return True
try:
lang = langdetect.detect(text)
except langdetect.lang_detect_exception.LangDetectException:
lang = None
return lang in valid_languages


@@ -0,0 +1,3 @@
preview:
- Remove most parameters from TextFileToDocument to make it match all other converters.
- Add support for ByteStreams
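A short sketch of the per-stream encoding override described in the converter's docstring above, assuming ByteStream accepts data and metadata keyword arguments, as its use in the component suggests:

from haystack.preview.components.file_converters.txt import TextFileToDocument
from haystack.preview.dataclasses import ByteStream

converter = TextFileToDocument(encoding="utf-8")  # component-wide default

# The "encoding" metadata key takes precedence over the component default
# when the stream's bytes are decoded.
stream = ByteStream(data="Grüße aus dem Test".encode("utf-16"), metadata={"encoding": "utf-16"})
docs = converter.run(sources=[stream])["documents"]
assert docs[0].content == "Grüße aus dem Test"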


@@ -4,253 +4,66 @@ from pathlib import Path
import pytest
from canals.errors import PipelineRuntimeError
from langdetect import LangDetectException
from haystack.preview.dataclasses import ByteStream
from haystack.preview.components.file_converters.txt import TextFileToDocument
class TestTextfileToDocument: # pylint: disable=R0904
class TestTextfileToDocument:
@pytest.mark.unit
def test_run(self, preview_samples_path):
"""
Test if the component runs correctly.
"""
paths = [preview_samples_path / "txt" / "doc_1.txt", preview_samples_path / "txt" / "doc_2.txt"]
bytestream = ByteStream.from_file_path(preview_samples_path / "txt" / "doc_3.txt")
bytestream.metadata["file_path"] = str(preview_samples_path / "txt" / "doc_3.txt")
bytestream.metadata["key"] = "value"
files = [
str(preview_samples_path / "txt" / "doc_1.txt"),
preview_samples_path / "txt" / "doc_2.txt",
bytestream,
]
converter = TextFileToDocument()
output = converter.run(paths=paths)
output = converter.run(sources=files)
docs = output["documents"]
assert len(docs) == 2
assert docs[0].content == "Some text for testing.\nTwo lines in here."
assert docs[1].content == "This is a test line.\n123 456 789\n987 654 321."
assert docs[0].meta["file_path"] == str(paths[0])
assert docs[1].meta["file_path"] == str(paths[1])
@pytest.mark.unit
def test_run_warning_for_invalid_language(self, preview_samples_path, caplog):
file_path = preview_samples_path / "txt" / "doc_1.txt"
converter = TextFileToDocument()
with patch(
"haystack.preview.components.file_converters.txt.langdetect.detect", return_value="en"
), caplog.at_level(logging.WARNING):
output = converter.run(paths=[file_path], valid_languages=["de"])
assert (
f"Text from file {file_path} is not in one of the valid languages: ['de']. "
f"The file may have been decoded incorrectly." in caplog.text
)
docs = output["documents"]
assert len(docs) == 1
assert docs[0].content == "Some text for testing.\nTwo lines in here."
assert len(docs) == 3
assert "Some text for testing." in docs[0].content
assert "This is a test line." in docs[1].content
assert "That's yet another file!" in docs[2].content
assert docs[0].meta["file_path"] == str(files[0])
assert docs[1].meta["file_path"] == str(files[1])
assert docs[2].meta == bytestream.metadata
@pytest.mark.unit
def test_run_error_handling(self, preview_samples_path, caplog):
"""
Test if the component correctly handles errors.
"""
paths = [preview_samples_path / "txt" / "doc_1.txt", "non_existing_file.txt"]
paths = [
preview_samples_path / "txt" / "doc_1.txt",
"non_existing_file.txt",
preview_samples_path / "txt" / "doc_3.txt",
]
converter = TextFileToDocument()
with caplog.at_level(logging.WARNING):
output = converter.run(paths=paths)
assert (
"Could not read file non_existing_file.txt. Skipping it. Error message: File at path non_existing_file.txt does not exist."
in caplog.text
)
output = converter.run(sources=paths)
assert "non_existing_file.txt" in caplog.text
docs = output["documents"]
assert len(docs) == 1
assert len(docs) == 2
assert docs[0].meta["file_path"] == str(paths[0])
assert docs[1].meta["file_path"] == str(paths[2])
@pytest.mark.unit
def test_prepare_metadata_no_metadata(self):
def test_encoding_override(self, preview_samples_path):
"""
Test if the metadata is correctly prepared when no custom metadata is provided.
Test if the encoding metadata field is used properly.
"""
converter = TextFileToDocument()
meta = converter._prepare_metadata(
metadata=None, paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt")]
)
assert len(meta) == 2
assert meta[0]["file_path"] == "data/sample_path_1.txt"
assert meta[1]["file_path"] == str(Path("data/sample_path_2.txt"))
bytestream = ByteStream.from_file_path(preview_samples_path / "txt" / "doc_1.txt")
bytestream.metadata["key"] = "value"
@pytest.mark.unit
def test_prepare_metadata_single_dict(self):
"""
Test if the metadata is correctly prepared when a single dict is provided.
"""
converter = TextFileToDocument()
meta = converter._prepare_metadata(
metadata={"name": "test"}, paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt")]
)
assert len(meta) == 2
assert meta[0]["file_path"] == "data/sample_path_1.txt"
assert meta[1]["file_path"] == str(Path("data/sample_path_2.txt"))
assert meta[0]["name"] == "test"
assert meta[1]["name"] == "test"
converter = TextFileToDocument(encoding="utf-16")
output = converter.run(sources=[bytestream])
assert "Some text for testing." not in output["documents"][0].content
@pytest.mark.unit
def test_prepare_metadata_list_of_dicts(self):
"""
Test if the metadata is correctly prepared when a list of dicts is provided.
"""
converter = TextFileToDocument()
meta = converter._prepare_metadata(
metadata=[{"name": "test1"}, {"name": "test2"}],
paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt")],
)
assert len(meta) == 2
assert meta[0]["file_path"] == "data/sample_path_1.txt"
assert meta[1]["file_path"] == str(Path("data/sample_path_2.txt"))
assert meta[0]["name"] == "test1"
assert meta[1]["name"] == "test2"
@pytest.mark.unit
def test_prepare_metadata_unmatching_list_len(self):
"""
Test if an error is raised when the number of metadata dicts is not equal to the number of
file paths.
"""
converter = TextFileToDocument()
with pytest.raises(
PipelineRuntimeError,
match="The number of metadata entries must match the number of paths if metadata is a list.",
):
converter._prepare_metadata(
metadata=[{"name": "test1"}, {"name": "test2"}],
paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt"), "data/sample_path_3.txt"],
)
@pytest.mark.unit
def test_read_and_clean_file(self, preview_samples_path):
"""
Test if the file is correctly read.
"""
file_path = preview_samples_path / "txt" / "doc_1.txt"
converter = TextFileToDocument()
text = converter._read_and_clean_file(path=file_path, encoding="utf-8", remove_numeric_tables=False)
assert text == "Some text for testing.\nTwo lines in here."
@pytest.mark.unit
def test_read_and_clean_file_non_existing_file(self):
"""
Test if an error is raised when the file does not exist.
"""
converter = TextFileToDocument()
file_path = "non_existing_file.txt"
with pytest.raises(PipelineRuntimeError, match=f"File at path {file_path} does not exist."):
converter._read_and_clean_file(path=file_path, encoding="utf-8", remove_numeric_tables=False)
@pytest.mark.unit
def test_read_and_clean_file_remove_numeric_tables(self, preview_samples_path):
"""
Test if the file is correctly read and numeric tables are removed.
"""
file_path = preview_samples_path / "txt" / "doc_2.txt"
converter = TextFileToDocument()
text = converter._read_and_clean_file(path=file_path, encoding="utf-8", remove_numeric_tables=True)
assert text == "This is a test line.\n987 654 321."
@pytest.mark.unit
def test_clean_page_without_remove_numeric_tables(self):
"""
Test if the page is not changed when remove_numeric_tables is False.
"""
converter = TextFileToDocument()
page = "This is a test line.\n123 456 789"
cleaned_page = converter._clean_page(page=page, remove_numeric_tables=False)
assert cleaned_page == page
@pytest.mark.unit
def test_clean_page_with_remove_numeric_tables(self):
"""
Test if the page is correctly cleaned when remove_numeric_tables is True.
"""
converter = TextFileToDocument()
page = "This is a test line.\n123 456 789"
cleaned_page = converter._clean_page(page=page, remove_numeric_tables=True)
assert cleaned_page == "This is a test line."
@pytest.mark.unit
def test_is_numeric_row_only_numbers(self):
"""
Test if the line is correctly identified as a numeric row when it only contains numbers.
"""
converter = TextFileToDocument()
line = "123 456 789"
assert converter._is_numeric_row(line=line)
@pytest.mark.unit
def test_is_numeric_row_only_text(self):
"""
Test if the line is correctly identified as a non-numeric row when it only contains text.
"""
converter = TextFileToDocument()
line = "This is a test line."
assert not converter._is_numeric_row(line=line)
@pytest.mark.unit
def test_is_numeric_row_only_numbers_with_period(self):
"""
Test if the line is correctly identified as a non-numeric row when it only contains numbers and a period at
the end.
"""
converter = TextFileToDocument()
line = "123 456 789."
assert not converter._is_numeric_row(line=line)
@pytest.mark.unit
def test_is_numeric_row_more_numbers_than_text(self):
"""
Test if the line is correctly identified as a numeric row when it consists of more than 40% numbers.
"""
converter = TextFileToDocument()
line = "123 456 789 This is a test"
assert converter._is_numeric_row(line=line)
@pytest.mark.unit
def test_is_numeric_row_less_numbers_than_text(self):
"""
Test if the line is correctly identified as a non-numeric row when it consists of less than 40% numbers.
"""
converter = TextFileToDocument()
line = "123 456 789 This is a test line"
assert not converter._is_numeric_row(line=line)
@pytest.mark.unit
def test_is_numeric_row_words_consist_of_numbers_and_text(self):
"""
Test if the line is correctly identified as a numeric row when the words consist of numbers and text.
"""
converter = TextFileToDocument()
line = "123eur 456usd"
assert converter._is_numeric_row(line=line)
@pytest.mark.unit
def test_validate_language(self):
"""
Test if the language is correctly validated.
"""
converter = TextFileToDocument()
with patch("haystack.preview.components.file_converters.txt.langdetect.detect", return_value="en"):
assert converter._validate_language(text="This is an english text.", valid_languages=["en"])
assert not converter._validate_language(text="This is an english text.", valid_languages=["de"])
@pytest.mark.unit
def test_validate_language_no_languages_specified(self):
"""
Test if _validate_languages returns True when no languages are specified.
"""
converter = TextFileToDocument()
assert converter._validate_language(text="This is an english test.", valid_languages=[])
@pytest.mark.unit
def test_validate_language_lang_detect_exception(self):
"""
Test if _validate_languages returns False when langdetect throws an exception.
"""
converter = TextFileToDocument()
with patch(
"haystack.preview.components.file_converters.txt.langdetect.detect",
side_effect=LangDetectException(code=0, message="Test"),
):
assert not converter._validate_language(text="This is an english text.", valid_languages=["en"])
bytestream.metadata["encoding"] = "utf-8"
output = converter.run(sources=[bytestream])
assert "Some text for testing." in output["documents"][0].content


@@ -0,0 +1,11 @@
That's yet another file!
it contains
many
empty lines.