mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-06 12:07:04 +00:00
feat: Add TextFileToDocument component (v2) (#5467)
* Add TextfileToDocument component * Add docstrings * Add unit tests * Add release note file * Make use of progress bar * Add TextfileToDocument to __init__.py * Use lazy % formatting in logging functions * Remove f from non-f-string * Add TextfileToDocument to __init__.py * Use correct dependency extra * Compare file path against path object * PR feedback * PR feedback * Update haystack/preview/components/file_converters/txt.py Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> * Update docstrings * Add error handling * Add unit test * Reintroduce falsely removed caplog --------- Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
This commit is contained in:
parent
8920fd6939
commit
a51ca19fe4
@ -1,2 +1,3 @@
|
||||
from haystack.preview.components.audio.whisper_local import LocalWhisperTranscriber
|
||||
from haystack.preview.components.audio.whisper_remote import RemoteWhisperTranscriber
|
||||
from haystack.preview.components.file_converters import TextFileToDocument
|
||||
|
||||
1
haystack/preview/components/file_converters/__init__.py
Normal file
1
haystack/preview/components/file_converters/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
from haystack.preview.components.file_converters.txt import TextFileToDocument
|
||||
245
haystack/preview/components/file_converters/txt.py
Normal file
245
haystack/preview/components/file_converters/txt.py
Normal file
@ -0,0 +1,245 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Union, Dict
|
||||
|
||||
from canals.errors import PipelineRuntimeError
|
||||
from tqdm import tqdm
|
||||
|
||||
from haystack import Document
|
||||
from haystack.lazy_imports import LazyImport
|
||||
from haystack.preview import component
|
||||
|
||||
with LazyImport("Run 'pip install farm-haystack[preprocessing]'") as langdetect_import:
|
||||
import langdetect
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@component
class TextFileToDocument:
    """
    A component for converting a text file to a Document.
    """

    @component.input
    def input(self):
        class Input:
            """
            Input data for the TextFileToDocument component.

            :param paths: A list of paths to text files.
            :param meta: Optional metadata to attach to the Documents. If a list is provided, the length of the list
                must match the number of paths.
                Default: `None`
            :param encoding: The encoding of the text files. Default: `"utf-8"`
            :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                The tabular structures in documents might be noise for reader models if they
                don't have table parsing capability for finding answers. However, tables
                may also have long strings that could be possible candidates for answers.
                The rows containing strings are thus retained in this option.
                Default: `False`
            :param valid_languages: Validate languages from a list of languages specified in the
                [ISO 639-1 format](https://en.wikipedia.org/wiki/ISO_639-1).
                This option can be used to add a test for encoding errors. If the extracted text is
                not one of the valid languages, then there might be an encoding error resulting
                in garbled text.
                Default: `None`
            :param id_hash_keys: Generate the Document ID from a custom list of strings that refer to the Document's
                attributes. If you want to ensure you don't have duplicate Documents in your
                DocumentStore but texts are not unique, you can modify the metadata and pass e.g.
                `"meta"` to this field (for example `["content", "meta"]`).
                In this case the ID will be generated by using the content and the defined metadata.
                Default: `None`
            :param progress_bar: Whether to show a progress bar for the conversion process.
                Default: `True`
            """

            paths: List[Union[str, Path]]
            meta: Optional[Union[Dict, List[Dict]]]
            encoding: Optional[str]
            remove_numeric_tables: Optional[bool]
            valid_languages: Optional[List[str]]
            id_hash_keys: Optional[List[str]]
            progress_bar: Optional[bool]

        return Input

    @component.output
    def output(self):
        class Output:
            """
            Output data from the TextFileToDocument component.

            :param documents: The converted documents.
            """

            documents: List[Document]

        return Output

    def __init__(
        self,
        encoding: str = "utf-8",
        remove_numeric_tables: bool = False,
        numeric_row_threshold: float = 0.4,
        valid_languages: Optional[List[str]] = None,
        id_hash_keys: Optional[List[str]] = None,
        progress_bar: bool = True,
    ):
        """
        Create a TextFileToDocument component.

        :param encoding: The encoding of the text files. Default: `"utf-8"`
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
            The tabular structures in documents might be noise for reader models if they
            don't have table parsing capability for finding answers. However, tables
            may also have long strings that could be possible candidates for answers.
            The rows containing strings are thus retained in this option.
            Default: `False`
        :param numeric_row_threshold: Applicable if `remove_numeric_tables` is set to `True`. This is the threshold to
            determine if a line in the provided text file is a numeric table row or not.
            The value is the ratio of numeric words to the total number of words in a line.
        :param valid_languages: Validate languages from a list of languages specified in the
            [ISO 639-1 format](https://en.wikipedia.org/wiki/ISO_639-1).
            This option can be used to add a test for encoding errors. If the extracted text is
            not one of the valid languages, then there might be an encoding error resulting
            in garbled text.
            Default: `None`
        :param id_hash_keys: Generate the Document ID from a custom list of strings that refer to the Document's
            attributes. If you want to ensure you don't have duplicate Documents in your DocumentStore
            but texts are not unique, you can modify the metadata and pass e.g. `"meta"` to this field
            (for example `["content", "meta"]`). In this case the ID will be generated by using the
            content and the defined metadata. Default: `None`
        :param progress_bar: Whether to show a progress bar for the conversion process.
            Default: `True`
        """
        # langdetect is an optional dependency; fail early with an install hint if it is missing.
        langdetect_import.check()

        self.defaults = {
            "encoding": encoding,
            "remove_numeric_tables": remove_numeric_tables,
            "valid_languages": valid_languages,
            "id_hash_keys": id_hash_keys,
            "progress_bar": progress_bar,
        }
        self.numeric_row_threshold = numeric_row_threshold

    def run(self, data):
        """
        Convert text files to Documents.

        Files that cannot be read are skipped with a warning instead of aborting the whole run.

        :param data: Input data for the TextFileToDocument component.
        """
        file_paths = data.paths
        metas = TextFileToDocument._prepare_metadata(data.meta, file_paths)

        documents = []
        for path, meta in tqdm(
            zip(file_paths, metas), total=len(file_paths), desc="Converting text files", disable=not data.progress_bar
        ):
            try:
                text = self._read_and_clean_file(
                    path=path, encoding=data.encoding, remove_numeric_tables=data.remove_numeric_tables
                )
            except Exception as e:
                logger.warning("Could not read file %s. Skipping it. Error message: %s", path, e)
                continue

            # A failed language check only warns (possible mis-decoding); the Document is still created.
            if data.valid_languages is not None and not TextFileToDocument._validate_language(
                text, data.valid_languages
            ):
                logger.warning(
                    "Text from file %s is not in one of the valid languages: %s. "
                    "The file may have been decoded incorrectly.",
                    path,
                    data.valid_languages,
                )

            document = Document(content=text, meta=meta, id_hash_keys=data.id_hash_keys)
            documents.append(document)

        return self.output(documents=documents)

    @staticmethod
    def _prepare_metadata(meta: Optional[Union[Dict, List[Dict]]], file_paths: List[Union[str, Path]]) -> List[Dict]:
        """
        Prepare the metadata for the Documents.

        Always ensures a `"file_path"` entry per Document; a user-provided `"file_path"` wins over the actual path.

        :param meta: The metadata for the Documents.
        :param file_paths: The paths to the text files.
        :raises PipelineRuntimeError: If `meta` is a list whose length differs from the number of paths.
        """
        if meta is None:
            return [{"file_path": str(path)} for path in file_paths]

        # A single dict applies to every file.
        if isinstance(meta, dict):
            meta = [meta] * len(file_paths)

        if len(meta) != len(file_paths):
            raise PipelineRuntimeError(
                f"The number of meta entries must match the number of paths if meta is a list. "
                f"Number of paths: {len(file_paths)}, number of meta entries: {len(meta)}."
            )

        return [{**m, "file_path": m.get("file_path", str(path))} for m, path in zip(meta, file_paths)]

    def _read_and_clean_file(self, path: Union[str, Path], encoding: str, remove_numeric_tables: bool) -> str:
        """
        Read and clean the text file.

        :param path: The path to the text file.
        :param encoding: The encoding of the text file.
        :param remove_numeric_tables: Whether to remove numeric tables.
        :raises PipelineRuntimeError: If no file exists at `path`.

        :return: The text of the file cleaned from numeric tables if `remove_numeric_tables` is `True`.
        """
        if not Path(path).exists():
            raise PipelineRuntimeError(f"File at path {path} does not exist.")

        with open(path, encoding=encoding) as file:
            text = file.read()
            # Pages are delimited by form feed characters; clean each page separately.
            pages = text.split("\f")
            cleaned_pages = [self._clean_page(page, remove_numeric_tables) for page in pages]
            return "\f".join(cleaned_pages)

    def _clean_page(self, page: str, remove_numeric_tables: bool) -> str:
        """
        Clean a page of text from numeric tables if `remove_numeric_tables` is `True`.

        :param page: The content of a page of a text file.
        :param remove_numeric_tables: Whether to remove numeric tables.

        :return: The text from the page cleaned from numeric tables if `remove_numeric_tables` is `True`.
        """
        cleaned_lines = page.splitlines()
        if remove_numeric_tables:
            cleaned_lines = [line for line in cleaned_lines if not self._is_numeric_row(line)]

        return "\n".join(cleaned_lines)

    def _is_numeric_row(self, line: str) -> bool:
        """
        Check if a line of a text file is a numeric row. A line is considered a numeric row if the ratio of words
        containing digits exceeds `numeric_row_threshold` and the line does not end with a period.

        :param line: The content of a line of a text file.
        """
        words = line.split()
        if not words:
            # Guard against ZeroDivisionError: blank or whitespace-only lines are not numeric rows.
            return False
        digits = [word for word in words if any(char.isdigit() for char in word)]
        return len(digits) / len(words) > self.numeric_row_threshold and not line.strip().endswith(".")

    @staticmethod
    def _validate_language(text: str, valid_languages: List[str]) -> bool:
        """
        Validate if the detected language of the text is one of the valid languages.

        :param text: The text to validate.
        :param valid_languages: A list of valid languages.
        """
        if not valid_languages:
            return True

        try:
            lang = langdetect.detect(text)
        except langdetect.lang_detect_exception.LangDetectException:
            # langdetect cannot determine a language (e.g. empty or purely numeric text).
            lang = None

        return lang in valid_languages
|
||||
@ -0,0 +1,4 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Add new TextFileToDocument component to Haystack v2 preview so that text files can be converted to Haystack Documents.
|
||||
0
test/preview/components/file_converters/__init__.py
Normal file
0
test/preview/components/file_converters/__init__.py
Normal file
@ -0,0 +1,256 @@
|
||||
import logging
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
|
||||
from canals.errors import PipelineRuntimeError
|
||||
from langdetect import LangDetectException
|
||||
|
||||
from haystack.preview.components.file_converters.txt import TextFileToDocument
|
||||
from test.preview.components.base import BaseTestComponent
|
||||
from test.conftest import preview_samples_path
|
||||
|
||||
|
||||
class TestTextfileToDocument(BaseTestComponent):
    @pytest.mark.unit
    def test_run(self, preview_samples_path):
        """
        Test if the component runs correctly.
        """
        file_paths = [preview_samples_path / "txt" / "doc_1.txt", preview_samples_path / "txt" / "doc_2.txt"]
        converter = TextFileToDocument()
        output = converter.run(data=converter.input(paths=file_paths))
        docs = output.documents
        assert len(docs) == 2
        assert docs[0].content == "Some text for testing.\nTwo lines in here."
        assert docs[1].content == "This is a test line.\n123 456 789\n987 654 321."
        assert docs[0].meta["file_path"] == str(file_paths[0])
        assert docs[1].meta["file_path"] == str(file_paths[1])

    @pytest.mark.unit
    def test_run_warning_for_invalid_language(self, preview_samples_path, caplog):
        """
        Test that a warning is logged when the detected language is not in the valid languages.
        """
        file_path = preview_samples_path / "txt" / "doc_1.txt"
        converter = TextFileToDocument()
        with patch("haystack.preview.components.file_converters.txt.langdetect.detect", return_value="en"):
            with caplog.at_level(logging.WARNING):
                output = converter.run(data=converter.input(paths=[file_path], valid_languages=["de"]))
                assert (
                    f"Text from file {file_path} is not in one of the valid languages: ['de']. "
                    "The file may have been decoded incorrectly." in caplog.text
                )

        docs = output.documents
        assert len(docs) == 1
        assert docs[0].content == "Some text for testing.\nTwo lines in here."

    @pytest.mark.unit
    def test_run_error_handling(self, preview_samples_path, caplog):
        """
        Test if the component correctly handles errors.
        """
        file_paths = [preview_samples_path / "txt" / "doc_1.txt", "non_existing_file.txt"]
        converter = TextFileToDocument()
        with caplog.at_level(logging.WARNING):
            output = converter.run(data=converter.input(paths=file_paths))
            assert (
                "Could not read file non_existing_file.txt. Skipping it. Error message: File at path non_existing_file.txt does not exist."
                in caplog.text
            )
        docs = output.documents
        assert len(docs) == 1
        assert docs[0].meta["file_path"] == str(file_paths[0])

    @pytest.mark.unit
    def test_prepare_metadata_no_metadata(self):
        """
        Test if the metadata is correctly prepared when no custom metadata is provided.
        """
        converter = TextFileToDocument()
        meta = converter._prepare_metadata(
            meta=None, file_paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt")]
        )
        assert len(meta) == 2
        assert meta[0]["file_path"] == "data/sample_path_1.txt"
        assert meta[1]["file_path"] == str(Path("data/sample_path_2.txt"))

    @pytest.mark.unit
    def test_prepare_metadata_single_dict(self):
        """
        Test if the metadata is correctly prepared when a single dict is provided.
        """
        converter = TextFileToDocument()
        meta = converter._prepare_metadata(
            meta={"name": "test"}, file_paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt")]
        )
        assert len(meta) == 2
        assert meta[0]["file_path"] == "data/sample_path_1.txt"
        assert meta[1]["file_path"] == str(Path("data/sample_path_2.txt"))
        assert meta[0]["name"] == "test"
        assert meta[1]["name"] == "test"

    @pytest.mark.unit
    def test_prepare_metadata_list_of_dicts(self):
        """
        Test if the metadata is correctly prepared when a list of dicts is provided.
        """
        converter = TextFileToDocument()
        meta = converter._prepare_metadata(
            meta=[{"name": "test1"}, {"name": "test2"}],
            file_paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt")],
        )
        assert len(meta) == 2
        assert meta[0]["file_path"] == "data/sample_path_1.txt"
        assert meta[1]["file_path"] == str(Path("data/sample_path_2.txt"))
        assert meta[0]["name"] == "test1"
        assert meta[1]["name"] == "test2"

    @pytest.mark.unit
    def test_prepare_metadata_unmatching_list_len(self):
        """
        Test if an error is raised when the number of metadata dicts is not equal to the number of
        file paths.
        """
        converter = TextFileToDocument()
        with pytest.raises(
            PipelineRuntimeError, match="The number of meta entries must match the number of paths if meta is a list."
        ):
            converter._prepare_metadata(
                meta=[{"name": "test1"}, {"name": "test2"}],
                file_paths=["data/sample_path_1.txt", Path("data/sample_path_2.txt"), "data/sample_path_3.txt"],
            )

    @pytest.mark.unit
    def test_read_and_clean_file(self, preview_samples_path):
        """
        Test if the file is correctly read.
        """
        file_path = preview_samples_path / "txt" / "doc_1.txt"
        converter = TextFileToDocument()
        text = converter._read_and_clean_file(path=file_path, encoding="utf-8", remove_numeric_tables=False)
        assert text == "Some text for testing.\nTwo lines in here."

    @pytest.mark.unit
    def test_read_and_clean_file_non_existing_file(self):
        """
        Test if an error is raised when the file does not exist.
        """
        converter = TextFileToDocument()
        file_path = "non_existing_file.txt"
        with pytest.raises(PipelineRuntimeError, match=f"File at path {file_path} does not exist."):
            converter._read_and_clean_file(path=file_path, encoding="utf-8", remove_numeric_tables=False)

    @pytest.mark.unit
    def test_read_and_clean_file_remove_numeric_tables(self, preview_samples_path):
        """
        Test if the file is correctly read and numeric tables are removed.
        """
        file_path = preview_samples_path / "txt" / "doc_2.txt"
        converter = TextFileToDocument()
        text = converter._read_and_clean_file(path=file_path, encoding="utf-8", remove_numeric_tables=True)
        assert text == "This is a test line.\n987 654 321."

    @pytest.mark.unit
    def test_clean_page_without_remove_numeric_tables(self):
        """
        Test if the page is not changed when remove_numeric_tables is False.
        """
        converter = TextFileToDocument()
        page = "This is a test line.\n123 456 789"
        cleaned_page = converter._clean_page(page=page, remove_numeric_tables=False)
        assert cleaned_page == page

    @pytest.mark.unit
    def test_clean_page_with_remove_numeric_tables(self):
        """
        Test if the page is correctly cleaned when remove_numeric_tables is True.
        """
        converter = TextFileToDocument()
        page = "This is a test line.\n123 456 789"
        cleaned_page = converter._clean_page(page=page, remove_numeric_tables=True)
        assert cleaned_page == "This is a test line."

    @pytest.mark.unit
    def test_is_numeric_row_only_numbers(self):
        """
        Test if the line is correctly identified as a numeric row when it only contains numbers.
        """
        converter = TextFileToDocument()
        line = "123 456 789"
        assert converter._is_numeric_row(line=line)

    @pytest.mark.unit
    def test_is_numeric_row_only_text(self):
        """
        Test if the line is correctly identified as a non-numeric row when it only contains text.
        """
        converter = TextFileToDocument()
        line = "This is a test line."
        assert not converter._is_numeric_row(line=line)

    @pytest.mark.unit
    def test_is_numeric_row_only_numbers_with_period(self):
        """
        Test if the line is correctly identified as a non-numeric row when it only contains numbers and a period at
        the end.
        """
        converter = TextFileToDocument()
        line = "123 456 789."
        assert not converter._is_numeric_row(line=line)

    @pytest.mark.unit
    def test_is_numeric_row_more_numbers_than_text(self):
        """
        Test if the line is correctly identified as a numeric row when more than 40% of its words contain numbers.
        """
        converter = TextFileToDocument()
        line = "123 456 789 This is a test"
        assert converter._is_numeric_row(line=line)

    @pytest.mark.unit
    def test_is_numeric_row_less_numbers_than_text(self):
        """
        Test if the line is correctly identified as a non-numeric row when less than 40% of its words contain numbers.
        """
        converter = TextFileToDocument()
        line = "123 456 789 This is a test line"
        assert not converter._is_numeric_row(line=line)

    @pytest.mark.unit
    def test_is_numeric_row_words_consist_of_numbers_and_text(self):
        """
        Test if the line is correctly identified as a numeric row when the words consist of numbers and text.
        """
        converter = TextFileToDocument()
        line = "123eur 456usd"
        assert converter._is_numeric_row(line=line)

    @pytest.mark.unit
    def test_validate_language(self):
        """
        Test if the language is correctly validated.
        """
        converter = TextFileToDocument()
        with patch("haystack.preview.components.file_converters.txt.langdetect.detect", return_value="en"):
            assert converter._validate_language(text="This is an english text.", valid_languages=["en"])
            assert not converter._validate_language(text="This is an english text.", valid_languages=["de"])

    @pytest.mark.unit
    def test_validate_language_no_languages_specified(self):
        """
        Test if _validate_languages returns True when no languages are specified.
        """
        converter = TextFileToDocument()
        assert converter._validate_language(text="This is an english test.", valid_languages=[])

    @pytest.mark.unit
    def test_validate_language_lang_detect_exception(self):
        """
        Test if _validate_languages returns False when langdetect throws an exception.
        """
        converter = TextFileToDocument()
        with patch(
            "haystack.preview.components.file_converters.txt.langdetect.detect",
            side_effect=LangDetectException(code=0, message="Test"),
        ):
            assert not converter._validate_language(text="This is an english text.", valid_languages=["en"])
|
||||
2
test/preview/test_files/txt/doc_1.txt
Normal file
2
test/preview/test_files/txt/doc_1.txt
Normal file
@ -0,0 +1,2 @@
|
||||
Some text for testing.
|
||||
Two lines in here.
|
||||
3
test/preview/test_files/txt/doc_2.txt
Normal file
3
test/preview/test_files/txt/doc_2.txt
Normal file
@ -0,0 +1,3 @@
|
||||
This is a test line.
|
||||
123 456 789
|
||||
987 654 321.
|
||||
Loading…
x
Reference in New Issue
Block a user