2020-10-30 18:06:02 +01:00
|
|
|
import pytest
|
|
|
|
|
2021-10-25 15:50:23 +02:00
|
|
|
from haystack.preprocessor.utils import convert_files_to_dicts, tika_convert_files_to_dicts
|
2020-10-01 14:47:45 +02:00
|
|
|
from haystack.preprocessor.cleaning import clean_wiki_text
|
|
|
|
|
|
|
|
|
2020-10-30 18:06:02 +01:00
|
|
|
@pytest.mark.tika
def test_convert_files_to_dicts(xpdf_fixture):
    """Check that converting the ``samples`` directory produces documents.

    Calls ``convert_files_to_dicts`` with ``clean_wiki_text`` as the cleaning
    function and paragraph splitting enabled, then verifies the result is
    non-empty.

    The ``xpdf_fixture`` argument is requested only for its setup; it is not
    read in the test body.
    NOTE(review): presumably it makes the xpdf tooling available -- confirm
    against the fixture definition.
    """
    converted = convert_files_to_dicts(
        dir_path="samples",
        clean_func=clean_wiki_text,
        split_paragraphs=True,
    )

    # Two separate checks so a failure tells us which condition broke.
    assert converted
    assert len(converted) > 0
|
|
|
|
|
|
|
|
|
2020-10-30 18:06:02 +01:00
|
|
|
@pytest.mark.tika
def test_tika_convert_files_to_dicts(tika_fixture):
    """Check that Tika-based conversion of ``samples`` produces documents.

    Calls ``tika_convert_files_to_dicts`` with ``clean_wiki_text`` as the
    cleaning function and paragraph splitting enabled, then verifies the
    result is non-empty.

    The ``tika_fixture`` argument is requested only for its setup; it is not
    read in the test body.
    NOTE(review): presumably it starts or checks for a Tika server -- confirm
    against the fixture definition.
    """
    converted = tika_convert_files_to_dicts(
        dir_path="samples",
        clean_func=clean_wiki_text,
        split_paragraphs=True,
    )

    # Two separate checks so a failure tells us which condition broke.
    assert converted
    assert len(converted) > 0
|
|
|
|
|