haystack/test/nodes/test_preprocessor.py
Sara Zan 59608ca474
[CI Refactoring] Workflow refactoring (#2576)
* Unify CI tests (from #2466)

* Update Documentation & Code Style

* Change folder names

* Fix markers list

* Remove marker 'slow', replaced with 'integration'

* Soften children check

* Start ES first so it has time to boot while Python is setup

* Run the full workflow

* Try to make pip upgrade on Windows

* Set KG tests as integration

* Update Documentation & Code Style

* typo

* faster pylint

* Make Pylint use the cache

* filter diff files for pylint

* debug pylint statement

* revert pylint changes

* Remove path from asserted log (fails on Windows)

* Skip preprocessor test on Windows

* Tackling Windows specific failures

* Fix pytest command for windows suites

* Remove \ from command

* Move poppler test into integration

* Skip opensearch test on windows

* Add tolerance in reader sas score for Windows

* Another pytorch approx

* Raise time limit for unit tests :(

* Skip poppler test on Windows CI

* Specify to pull with FF only in docs check

* temporarily run the docs check immediately

* Allow merge commit for now

* Try without fetch depth

* Accelerating test

* Accelerating test

* Add repository and ref alongside fetch-depth

* Separate out code&docs check from tests

* Use setup-python cache

* Delete custom action

* Remove the pull step in the docs check, will find a way to run on bot commits

* Add requirements.txt in .github for caching

* Actually install dependencies

* Change deps group for pylint

* Unclear why the requirements.txt is still required :/

* Fix the code check python setup

* Install all deps for pylint

* Make the autoformat check depend on tests and doc updates workflows

* Try installing dependencies in another order

* Try again to install the deps

* quoting the paths

* Ad back the requirements

* Try again to install rest_api and ui

* Change deps group

* Duplicate haystack install line

* See if the cache is the problem

* Disable also in mypy, who knows

* split the install step

* Split install step everywhere

* Revert "Separate out code&docs check from tests"

This reverts commit 1cd59b15ffc5b984e1d642dcbf4c8ccc2bb6c9bd.

* Add back the action

* Proactive support for audio (see text2speech branch)

* Fix label generator tests

* Remove install of libsndfile1 on win temporarily

* exclude audio tests on win

* install ffmpeg for integration tests

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
2022-06-07 09:23:03 +02:00

116 lines
4.4 KiB
Python

import sys
from pathlib import Path
import pytest
from haystack import Document
from haystack.nodes.file_converter.pdf import PDFToTextConverter
from haystack.nodes.preprocessor.preprocessor import PreProcessor
from ..conftest import SAMPLES_PATH
TEXT = """
This is a sample sentence in paragraph_1. This is a sample sentence in paragraph_1. This is a sample sentence in
paragraph_1. This is a sample sentence in paragraph_1. This is a sample sentence in paragraph_1.
This is a sample sentence in paragraph_2. This is a sample sentence in paragraph_2. This is a sample sentence in
paragraph_2. This is a sample sentence in paragraph_2. This is a sample sentence in paragraph_2.
This is a sample sentence in paragraph_3. This is a sample sentence in paragraph_3. This is a sample sentence in
paragraph_3. This is a sample sentence in paragraph_3. This is to trick the test with using an abbreviation like Dr.
in the sentence.
"""
def test_preprocess_sentence_split():
document = Document(content=TEXT)
preprocessor = PreProcessor(
split_length=1, split_overlap=0, split_by="sentence", split_respect_sentence_boundary=False
)
documents = preprocessor.process(document)
assert len(documents) == 15
preprocessor = PreProcessor(
split_length=10, split_overlap=0, split_by="sentence", split_respect_sentence_boundary=False
)
documents = preprocessor.process(document)
assert len(documents) == 2
def test_preprocess_word_split():
document = Document(content=TEXT)
preprocessor = PreProcessor(
split_length=10, split_overlap=0, split_by="word", split_respect_sentence_boundary=False
)
documents = preprocessor.process(document)
assert len(documents) == 11
preprocessor = PreProcessor(split_length=15, split_overlap=0, split_by="word", split_respect_sentence_boundary=True)
documents = preprocessor.process(document)
for i, doc in enumerate(documents):
if i == 0:
assert len(doc.content.split(" ")) == 14
assert len(doc.content.split(" ")) <= 15 or doc.content.startswith("This is to trick")
assert len(documents) == 8
preprocessor = PreProcessor(
split_length=40, split_overlap=10, split_by="word", split_respect_sentence_boundary=True
)
documents = preprocessor.process(document)
assert len(documents) == 5
preprocessor = PreProcessor(split_length=5, split_overlap=0, split_by="word", split_respect_sentence_boundary=True)
documents = preprocessor.process(document)
assert len(documents) == 15
def test_preprocess_passage_split():
document = Document(content=TEXT)
preprocessor = PreProcessor(
split_length=1, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False
)
documents = preprocessor.process(document)
assert len(documents) == 3
preprocessor = PreProcessor(
split_length=2, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False
)
documents = preprocessor.process(document)
assert len(documents) == 2
@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="FIXME Footer not detected correctly on Windows")
def test_clean_header_footer():
converter = PDFToTextConverter()
document = converter.convert(
file_path=Path(SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf")
) # file contains header/footer
preprocessor = PreProcessor(clean_header_footer=True, split_by=None)
documents = preprocessor.process(document)
assert len(documents) == 1
assert "This is a header." not in documents[0].content
assert "footer" not in documents[0].content
def test_remove_substrings():
document = Document(content="This is a header. Some additional text. wiki. Some emoji ✨ 🪲 Weird whitespace\b\b\b.")
# check that the file contains the substrings we are about to remove
assert "This is a header." in document.content
assert "wiki" in document.content
assert "🪲" in document.content
assert "whitespace" in document.content
assert "" in document.content
preprocessor = PreProcessor(remove_substrings=["This is a header.", "wiki", "🪲"])
documents = preprocessor.process(document)
assert "This is a header." not in documents[0].content
assert "wiki" not in documents[0].content
assert "🪲" not in documents[0].content
assert "whitespace" in documents[0].content
assert "" in documents[0].content