mirror of
				https://github.com/deepset-ai/haystack.git
				synced 2025-11-04 11:49:23 +00:00 
			
		
		
		
	* Testing black on ui/ * Applying black on docstores * Add latest docstring and tutorial changes * Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too * Remove comments * Relax constraints on pydoc-markdown * Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade * Fix a couple of bugs * Add a type: ignore that was missing somehow * Give path to black * Apply Black * Apply Black * Relocate a couple of type: ignore * Update documentation * Make Linux CI run after applying Black * Triggering Black * Apply Black * Remove dependency, does not work well * Remove manually double trailing commas * Update documentation Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
		
			
				
	
	
		
			94 lines
		
	
	
		
			3.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			94 lines
		
	
	
		
			3.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
from pathlib import Path
 | 
						|
 | 
						|
from haystack.nodes.file_converter.pdf import PDFToTextConverter
 | 
						|
from haystack.nodes.preprocessor.preprocessor import PreProcessor
 | 
						|
 | 
						|
from conftest import SAMPLES_PATH
 | 
						|
 | 
						|
TEXT = """
 | 
						|
This is a sample sentence in paragraph_1. This is a sample sentence in paragraph_1. This is a sample sentence in 
 | 
						|
paragraph_1. This is a sample sentence in paragraph_1. This is a sample sentence in paragraph_1.
 | 
						|
 | 
						|
This is a sample sentence in paragraph_2. This is a sample sentence in paragraph_2. This is a sample sentence in 
 | 
						|
paragraph_2. This is a sample sentence in paragraph_2. This is a sample sentence in paragraph_2.
 | 
						|
 | 
						|
This is a sample sentence in paragraph_3. This is a sample sentence in paragraph_3. This is a sample sentence in 
 | 
						|
paragraph_3. This is a sample sentence in paragraph_3. This is to trick the test with using an abbreviation like Dr. 
 | 
						|
in the sentence. 
 | 
						|
"""
 | 
						|
 | 
						|
 | 
						|
def test_preprocess_sentence_split():
    """Sentence-based splitting: `split_length` counts sentences per output document."""
    doc = {"content": TEXT}

    # One sentence per document: TEXT contains 15 sentences in total.
    single_sentence = PreProcessor(
        split_length=1,
        split_overlap=0,
        split_by="sentence",
        split_respect_sentence_boundary=False,
    )
    assert len(single_sentence.process(doc)) == 15

    # Ten sentences per document: 15 sentences -> 2 documents.
    ten_sentences = PreProcessor(
        split_length=10,
        split_overlap=0,
        split_by="sentence",
        split_respect_sentence_boundary=False,
    )
    assert len(ten_sentences.process(doc)) == 2
 | 
						|
 | 
						|
 | 
						|
def test_preprocess_word_split():
    """Word-based splitting, with and without respecting sentence boundaries."""
    doc = {"content": TEXT}

    # Hard split every 10 words (sentences may be cut mid-way).
    processor = PreProcessor(
        split_length=10,
        split_overlap=0,
        split_by="word",
        split_respect_sentence_boundary=False,
    )
    assert len(processor.process(doc)) == 11

    # Respecting sentence boundaries: chunks may fall short of 15 words, and an
    # over-long sentence (the "Dr." trick sentence) is allowed to exceed it.
    processor = PreProcessor(split_length=15, split_overlap=0, split_by="word", split_respect_sentence_boundary=True)
    split_docs = processor.process(doc)
    for idx, split_doc in enumerate(split_docs):
        word_count = len(split_doc["content"].split(" "))
        if idx == 0:
            assert word_count == 14
        assert word_count <= 15 or split_doc["content"].startswith("This is to trick")
    assert len(split_docs) == 8

    # 40-word windows with a 10-word overlap between consecutive documents.
    processor = PreProcessor(
        split_length=40,
        split_overlap=10,
        split_by="word",
        split_respect_sentence_boundary=True,
    )
    assert len(processor.process(doc)) == 5

    # A split_length smaller than any sentence degenerates to one sentence per document.
    processor = PreProcessor(split_length=5, split_overlap=0, split_by="word", split_respect_sentence_boundary=True)
    assert len(processor.process(doc)) == 15
 | 
						|
 | 
						|
 | 
						|
def test_preprocess_passage_split():
    """Passage-based splitting: TEXT holds three blank-line-separated paragraphs."""
    doc = {"content": TEXT}

    # One paragraph per document -> 3 documents.
    one_passage = PreProcessor(
        split_length=1,
        split_overlap=0,
        split_by="passage",
        split_respect_sentence_boundary=False,
    )
    assert len(one_passage.process(doc)) == 3

    # Two paragraphs per document -> 2 documents (last one holds the remainder).
    two_passages = PreProcessor(
        split_length=2,
        split_overlap=0,
        split_by="passage",
        split_respect_sentence_boundary=False,
    )
    assert len(two_passages.process(doc)) == 2
 | 
						|
 | 
						|
 | 
						|
def test_clean_header_footer():
    """clean_header_footer strips the repeated header/footer from a converted PDF."""
    pdf_path = Path(SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf")
    document = PDFToTextConverter().convert(file_path=pdf_path)  # file contains header/footer

    cleaned = PreProcessor(clean_header_footer=True, split_by=None).process(document)

    # No splitting requested, so a single document comes back, with the
    # header/footer text removed from its content.
    assert len(cleaned) == 1
    content = cleaned[0]["content"]
    assert "This is a header." not in content
    assert "footer" not in content
 |