mirror of
				https://github.com/deepset-ai/haystack.git
				synced 2025-10-31 01:39:45 +00:00 
			
		
		
		
	 834f8c4902
			
		
	
	
		834f8c4902
		
			
		
	
	
	
	
		
			
			* Change return types of file converters * Change return types of preprocessor * Change return types of crawler * Adapt utils to functions to new return types * Adapt __init__.py to new method names * Prevent circular imports * Update Documentation & Code Style * Let DocStores' run method accept Documents * Adapt tests to new return types * Update Documentation & Code Style * Put "# type: ignore" to right place * Remove id_hash_keys property from Document primitive * Update Documentation & Code Style * Adapt tests to new return types and missing id_hash_keys property * Fix mypy * Fix mypy * Adapt PDFToTextOCRConverter * Remove id_hash_keys from RestAPI tests * Update Documentation & Code Style * Rename tests * Remove redundant setting of content_type="text" * Add DeprecationWarning * Add id_hash_keys to elasticsearch_index_to_document_store * Change document type from dict to Document in PreProcessor test * Fix file path in Tutorial 5 * Remove added output in Tutorial 5 * Update Documentation & Code Style * Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http * Adapt tutorials to new return types * Adapt tutorial 14 to new return types * Update Documentation & Code Style * Change assertions to HaystackErrors * Import HaystackError correctly Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
		
			
				
	
	
		
			112 lines
		
	
	
		
			4.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			112 lines
		
	
	
		
			4.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from pathlib import Path
 | |
| 
 | |
| from haystack import Document
 | |
| from haystack.nodes.file_converter.pdf import PDFToTextConverter
 | |
| from haystack.nodes.preprocessor.preprocessor import PreProcessor
 | |
| 
 | |
| from .conftest import SAMPLES_PATH
 | |
| 
 | |
# Fixture text used by the sentence/word/passage splitting tests in this module.
# It consists of three paragraphs separated by blank lines; the last paragraph
# deliberately contains the abbreviation "Dr." to challenge sentence splitting.
# NOTE: the trailing spaces at some line ends are part of the literal and the
# expected split counts in the tests depend on the exact content, so do not
# reformat this string.
TEXT = """
This is a sample sentence in paragraph_1. This is a sample sentence in paragraph_1. This is a sample sentence in 
paragraph_1. This is a sample sentence in paragraph_1. This is a sample sentence in paragraph_1.

This is a sample sentence in paragraph_2. This is a sample sentence in paragraph_2. This is a sample sentence in 
paragraph_2. This is a sample sentence in paragraph_2. This is a sample sentence in paragraph_2.

This is a sample sentence in paragraph_3. This is a sample sentence in paragraph_3. This is a sample sentence in 
paragraph_3. This is a sample sentence in paragraph_3. This is to trick the test with using an abbreviation like Dr. 
in the sentence. 
"""
 | |
| 
 | |
| 
 | |
def test_preprocess_sentence_split():
    """Split TEXT by sentence and verify the number of resulting documents.

    Two configurations are exercised, both without overlap and without
    respecting sentence boundaries: one sentence per chunk and ten
    sentences per chunk.
    """
    source_doc = Document(content=TEXT)

    # (sentences per chunk, expected number of documents produced)
    scenarios = ((1, 15), (10, 2))

    for sentences_per_chunk, expected_count in scenarios:
        splitter = PreProcessor(
            split_length=sentences_per_chunk,
            split_overlap=0,
            split_by="sentence",
            split_respect_sentence_boundary=False,
        )
        chunks = splitter.process(source_doc)
        assert len(chunks) == expected_count
 | |
| 
 | |
| 
 | |
def test_preprocess_word_split():
    """Split TEXT by word under several length/overlap/boundary settings.

    Checks the number of produced documents for each configuration and,
    for the 15-word boundary-respecting split, the per-document word counts.
    """
    source_doc = Document(content=TEXT)

    def split_words(length, overlap, respect_boundary):
        # Build a word-based PreProcessor for the given settings and run it on the fixture.
        splitter = PreProcessor(
            split_length=length,
            split_overlap=overlap,
            split_by="word",
            split_respect_sentence_boundary=respect_boundary,
        )
        return splitter.process(source_doc)

    # Plain 10-word windows, sentence boundaries ignored.
    chunks = split_words(10, 0, False)
    assert len(chunks) == 11

    # 15-word windows that respect sentence boundaries.
    chunks = split_words(15, 0, True)
    for position, chunk in enumerate(chunks):
        word_count = len(chunk.content.split(" "))
        if position == 0:
            assert word_count == 14
        # The chunk starting with "This is to trick" is allowed to exceed the limit.
        assert word_count <= 15 or chunk.content.startswith("This is to trick")
    assert len(chunks) == 8

    # 40-word windows with a 10-word overlap, respecting sentence boundaries.
    chunks = split_words(40, 10, True)
    assert len(chunks) == 5

    # 5-word windows, respecting sentence boundaries.
    chunks = split_words(5, 0, True)
    assert len(chunks) == 15
 | |
| 
 | |
| 
 | |
def test_preprocess_passage_split():
    """Split TEXT by passage and verify the number of resulting documents.

    Exercises one passage per chunk and two passages per chunk, both
    without overlap and without respecting sentence boundaries.
    """
    source_doc = Document(content=TEXT)

    # (passages per chunk, expected number of documents produced)
    scenarios = ((1, 3), (2, 2))

    for passages_per_chunk, expected_count in scenarios:
        splitter = PreProcessor(
            split_length=passages_per_chunk,
            split_overlap=0,
            split_by="passage",
            split_respect_sentence_boundary=False,
        )
        chunks = splitter.process(source_doc)
        assert len(chunks) == expected_count
 | |
| 
 | |
| 
 | |
def test_clean_header_footer():
    """Convert a sample PDF and check that header/footer cleaning removes them.

    The converted output is run through a PreProcessor with
    ``clean_header_footer=True`` and splitting disabled; exactly one document
    is expected, containing neither the header sentence nor the word "footer".
    """
    pdf_converter = PDFToTextConverter()
    # file contains header/footer
    sample_pdf = Path(SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf")
    converted = pdf_converter.convert(file_path=sample_pdf)

    cleaner = PreProcessor(clean_header_footer=True, split_by=None)
    cleaned_docs = cleaner.process(converted)

    assert len(cleaned_docs) == 1

    for unwanted in ("This is a header.", "footer"):
        assert unwanted not in cleaned_docs[0].content
 | |
| 
 | |
| 
 | |
def test_remove_substrings():
    """Check that ``remove_substrings`` strips exactly the requested substrings.

    The listed substrings must be gone from the processed document, while
    the other probed substrings must still be present.
    """
    raw_doc = Document(content="This is a header. Some additional text. wiki. Some emoji ✨ 🪲 Weird whitespace\b\b\b.")

    removed = ("This is a header.", "wiki", "🪲")
    kept = ("whitespace", "✨")

    # check that the file contains the substrings we are about to remove
    # (and the ones expected to survive)
    for fragment in removed + kept:
        assert fragment in raw_doc.content

    cleaner = PreProcessor(remove_substrings=list(removed))
    processed = cleaner.process(raw_doc)

    for fragment in removed:
        assert fragment not in processed[0].content
    for fragment in kept:
        assert fragment in processed[0].content
 |