| 
									
										
										
										
											2024-01-09 14:59:22 +01:00
										 |  |  | from typing import Dict, Any | 
					
						
							|  |  |  | from pathlib import Path | 
					
						
							|  |  |  | from datetime import datetime | 
					
						
							| 
									
										
										
										
											2024-01-11 12:04:25 +01:00
										 |  |  | import os | 
					
						
							| 
									
										
										
										
											2024-01-09 14:59:22 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | from haystack import Pipeline | 
					
						
							| 
									
										
										
										
											2024-01-11 12:04:25 +01:00
										 |  |  | from haystack.dataclasses import ByteStream | 
					
						
							| 
									
										
										
										
											2024-01-09 14:59:22 +01:00
										 |  |  | from haystack.components.others import Multiplexer | 
					
						
							|  |  |  | from haystack.components.converters import PyPDFToDocument, TextFileToDocument | 
					
						
							|  |  |  | from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter | 
					
						
							| 
									
										
										
										
											2024-01-10 21:20:42 +01:00
										 |  |  | from haystack.components.routers import FileTypeRouter | 
					
						
							|  |  |  | from haystack.components.joiners import DocumentJoiner | 
					
						
							| 
									
										
										
										
											2024-01-09 14:59:22 +01:00
										 |  |  | from haystack.components.writers import DocumentWriter | 
					
						
							| 
									
										
										
										
											2024-01-10 21:20:42 +01:00
										 |  |  | from haystack.document_stores.in_memory import InMemoryDocumentStore | 
					
						
							| 
									
										
										
										
											2024-01-09 14:59:22 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | document_store = InMemoryDocumentStore() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | p = Pipeline() | 
					
						
							|  |  |  | p.add_component(instance=FileTypeRouter(mime_types=["text/plain", "application/pdf"]), name="file_type_router") | 
					
						
							|  |  |  | p.add_component(instance=Multiplexer(Dict[str, Any]), name="metadata_multiplexer") | 
					
						
							|  |  |  | p.add_component(instance=TextFileToDocument(), name="text_file_converter") | 
					
						
							|  |  |  | p.add_component(instance=PyPDFToDocument(), name="pdf_file_converter") | 
					
						
							|  |  |  | p.add_component(instance=DocumentJoiner(), name="joiner") | 
					
						
							|  |  |  | p.add_component(instance=DocumentCleaner(), name="cleaner") | 
					
						
							|  |  |  | p.add_component(instance=DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30), name="splitter") | 
					
						
							|  |  |  | p.add_component(instance=DocumentWriter(document_store=document_store), name="writer") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | p.connect("file_type_router.text/plain", "text_file_converter.sources") | 
					
						
							|  |  |  | p.connect("file_type_router.application/pdf", "pdf_file_converter.sources") | 
					
						
							|  |  |  | p.connect("metadata_multiplexer", "text_file_converter.meta") | 
					
						
							|  |  |  | p.connect("metadata_multiplexer", "pdf_file_converter.meta") | 
					
						
							|  |  |  | p.connect("text_file_converter.documents", "joiner.documents") | 
					
						
							|  |  |  | p.connect("pdf_file_converter.documents", "joiner.documents") | 
					
						
							|  |  |  | p.connect("joiner.documents", "cleaner.documents") | 
					
						
							|  |  |  | p.connect("cleaner.documents", "splitter.documents") | 
					
						
							|  |  |  | p.connect("splitter.documents", "writer.documents") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-11 12:04:25 +01:00
										 |  |  | # Add metadata to your files by using ByteStream | 
					
						
							|  |  |  | sources = [] | 
					
						
							|  |  |  | for position, path in enumerate(list(Path(".").iterdir())): | 
					
						
							|  |  |  |     if path.is_file(): | 
					
						
							|  |  |  |         # Create the ByteStream | 
					
						
							|  |  |  |         source = ByteStream.from_file_path(path) | 
					
						
							|  |  |  |         # Add the metadata | 
					
						
							|  |  |  |         source.meta["path"] = path | 
					
						
							|  |  |  |         source.meta["position"] = position | 
					
						
							|  |  |  |         sources.append(source) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-09 14:59:22 +01:00
										 |  |  | result = p.run( | 
					
						
							|  |  |  |     { | 
					
						
							| 
									
										
										
										
											2024-01-11 12:04:25 +01:00
										 |  |  |         "file_type_router": {"sources": sources}, | 
					
						
							| 
									
										
										
										
											2024-01-09 14:59:22 +01:00
										 |  |  |         "metadata_multiplexer": {"value": {"date_added": datetime.now().isoformat()}}, | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | assert all("date_added" in doc.meta for doc in document_store.filter_documents()) |