2024-01-09 14:59:22 +01:00
|
|
|
from typing import Dict, Any
|
|
|
|
from pathlib import Path
|
|
|
|
from datetime import datetime
|
2024-01-11 12:04:25 +01:00
|
|
|
import os
|
2024-01-09 14:59:22 +01:00
|
|
|
|
|
|
|
from haystack import Pipeline
|
2024-01-11 12:04:25 +01:00
|
|
|
from haystack.dataclasses import ByteStream
|
2024-01-09 14:59:22 +01:00
|
|
|
from haystack.components.others import Multiplexer
|
|
|
|
from haystack.components.converters import PyPDFToDocument, TextFileToDocument
|
|
|
|
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
|
2024-01-10 21:20:42 +01:00
|
|
|
from haystack.components.routers import FileTypeRouter
|
|
|
|
from haystack.components.joiners import DocumentJoiner
|
2024-01-09 14:59:22 +01:00
|
|
|
from haystack.components.writers import DocumentWriter
|
2024-01-10 21:20:42 +01:00
|
|
|
from haystack.document_stores.in_memory import InMemoryDocumentStore
|
2024-01-09 14:59:22 +01:00
|
|
|
|
|
|
|
|
|
|
|
# Store that the final "writer" component writes the processed documents into.
document_store = InMemoryDocumentStore()

# Build the preprocessing pipeline from two tables: the components to
# register (name, instance) and the connections between their sockets.
p = Pipeline()

for component_name, component in (
    ("file_type_router", FileTypeRouter(mime_types=["text/plain", "application/pdf"])),
    ("metadata_multiplexer", Multiplexer(Dict[str, Any])),
    ("text_file_converter", TextFileToDocument()),
    ("pdf_file_converter", PyPDFToDocument()),
    ("joiner", DocumentJoiner()),
    ("cleaner", DocumentCleaner()),
    ("splitter", DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30)),
    ("writer", DocumentWriter(document_store=document_store)),
):
    p.add_component(instance=component, name=component_name)

for output_socket, input_socket in (
    # Each routed MIME type goes to its matching converter.
    ("file_type_router.text/plain", "text_file_converter.sources"),
    ("file_type_router.application/pdf", "pdf_file_converter.sources"),
    # The multiplexer output is wired to the `meta` input of both converters.
    ("metadata_multiplexer", "text_file_converter.meta"),
    ("metadata_multiplexer", "pdf_file_converter.meta"),
    # Both converters feed the joiner; then clean -> split -> write.
    ("text_file_converter.documents", "joiner.documents"),
    ("pdf_file_converter.documents", "joiner.documents"),
    ("joiner.documents", "cleaner.documents"),
    ("cleaner.documents", "splitter.documents"),
    ("splitter.documents", "writer.documents"),
):
    p.connect(output_socket, input_socket)
|
|
|
|
|
2024-01-11 12:04:25 +01:00
|
|
|
# Add metadata to your files by using ByteStream.
# Each regular file in the current working directory becomes one ByteStream
# whose meta carries the file's path and its index in the directory listing.
sources = []
for position, path in enumerate(Path(".").iterdir()):
    # `position` counts every directory entry, so skipping non-files
    # leaves gaps in the recorded positions.
    if not path.is_file():
        continue

    # Create the ByteStream
    source = ByteStream.from_file_path(path)

    # Add the metadata
    # NOTE(review): `path` is stored as a pathlib.Path object, not a string;
    # use str(path) instead if this metadata ever has to be JSON-serialised.
    source.meta["path"] = path
    source.meta["position"] = position
    sources.append(source)
|
|
|
|
|
2024-01-09 14:59:22 +01:00
|
|
|
# Pipeline inputs are keyed by component name: the router receives the
# ByteStreams, the multiplexer receives the metadata dict that is wired to
# the `meta` input of both converters.
run_inputs = {
    "file_type_router": {"sources": sources},
    "metadata_multiplexer": {"value": {"date_added": datetime.now().isoformat()}},
}
result = p.run(run_inputs)

# Every document that reached the store must carry the injected metadata key.
stored_documents = document_store.filter_documents()
assert all("date_added" in doc.meta for doc in stored_documents)
|