haystack/examples/pipelines/indexing_pipeline_with_meta.py
2024-01-11 12:04:25 +01:00

58 lines
2.4 KiB
Python

from typing import Dict, Any
from pathlib import Path
from datetime import datetime
import os
from haystack import Pipeline
from haystack.dataclasses import ByteStream
from haystack.components.others import Multiplexer
from haystack.components.converters import PyPDFToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
# Store that the final "writer" component writes the processed documents into.
document_store = InMemoryDocumentStore()

# Pipeline components keyed by the name they are registered under,
# listed in registration order.
components = {
    "file_type_router": FileTypeRouter(mime_types=["text/plain", "application/pdf"]),
    "metadata_multiplexer": Multiplexer(Dict[str, Any]),
    "text_file_converter": TextFileToDocument(),
    "pdf_file_converter": PyPDFToDocument(),
    "joiner": DocumentJoiner(),
    "cleaner": DocumentCleaner(),
    "splitter": DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30),
    "writer": DocumentWriter(document_store=document_store),
}

# (sender, receiver) pairs, in the order the connections are made.
connections = [
    # Each MIME type output of the router goes to its matching converter.
    ("file_type_router.text/plain", "text_file_converter.sources"),
    ("file_type_router.application/pdf", "pdf_file_converter.sources"),
    # The multiplexer output is wired to the `meta` input of both converters.
    ("metadata_multiplexer", "text_file_converter.meta"),
    ("metadata_multiplexer", "pdf_file_converter.meta"),
    # Both converters feed the joiner, followed by a linear
    # cleaner -> splitter -> writer chain.
    ("text_file_converter.documents", "joiner.documents"),
    ("pdf_file_converter.documents", "joiner.documents"),
    ("joiner.documents", "cleaner.documents"),
    ("cleaner.documents", "splitter.documents"),
    ("splitter.documents", "writer.documents"),
]

p = Pipeline()
for component_name, component in components.items():
    p.add_component(instance=component, name=component_name)
for sender, receiver in connections:
    p.connect(sender, receiver)
# Add metadata to your files by using ByteStream.
# Every regular file in the current working directory becomes one ByteStream
# carrying its own per-file metadata ("path" and "position").
sources = []
for position, path in enumerate(Path(".").iterdir()):
    # Only regular files are read. `position` is the entry's index in the
    # directory listing, so skipped non-file entries leave gaps in the numbering.
    if not path.is_file():
        continue
    # Create the ByteStream
    source = ByteStream.from_file_path(path)
    # Add the metadata. The path is stored as a string because metadata is
    # expected to be JSON-serializable and a `Path` object is not.
    source.meta["path"] = str(path)
    source.meta["position"] = position
    sources.append(source)
# Inputs for the two entry points of the pipeline: the router receives the
# collected sources, and the multiplexer receives one metadata dict holding
# the current timestamp under "date_added".
shared_meta = {"date_added": datetime.now().isoformat()}
run_inputs = {
    "file_type_router": {"sources": sources},
    "metadata_multiplexer": {"value": shared_meta},
}
result = p.run(run_inputs)

# Sanity check: every document in the store carries the "date_added" key.
stored_documents = document_store.filter_documents()
assert all("date_added" in doc.meta for doc in stored_documents)