Remove pipeline_utils package and dependent code (#6806)

Vladimir Blagojevic 2024-01-23 18:40:43 +01:00 committed by GitHub
parent 4efe40664c
commit c47b82c54f
7 changed files with 0 additions and 604 deletions

View File

@@ -1,22 +0,0 @@
import os
from haystack import Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.pipeline_utils import build_rag_pipeline
API_KEY = "SET YOUR OPENAI API KEY HERE"
# We support many different databases. Here we load a simple and lightweight in-memory document store.
document_store = InMemoryDocumentStore()
# Create some example documents and add them to the document store.
documents = [
Document(content="My name is Jean and I live in Paris."),
Document(content="My name is Mark and I live in Berlin."),
Document(content="My name is Giorgio and I live in Rome."),
]
document_store.write_documents(documents)
# Let's now build a simple RAG pipeline that uses a generative model to answer questions.
rag_pipeline = build_rag_pipeline(llm_api_key=API_KEY, document_store=document_store)
answers = rag_pipeline.run(query="Who lives in Rome?")
print(answers.data)
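With build_rag_pipeline gone, this getting-started flow has to wire the components explicitly. Below is a minimal sketch of an equivalent pipeline, mirroring the retriever → prompt builder → generator structure the removed _RAGPipeline set up; it uses InMemoryBM25Retriever so the example documents do not need embeddings and assumes OPENAI_API_KEY is exported in the environment.

```python
# Sketch of an explicit replacement for the removed build_rag_pipeline helper (not part of this commit).
from haystack import Document, Pipeline
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.generators import OpenAIGenerator
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.document_stores.in_memory import InMemoryDocumentStore

document_store = InMemoryDocumentStore()
document_store.write_documents([
    Document(content="My name is Jean and I live in Paris."),
    Document(content="My name is Mark and I live in Berlin."),
    Document(content="My name is Giorgio and I live in Rome."),
])

# Same default prompt the removed resolve_prompt_template returned.
template = """
Given these documents, answer the question.
Documents:
{% for doc in documents %}
{{ doc.content }}
{% endfor %}
Question: {{ question }}
Answer:
"""

query = "Who lives in Rome?"
pipeline = Pipeline()
pipeline.add_component("retriever", InMemoryBM25Retriever(document_store=document_store))
pipeline.add_component("prompt_builder", PromptBuilder(template=template))
pipeline.add_component("llm", OpenAIGenerator())  # assumes OPENAI_API_KEY is set in the environment
pipeline.connect("retriever.documents", "prompt_builder.documents")
pipeline.connect("prompt_builder.prompt", "llm.prompt")

result = pipeline.run({"retriever": {"query": query}, "prompt_builder": {"question": query}})
print(result["llm"]["replies"][0])
```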

View File

@@ -1,34 +0,0 @@
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.pipeline_utils import build_rag_pipeline, build_indexing_pipeline
from haystack.pipeline_utils.indexing import download_files
# We are model agnostic :) In this getting-started example you can choose any OpenAI or Hugging Face TGI generation model
generation_model = "gpt-3.5-turbo"
API_KEY = "sk-..." # ADD YOUR KEY HERE
# We support many different databases. Here, we load a simple and lightweight in-memory database.
document_store = InMemoryDocumentStore()
# Download example files from the web
files = download_files(sources=["http://www.paulgraham.com/superlinear.html"])
# Pipelines are our main abstraction.
# Here we create a pipeline that can index TXT and HTML. You can also use your own private files.
indexing_pipeline = build_indexing_pipeline(
document_store=document_store,
embedding_model="intfloat/e5-base-v2",
supported_mime_types=["text/plain", "text/html"], # "application/pdf"
)
indexing_pipeline.run(files=files) # you can also supply files=[path_to_directory], which is searched recursively
# RAG pipeline with vector-based retriever + LLM
rag_pipeline = build_rag_pipeline(
document_store=document_store,
embedding_model="intfloat/e5-base-v2",
generation_model=generation_model,
llm_api_key=API_KEY,
)
# For details, like which documents were used to generate the answer, look into the result object
result = rag_pipeline.run(query="What are superlinear returns and why are they important?")
print(result.data)
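Without build_indexing_pipeline, the indexing half of this example has to be assembled by hand. A rough sketch of an equivalent pipeline follows, mirroring the converter → cleaner → splitter → embedder → writer chain the removed _IndexingPipeline built; it assumes the page has already been saved locally as superlinear.html (a placeholder name), whereas the removed download_files helper fetched it with LinkContentFetcher.

```python
# Sketch of an explicit replacement for the removed build_indexing_pipeline helper (not part of this commit).
from haystack import Pipeline
from haystack.components.converters import HTMLToDocument
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore

document_store = InMemoryDocumentStore()

indexing = Pipeline()
indexing.add_component("converter", HTMLToDocument())
indexing.add_component("cleaner", DocumentCleaner())
indexing.add_component("splitter", DocumentSplitter())
indexing.add_component("embedder", SentenceTransformersDocumentEmbedder(model="intfloat/e5-base-v2"))
indexing.add_component("writer", DocumentWriter(document_store=document_store))
indexing.connect("converter.documents", "cleaner.documents")
indexing.connect("cleaner.documents", "splitter.documents")
indexing.connect("splitter.documents", "embedder.documents")
indexing.connect("embedder.documents", "writer.documents")

# "superlinear.html" is a placeholder for a locally saved copy of the article.
indexing.run({"converter": {"sources": ["superlinear.html"]}})
```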

View File

@@ -1,4 +0,0 @@
from haystack.pipeline_utils.rag import build_rag_pipeline
from haystack.pipeline_utils.indexing import build_indexing_pipeline
__all__ = ["build_rag_pipeline", "build_indexing_pipeline"]

View File

@@ -1,235 +0,0 @@
import os
import re
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import Optional, List, Any, Dict
from typing import Union, Type
from haystack import Pipeline
from haystack.components.converters import TextFileToDocument
from haystack.components.embedders import SentenceTransformersDocumentEmbedder, OpenAIDocumentEmbedder
from haystack.components.fetchers import LinkContentFetcher
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DocumentStore
def download_files(sources: List[str]) -> List[str]:
"""
Downloads a list of files from the web and returns the local paths where they are stored.
:param sources: A list of URLs to download.
:type sources: List[str]
:return: A list of paths to the downloaded files.
"""
fetcher = LinkContentFetcher(user_agents=["Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"])
streams = fetcher.run(urls=sources)
all_files = []
for stream in streams["streams"]:
file_suffix = ".html" if stream.meta["content_type"] == "text/html" else ".pdf"
f = NamedTemporaryFile(delete=False, suffix=file_suffix)
stream.to_file(Path(f.name))
all_files.append(f.name)
return all_files
def build_indexing_pipeline(
document_store: Any,
embedding_model: str = "intfloat/e5-base-v2",
embedding_model_kwargs: Optional[Dict[str, Any]] = None,
supported_mime_types: Optional[List[str]] = None,
):
"""
Returns a prebuilt pipeline for indexing documents into a DocumentStore. The indexing pipeline automatically detects
the file type of the input files and converts them into Documents. The supported file types are .txt,
.pdf, and .html, but by default only .txt and .html are indexed unless the supported_mime_types parameter is set.
Example usage:
```python
from haystack.pipeline_utils import build_indexing_pipeline
indexing_pipe = build_indexing_pipeline(document_store=your_document_store_instance)
indexing_pipe.run(files=["path/to/file1", "path/to/file2"])
>>> {'documents_written': 2}
```
One can also pass an embedding model to the pipeline, which will then calculate embeddings for the documents
and store them in the DocumentStore. Example usage:
```python
indexing_pipe = build_indexing_pipeline(document_store=your_document_store_instance,
embedding_model="sentence-transformers/all-mpnet-base-v2")
indexing_pipe.run(files=["path/to/file1", "path/to/file2"])
>>> {'documents_written': 2}
```
After running the indexing pipeline, the documents are indexed in the DocumentStore and can be used for querying.
:param document_store: An instance of a DocumentStore to index documents into.
:param embedding_model: The name of the model to use for document embeddings.
Needs to be compatible with SentenceTransformers.
:param embedding_model_kwargs: Keyword arguments to pass to the embedding model class.
:param supported_mime_types: List of MIME types to support in the pipeline. If not given,
defaults to ["text/plain", "text/html"].
"""
return _IndexingPipeline(
document_store=document_store,
embedding_model=embedding_model,
embedding_model_kwargs=embedding_model_kwargs,
supported_mime_types=supported_mime_types,
)
class _IndexingPipeline:
"""
An internal class to simplify the creation of a prebuilt pipeline for indexing documents into a DocumentStore. The
indexing pipeline automatically detects the file type of the input files and converts them into Documents. The
supported file types are: .txt, .pdf, and .html
"""
def __init__(
self,
document_store: DocumentStore,
embedding_model: Optional[str] = None,
embedding_model_kwargs: Optional[Dict[str, Any]] = None,
supported_mime_types: Optional[List[str]] = None,
):
"""
:param document_store: An instance of a DocumentStore to index documents into.
:param embedding_model: The name of the model to use for document embeddings.
:param supported_mime_types: List of MIME types to support in the pipeline. If not given,
defaults to ["text/plain", "text/html"].
"""
if supported_mime_types is None:
supported_mime_types = ["text/plain", "text/html"]
self.pipeline = Pipeline()
self.pipeline.add_component("file_type_router", FileTypeRouter(mime_types=supported_mime_types))
converters_used: List[str] = []
# Add converters dynamically based on MIME types
if "text/plain" in supported_mime_types:
self.pipeline.add_component("text_file_converter", TextFileToDocument())
self.pipeline.connect("file_type_router.text/plain", "text_file_converter.sources")
converters_used.append("text_file_converter")
if "application/pdf" in supported_mime_types:
from haystack.components.converters import PyPDFToDocument
self.pipeline.add_component("pdf_file_converter", PyPDFToDocument())
self.pipeline.connect("file_type_router.application/pdf", "pdf_file_converter.sources")
converters_used.append("pdf_file_converter")
if "text/html" in supported_mime_types:
from haystack.components.converters import HTMLToDocument
self.pipeline.add_component("html_file_converter", HTMLToDocument())
self.pipeline.connect("file_type_router.text/html", "html_file_converter.sources")
converters_used.append("html_file_converter")
# Add remaining common components
self.pipeline.add_component("document_joiner", DocumentJoiner())
self.pipeline.add_component("document_cleaner", DocumentCleaner())
self.pipeline.add_component("document_splitter", DocumentSplitter())
# Connect converters to joiner, if they exist
for converter_name in converters_used:
self.pipeline.connect(f"{converter_name}.documents", "document_joiner.documents")
# Connect joiner to cleaner and splitter
self.pipeline.connect("document_joiner.documents", "document_cleaner.documents")
self.pipeline.connect("document_cleaner.documents", "document_splitter.documents")
if embedding_model:
embedder_instance = self._find_embedder(embedding_model, embedding_model_kwargs)
self.pipeline.add_component("storage_sink", DocumentWriter(document_store=document_store))
self.pipeline.add_component("writer", embedder_instance)
self.pipeline.connect("writer", "storage_sink")
else:
self.pipeline.add_component("writer", DocumentWriter(document_store=document_store))
self.pipeline.connect("document_splitter.documents", "writer.documents")
# this is more of a sanity check for the maintainer of the pipeline, to make sure that the pipeline is
# configured correctly
if len(self.pipeline.inputs()) < 1:
raise RuntimeError("IndexingPipeline needs at least one input component.")
if len(self.pipeline.outputs()) < 1:
raise RuntimeError("IndexingPipeline needs at least one output component.")
def run(self, files: List[Union[str, Path]]) -> Dict[str, Any]:
"""
Performs indexing of the given list of documents into the DocumentStore.
:param files: A list of paths to files to index.
:type files: List[Union[str, Path]]
:return: the output of the pipeline run, which is a dictionary containing the number of documents written
"""
if not files:
return {"documents_written": 0}
input_files = self._process_files(files)
pipeline_output = self.pipeline.run(data={"file_type_router": {"sources": input_files}})
aggregated_results = {}
# combine the results of all outputs into one dictionary
for component_result in pipeline_output.values():
aggregated_results.update(component_result)
return aggregated_results
def _find_embedder(self, embedding_model: str, init_kwargs: Optional[Dict[str, Any]] = None) -> Any:
embedder_patterns = {
r"^text-embedding.*": OpenAIDocumentEmbedder,
r"^sentence-transformers.*": SentenceTransformersDocumentEmbedder,
r"^intfloat/.*": SentenceTransformersDocumentEmbedder,
# add more patterns or adjust them here
}
embedder_class = next((val for pat, val in embedder_patterns.items() if re.match(pat, embedding_model)), None)
if not embedder_class:
raise ValueError(
f"Could not find an embedder for the given embedding model name {embedding_model}. "
f"Please provide a valid embedding model name. "
f"Valid embedder classes are {embedder_patterns.values()}."
)
return self._create_embedder(embedder_class, embedding_model, init_kwargs)
def _create_embedder(self, embedder_class: Type, model: str, init_kwargs: Optional[Dict[str, Any]] = None) -> Any:
# Note: here we assume the embedder accepts a parameter called `model` that takes the model's name or path.
# See https://github.com/deepset-ai/haystack/issues/6534
kwargs = {**(init_kwargs or {}), "model": model}
return embedder_class(**kwargs)
def _list_files_recursively(self, path: Union[str, Path]) -> List[str]:
"""
List all files in a directory recursively as a list of strings, or return the file itself
if it's not a directory.
:param path: the path to list files from
:type path: Union[str, Path]
:return: a list of strings, where each string is a path to a file
"""
if os.path.isfile(path):
return [str(path)]
elif os.path.isdir(path):
file_list: List[str] = []
for root, _, files in os.walk(path):
for file in files:
file_list.append(os.path.join(root, file))
return file_list
else:
return []
def _process_files(self, files: List[Union[str, Path]]) -> List[str]:
"""
Process a list of files and directories, listing all files recursively and removing duplicates.
:param files: A list of files and directories to process.
:type files: List[Union[str, Path]]
:return: A list of unique files.
"""
nested_file_lists = [self._list_files_recursively(file) for file in files]
combined_files = [item for sublist in nested_file_lists for item in sublist]
unique_files = list(set(combined_files))
return unique_files
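For reference, the MIME-type routing this module set up with FileTypeRouter and DocumentJoiner can be reproduced directly with the underlying components; a minimal sketch, assuming only plain-text and HTML inputs and using placeholder file names:

```python
# Sketch of the MIME-type routing the removed _IndexingPipeline configured (hypothetical usage).
from haystack import Pipeline
from haystack.components.converters import HTMLToDocument, TextFileToDocument
from haystack.components.joiners import DocumentJoiner
from haystack.components.routers import FileTypeRouter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore

pipe = Pipeline()
pipe.add_component("file_type_router", FileTypeRouter(mime_types=["text/plain", "text/html"]))
pipe.add_component("text_file_converter", TextFileToDocument())
pipe.add_component("html_file_converter", HTMLToDocument())
pipe.add_component("document_joiner", DocumentJoiner())
pipe.add_component("writer", DocumentWriter(document_store=InMemoryDocumentStore()))
pipe.connect("file_type_router.text/plain", "text_file_converter.sources")
pipe.connect("file_type_router.text/html", "html_file_converter.sources")
pipe.connect("text_file_converter.documents", "document_joiner.documents")
pipe.connect("html_file_converter.documents", "document_joiner.documents")
pipe.connect("document_joiner.documents", "writer.documents")

# "notes.txt" and "page.html" are placeholder paths.
pipe.run({"file_type_router": {"sources": ["notes.txt", "page.html"]}})
```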

View File

@@ -1,204 +0,0 @@
import re
from abc import ABC, abstractmethod
from typing import Optional, Any
from huggingface_hub import HfApi
from haystack import Pipeline
from haystack.components.builders.answer_builder import AnswerBuilder
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.generators import OpenAIGenerator, HuggingFaceTGIGenerator
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.dataclasses import Answer
from haystack.document_stores.types import DocumentStore
from haystack.document_stores.in_memory import InMemoryDocumentStore
def build_rag_pipeline(
document_store: DocumentStore,
embedding_model: str = "intfloat/e5-base-v2",
generation_model: str = "gpt-3.5-turbo",
llm_api_key: Optional[str] = None,
prompt_template: Optional[str] = None,
):
"""
Returns a prebuilt pipeline to perform retrieval augmented generation
:param document_store: An instance of a DocumentStore to read from.
:param embedding_model: The name of the model to use for embedding. Only SentenceTransformers models are supported in this getting-started code.
:param prompt_template: The template to use for the prompt. If not given, a default RAG template is used.
:param generation_model: The name of the model to use for generation.
Currently supporting: OpenAI generation models and Huggingface TGI models for text generation
:param llm_api_key: The API key to use for the OpenAI language model. If not given, the key is read
from the OPENAI_API_KEY environment variable.
"""
# Resolve components based on the chosen parameters
retriever = resolve_retriever(document_store)
embedder = resolve_embedder(embedding_model)
generator = resolve_generator(generation_model, llm_api_key)
prompt_template = resolve_prompt_template(prompt_template)
# Add them to the Pipeline and connect them
pipeline = _RAGPipeline(
retriever=retriever, embedder=embedder, generator=generator, prompt_template=prompt_template
)
return pipeline
class _RAGPipeline:
"""
A simple ready-made pipeline for RAG. It requires a populated document store.
"""
def __init__(self, retriever: Any, embedder: Any, generator: Any, prompt_template: str):
"""
Initializes the pipeline.
:param retriever: The retriever to use.
:param embedder: The embedder to use.
:param generator: The generator to use.
:param prompt_template: The template to use for the prompt.
"""
self.pipeline = Pipeline()
self.pipeline.add_component(instance=embedder, name="text_embedder")
self.pipeline.add_component(instance=retriever, name="retriever")
self.pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder")
self.pipeline.add_component(instance=generator, name="llm")
self.pipeline.add_component(instance=AnswerBuilder(), name="answer_builder")
self.pipeline.connect("text_embedder", "retriever")
self.pipeline.connect("retriever", "prompt_builder.documents")
self.pipeline.connect("prompt_builder.prompt", "llm.prompt")
self.pipeline.connect("llm.replies", "answer_builder.replies")
self.pipeline.connect("llm.meta", "answer_builder.meta")
self.pipeline.connect("retriever", "answer_builder.documents")
def run(self, query: str) -> Answer:
"""
Performs RAG using the given query.
:param query: The query to ask.
:return: An Answer object.
"""
run_values = {
"prompt_builder": {"question": query},
"answer_builder": {"query": query},
"text_embedder": {"text": query},
}
return self.pipeline.run(run_values)["answer_builder"]["answers"][0]
def resolve_embedder(embedding_model: str) -> SentenceTransformersTextEmbedder:
"""
Resolves the embedder
:param embedding_model: The embedding model to use.
"""
try:
embedder = SentenceTransformersTextEmbedder(model=embedding_model)
except Exception:
raise ValueError(
f"Embedding model: {embedding_model} is not supported. Please provide a SentenceTransformers model."
f"You can download the models through the huggingface model hub here: https://huggingface.co/sentence-transformers"
)
return embedder
def resolve_retriever(document_store, retriever_class: Optional[Any] = None) -> Optional[Any]:
"""
Resolves the retriever class to use for the given document store.
:param document_store: The document store to use.
:param retriever_class: The retriever class to use. If not given, it will be inferred from the document store.
"""
# first match the document store to the retriever
# TODO: add more retrievers
embedding_retriever_map = {InMemoryDocumentStore: InMemoryEmbeddingRetriever}
retriever_clazz = retriever_class or embedding_retriever_map.get(type(document_store))
if not retriever_clazz:
raise ValueError(
f"Document store {type(document_store)} is not supported. Please provide a retriever class or use "
f"one of the following document stores: {list(embedding_retriever_map.keys())}"
)
retriever = retriever_clazz(document_store=document_store) # type: ignore
return retriever
def resolve_generator(generation_model: str, llm_api_key: Optional[str] = None) -> Optional[Any]:
"""
Resolves the generator to use for the given generation model.
:param generation_model: The generation model to use.
:param llm_api_key: The API key to use for the language model.
"""
generator = None
for resolver_clazz in _GeneratorResolver.get_resolvers():
resolver = resolver_clazz()
generator = resolver.resolve(generation_model, llm_api_key)
if generator:
break
if not generator:
raise ValueError(f"Could not resolve LLM generator for the given model {generation_model}")
return generator
def resolve_prompt_template(prompt_template: Optional[str]) -> str:
prompt_template = (
prompt_template
or """
Given these documents, answer the question.
Documents:
{% for doc in documents %}
{{ doc.content }}
{% endfor %}
Question: {{question}}
Answer:
"""
)
return prompt_template
class _GeneratorResolver(ABC):
_resolvers = [] # type: ignore
def __init_subclass__(cls, **kwargs):
super().__init_subclass__(**kwargs)
_GeneratorResolver._resolvers.append(cls)
@abstractmethod
def resolve(self, model_key: str, api_key: str) -> Any:
pass
@classmethod
def get_resolvers(cls):
return cls._resolvers
class _OpenAIResolved(_GeneratorResolver):
"""
Resolves the OpenAIGenerator.
"""
def resolve(self, model_key: str, api_key: str) -> Any:
# does the model_key match the OpenAI GPT naming pattern?
if re.match(r"^gpt-4-.*", model_key) or re.match(r"^gpt-3\.5-.*", model_key):
return OpenAIGenerator(model=model_key, api_key=api_key)
return None
class _HuggingFaceTGIGeneratorResolved(_GeneratorResolver):
"""
Resolves the HuggingFaceTGIGenerator.
"""
def resolve(self, model_key: str, api_key: str) -> Any:
hf = HfApi()
try:
hf.model_info(model_key)
return HuggingFaceTGIGenerator(model=model_key, token=api_key, generation_kwargs={"max_new_tokens": 1024})
except Exception:
return None
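Without the _GeneratorResolver registry, the generator choice moves to the caller. A brief sketch of the equivalent decision, using the same constructors the resolvers called; make_generator and the "sk-..." key are placeholders, not part of this commit:

```python
# Hypothetical helper replacing the removed resolver classes.
from typing import Any

from haystack.components.generators import HuggingFaceTGIGenerator, OpenAIGenerator


def make_generator(model: str, api_key: str) -> Any:
    """Return an OpenAI generator for gpt-* model names, otherwise fall back to a TGI-hosted model."""
    if model.startswith(("gpt-4", "gpt-3.5")):
        return OpenAIGenerator(model=model, api_key=api_key)
    return HuggingFaceTGIGenerator(model=model, token=api_key, generation_kwargs={"max_new_tokens": 1024})


llm = make_generator("gpt-3.5-turbo", api_key="sk-...")  # placeholder key
```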

View File

@@ -1,76 +0,0 @@
import os
import pytest
from haystack.pipeline_utils.indexing import build_indexing_pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
class TestIndexingPipeline:
# indexing files without embeddings
@pytest.mark.integration
def test_indexing_files_without_embeddings(self, test_files_path):
file_paths = [test_files_path / "txt" / "doc_1.txt", test_files_path / "txt" / "doc_2.txt"]
document_store = InMemoryDocumentStore()
pipeline = build_indexing_pipeline(document_store=document_store)
result = pipeline.run(files=file_paths)
assert result == {"documents_written": 2}
# indexing files with embeddings
@pytest.mark.integration
def test_indexing_files_with_embeddings(self, test_files_path):
document_store = InMemoryDocumentStore()
pipeline = build_indexing_pipeline(
document_store=document_store, embedding_model="sentence-transformers/all-mpnet-base-v2"
)
file_paths = [test_files_path / "txt" / "doc_1.txt", test_files_path / "txt" / "doc_2.txt"]
result = pipeline.run(files=file_paths)
assert result == {"documents_written": 2}
@pytest.mark.integration
def test_indexing_dirs_with_embeddings(self, test_files_path):
document_store = InMemoryDocumentStore()
pipeline = build_indexing_pipeline(
document_store=document_store, embedding_model="sentence-transformers/all-mpnet-base-v2"
)
file_paths = [test_files_path / "txt"]
result = pipeline.run(files=file_paths)
assert "documents_written" in result
assert result["documents_written"] >= 3
# indexing multiple file types
@pytest.mark.integration
def test_indexing_multiple_file_types(self, test_files_path):
document_store = InMemoryDocumentStore()
pipeline = build_indexing_pipeline(
document_store=document_store, supported_mime_types=["text/plain", "application/pdf"]
)
file_paths = [
test_files_path / "txt" / "doc_1.txt",
test_files_path / "txt" / "doc_2.txt",
test_files_path / "pdf" / "sample_pdf_1.pdf",
]
result = pipeline.run(files=file_paths)
# pdf gets split into 2 documents
assert result == {"documents_written": 4}
# indexing empty list of files
def test_indexing_empty_list_of_files(self):
document_store = InMemoryDocumentStore()
pipeline = build_indexing_pipeline(document_store=document_store)
result = pipeline.run(files=[])
assert result == {"documents_written": 0}
# embedding model is not found
def test_embedding_model_not_found(self):
document_store = InMemoryDocumentStore()
with pytest.raises(ValueError, match="Could not find an embedder"):
build_indexing_pipeline(document_store=document_store, embedding_model="invalid_model")
@pytest.mark.skipif(os.environ.get("OPENAI_API_KEY", "") == "", reason="OPENAI_API_KEY is not set")
@pytest.mark.integration
def test_open_ai_embedding_model(self):
document_store = InMemoryDocumentStore()
pipe = build_indexing_pipeline(document_store=document_store, embedding_model="text-embedding-ada-002")
# don't run the pipeline and waste credits, just check that it was created correctly
assert pipe is not None

View File

@@ -1,29 +0,0 @@
import os
import pytest
from haystack.dataclasses import Answer
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.pipeline_utils.rag import build_rag_pipeline
from haystack.testing.factory import document_store_class
@pytest.mark.skipif(os.environ.get("OPENAI_API_KEY", "") == "", reason="OPENAI_API_KEY is not set")
@pytest.mark.integration
def test_rag_pipeline(mock_chat_completion):
rag_pipe = build_rag_pipeline(document_store=InMemoryDocumentStore())
answer = rag_pipe.run(query="question")
assert isinstance(answer, Answer)
def test_rag_pipeline_other_docstore():
FakeStore = document_store_class("FakeStore")
with pytest.raises(ValueError, match="InMemoryDocumentStore"):
assert build_rag_pipeline(document_store=FakeStore())
def test_rag_pipeline_embedder_exist_if_model_is_given():
rag_pipe = build_rag_pipeline(
document_store=InMemoryDocumentStore(), embedding_model="sentence-transformers/all-mpnet-base-v2"
)
assert "text_embedder" in rag_pipe.pipeline.graph.nodes