mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-07-21 07:51:40 +00:00

* Modify __str__ and __repr__ for Document and Answer * Rename QueryClassifier in Tutorial11 * Improve the output of tutorial1 * Make the output of Tutorial8 a bit less dense * Add a print_questions util to print the output of question generating pipelines * Replace custom printing with the new utility in Tutorial13 * Ensure all output is printed with minimal details in Tutorial14 and add some titles * Minor change to print_answers * Make tutorial3's output the same as tutorial1 * Add __repr__ to Answer and fix to_dict() * Fix a bug in the Document and Answer's __str__ method * Improve print_answers, print_documents and print_questions * Using print_answers in Tutorial7 and fixing typo in the utils * Remove duplicate line in Tutorial12 * Use print_answers in Tutorial4 * Add explanation of what the documents in the output of the basic QA pipeline are * Move the fields constant into print_answers * Normalize all 'minimal' to 'minimum' (they were mixed up) * Improve the sample output to include all fields from Document and Answer Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
102 lines
4.3 KiB
Python
102 lines
4.3 KiB
Python
from haystack.utils import convert_files_to_dicts, fetch_archive_from_http, clean_wiki_text
|
|
from haystack.nodes import Seq2SeqGenerator
|
|
|
|
|
|
def tutorial12_lfqa():
    """
    Run the long-form question answering (LFQA) tutorial end to end.

    The steps, in order: create a `FAISSDocumentStore`, download and index the
    Game of Thrones articles, compute document embeddings with an
    `EmbeddingRetriever`, sanity-check the retrieval, and finally answer two
    questions with a `GenerativeQAPipeline` built around a `Seq2SeqGenerator`.
    All results are printed; nothing is returned.
    """
    # Function-local imports, gathered in one place.
    from haystack.document_stores.faiss import FAISSDocumentStore
    from haystack.nodes import EmbeddingRetriever
    from haystack.pipelines import DocumentSearchPipeline, GenerativeQAPipeline
    from haystack.utils import print_documents

    # --- Document Store -----------------------------------------------------
    # FAISS is a library for efficient similarity search on a cluster of dense
    # vectors. The `FAISSDocumentStore` uses a SQL (SQLite in-memory by default)
    # database under the hood to store the document text and other meta data.
    # The vector embeddings of the text are indexed on a FAISS index that is
    # later queried when searching for answers.
    # The default flavour of FAISSDocumentStore is "Flat", but it can also be set
    # to "HNSW" for faster search at the expense of some accuracy: just set the
    # `faiss_index_factory_str` argument in the constructor.
    # For more info on which suits your use case:
    # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
    #
    # NOTE(review): later Haystack releases appear to call this argument
    # `embedding_dim` instead of `vector_dim` -- confirm against the installed
    # version. 128 presumably matches the RetriBERT embedding size -- TODO confirm.
    store = FAISSDocumentStore(vector_dim=128, faiss_index_factory_str="Flat")

    # --- Cleaning & indexing documents ---------------------------------------
    # As in the previous tutorials, we download, convert and index some
    # Game of Thrones articles into the document store.
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Convert the files to dicts and write them straight to the DB.
    store.write_documents(
        convert_files_to_dicts(
            dir_path=doc_dir,
            clean_func=clean_wiki_text,
            split_paragraphs=True,
        )
    )

    # --- Initialize Retriever and Reader/Generator ---------------------------
    # An `EmbeddingRetriever` configured with the RetriBERT model is used, and
    # `update_embeddings` indexes the document embeddings in the
    # `FAISSDocumentStore`.
    retriever = EmbeddingRetriever(
        document_store=store,
        embedding_model="yjernite/retribert-base-uncased",
        model_format="retribert",
    )
    store.update_embeddings(retriever)

    # Before blindly using the retriever, empirically test it to make sure a
    # simple search indeed finds the relevant documents.
    search_pipeline = DocumentSearchPipeline(retriever)
    retrieved = search_pipeline.run(
        query="Tell me something about Arya Stark?",
        params={"Retriever": {"top_k": 1}},
    )
    print_documents(retrieved, max_text_len=512)

    # Similar to previous tutorials we now initialize our reader/generator.
    # Here we use a `Seq2SeqGenerator` with the *yjernite/bart_eli5* model
    # (see: https://huggingface.co/yjernite/bart_eli5).
    generator = Seq2SeqGenerator(model_name_or_path="yjernite/bart_eli5")

    # --- Pipeline --------------------------------------------------------------
    # With a Haystack `Pipeline` you can stick together your building blocks to
    # a search pipeline. Under the hood, `Pipelines` are Directed Acyclic Graphs
    # (DAGs) that you can easily customize for your own use cases.
    # To speed things up, Haystack also comes with a few predefined Pipelines.
    # One of them is the `GenerativeQAPipeline`, which combines a retriever and
    # a reader/generator to answer our questions.
    # You can learn more about `Pipelines` in the
    # [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
    pipe = GenerativeQAPipeline(generator, retriever)

    # Voilà! Ask the questions, one after the other, printing each answer.
    questions = (
        "Why did Arya Stark's character get portrayed in a television adaptation?",
        "What kind of character does Arya Stark play?",
    )
    for question in questions:
        # A fresh params dict per call, exactly as if each call were written out.
        outcome = pipe.run(query=question, params={"Retriever": {"top_k": 1}})
        print(f"Query: {question}")
        print(f"Answer: {outcome['answers'][0]}")
        print()
|
|
|
|
|
|
# Run the tutorial only when this file is executed as a script, not on import.
if __name__ == "__main__":
    tutorial12_lfqa()
|
|
|
|
|
|
# This Haystack script was made with love by deepset in Berlin, Germany
|
|
# Haystack: https://github.com/deepset-ai/haystack
|
|
# deepset: https://deepset.ai/ |