# haystack/tutorials/Tutorial14_Query_Classifier.py

from haystack.utils import (
fetch_archive_from_http,
convert_files_to_docs,
clean_wiki_text,
launch_es,
print_answers,
print_documents,
)
from haystack.pipelines import Pipeline
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import (
BM25Retriever,
EmbeddingRetriever,
FARMReader,
TransformersQueryClassifier,
SklearnQueryClassifier,
)


def tutorial14_query_classifier():
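    """Route queries with query classifiers: a sklearn-based and a transformer-based
    keyword vs. question/statement classifier decide between a sparse (BM25) and a
    dense (embedding) retriever, and a question vs. statement classifier decides
    whether to run the reader at all."""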
# Download and prepare data - 517 Wikipedia articles for Game of Thrones
doc_dir = "data/tutorial14"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt14.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
    # Convert files to Documents that can be written to our DocumentStore
got_docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
# Initialize DocumentStore and index documents
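    # launch_es() starts a local Elasticsearch instance via Docker if one is not already running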
launch_es()
document_store = ElasticsearchDocumentStore()
document_store.delete_documents()
document_store.write_documents(got_docs)
    # Initialize the sparse retriever (BM25 on Elasticsearch)
bm25_retriever = BM25Retriever(document_store=document_store)
    # Initialize the dense retriever (sentence-transformers embeddings)
embedding_retriever = EmbeddingRetriever(
document_store=document_store, embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1"
)
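    # Compute embeddings only for documents that do not have one yet
    # (update_existing_embeddings=False leaves already-embedded documents untouched)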
document_store.update_embeddings(embedding_retriever, update_existing_embeddings=False)
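    # The reader extracts answer spans from whichever retriever's documents it receives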
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
print()
print("Sklearn keyword classifier")
print("==========================")
# Here we build the pipeline
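    # The classifier's output_1 (question/statement queries) feeds the dense EmbeddingRetriever,
    # while output_2 (keyword queries) feeds the sparse ES retriever; both branches
    # converge on the same reader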
sklearn_keyword_classifier = Pipeline()
sklearn_keyword_classifier.add_node(component=SklearnQueryClassifier(), name="QueryClassifier", inputs=["Query"])
sklearn_keyword_classifier.add_node(
component=embedding_retriever, name="EmbeddingRetriever", inputs=["QueryClassifier.output_1"]
)
sklearn_keyword_classifier.add_node(
component=bm25_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"]
)
sklearn_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "EmbeddingRetriever"])
sklearn_keyword_classifier.draw("pipeline_classifier.png")
    # Run only the dense retriever on the full-sentence query
res_1 = sklearn_keyword_classifier.run(query="Who is the father of Arya Stark?")
print("\n===============================")
print("Embedding Retriever Results" + "\n" + "=" * 15)
print_answers(res_1, details="minimum")
    # Run only the sparse retriever on a keyword-based query
res_2 = sklearn_keyword_classifier.run(query="arya stark father")
print("\n===============================")
print("ES Results" + "\n" + "=" * 15)
print_answers(res_2, details="minimum")
    # Run only the dense retriever on the full-sentence query
res_3 = sklearn_keyword_classifier.run(query="which country was jon snow filmed ?")
print("\n===============================")
print("Embedding Retriever Results" + "\n" + "=" * 15)
print_answers(res_3, details="minimum")
    # Run only the sparse retriever on a keyword-based query
res_4 = sklearn_keyword_classifier.run(query="jon snow country")
print("\n===============================")
print("ES Results" + "\n" + "=" * 15)
print_answers(res_4, details="minimum")
    # Run only the dense retriever on the full-sentence query
res_5 = sklearn_keyword_classifier.run(query="who are the younger brothers of arya stark ?")
print("\n===============================")
print("Embedding Retriever Results" + "\n" + "=" * 15)
print_answers(res_5, details="minimum")
    # Run only the sparse retriever on a keyword-based query
res_6 = sklearn_keyword_classifier.run(query="arya stark younger brothers")
print("\n===============================")
print("ES Results" + "\n" + "=" * 15)
print_answers(res_6, details="minimum")
print()
print("Transformer keyword classifier")
print("==============================")
# Here we build the pipeline
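    # TransformersQueryClassifier defaults to the "shahrukhx01/bert-mini-finetune-question-detection"
    # model, which separates keyword queries from questions/statements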
transformer_keyword_classifier = Pipeline()
transformer_keyword_classifier.add_node(
component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"]
)
transformer_keyword_classifier.add_node(
component=embedding_retriever, name="EmbeddingRetriever", inputs=["QueryClassifier.output_1"]
)
transformer_keyword_classifier.add_node(
component=bm25_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"]
)
transformer_keyword_classifier.add_node(
component=reader, name="QAReader", inputs=["ESRetriever", "EmbeddingRetriever"]
)
transformer_keyword_classifier.draw("pipeline_classifier.png")
    # Run only the dense retriever on the full-sentence query
res_1 = transformer_keyword_classifier.run(query="Who is the father of Arya Stark?")
print("\n===============================")
print("Embedding Retriever Results" + "\n" + "=" * 15)
print_answers(res_1, details="minimum")
    # Run only the sparse retriever on a keyword-based query
res_2 = transformer_keyword_classifier.run(query="arya stark father")
print("\n===============================")
print("ES Results" + "\n" + "=" * 15)
print_answers(res_2, details="minimum")
    # Run only the dense retriever on the full-sentence query
res_3 = transformer_keyword_classifier.run(query="which country was jon snow filmed ?")
print("\n===============================")
print("Embedding Retriever Results" + "\n" + "=" * 15)
print_answers(res_3, details="minimum")
    # Run only the sparse retriever on a keyword-based query
res_4 = transformer_keyword_classifier.run(query="jon snow country")
print("\n===============================")
print("ES Results" + "\n" + "=" * 15)
print_answers(res_4, details="minimum")
    # Run only the dense retriever on the full-sentence query
res_5 = transformer_keyword_classifier.run(query="who are the younger brothers of arya stark ?")
print("\n===============================")
print("Embedding Retriever Results" + "\n" + "=" * 15)
print_answers(res_5, details="minimum")
    # Run only the sparse retriever on a keyword-based query
res_6 = transformer_keyword_classifier.run(query="arya stark younger brothers")
print("\n===============================")
print("ES Results" + "\n" + "=" * 15)
print_answers(res_6, details="minimum")
print()
print("Transformer question classifier")
print("===============================")
# Here we build the pipeline
transformer_question_classifier = Pipeline()
transformer_question_classifier.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"])
transformer_question_classifier.add_node(
component=TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier"),
name="QueryClassifier",
inputs=["EmbeddingRetriever"],
)
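    # output_1 (questions) continues on to the reader; output_2 (statements) is left
    # unconnected, so statement queries stop after retrieval and return documents only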
transformer_question_classifier.add_node(component=reader, name="QAReader", inputs=["QueryClassifier.output_1"])
transformer_question_classifier.draw("question_classifier.png")
    # Run the full pipeline (retriever + reader) on a question query
    res_1 = transformer_question_classifier.run(query="Who is the father of Arya Stark?")
    print("\n===============================")
    print("Reader Results" + "\n" + "=" * 15)
    print_answers(res_1, details="minimum")
    # Run a statement query: it is routed past the reader, so we only get retrieved documents
    res_2 = transformer_question_classifier.run(query="Arya Stark was the daughter of a Lord.")
    print("\n===============================")
    print("Retriever Results" + "\n" + "=" * 15)
    print_documents(res_2)
    # Use the keyword vs question/statement classifier on its own, outside a pipeline
queries = [
"arya stark father",
"jon snow country",
"who is the father of arya stark",
"which country was jon snow filmed?",
]
keyword_classifier = TransformersQueryClassifier()
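    # Called directly, a classifier's run() returns a tuple of (output_dict, stream_name);
    # the stream name tells us which output the query was routed to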
for query in queries:
result = keyword_classifier.run(query=query)
if result[1] == "output_1":
category = "question/statement"
else:
category = "keyword"
print(f"Query: {query}, raw_output: {result}, class: {category}")
    # Use the question vs statement classifier on its own, outside a pipeline
queries = [
"Lord Eddard was the father of Arya Stark.",
"Jon Snow was filmed in United Kingdom.",
"who is the father of arya stark?",
"Which country was jon snow filmed in?",
]
question_classifier = TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier")
for query in queries:
result = question_classifier.run(query=query)
if result[1] == "output_1":
category = "question"
else:
category = "statement"
print(f"Query: {query}, raw_output: {result}, class: {category}")


if __name__ == "__main__":
tutorial14_query_classifier()
# This Haystack script was made with love by deepset in Berlin, Germany
# Haystack: https://github.com/deepset-ai/haystack
# deepset: https://deepset.ai/