Adding Example Scripts to Haystack (#3588)

* add 2 example scripts

* fixing faq script

* updating PR based on comments

* black

* updating s3 buckets

* first attempt at testing

* Add basic tests to two scripts

PR: #3588

* make tests runnable

* reformat files

* only run in PRs touching an example

Co-authored-by: bilgeyucel <bilgeyucel96@gmail.com>
Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
Tuana Celik 2023-01-27 14:54:59 +01:00 committed by GitHub
parent f6a99b6ebc
commit e1502c8029
6 changed files with 229 additions and 0 deletions

.github/workflows/examples-tests.yml (new file, 40 lines)

@@ -0,0 +1,40 @@
name: Examples tests

on:
  workflow_dispatch: # Activate this workflow manually
  push:
    branches:
      - main
  pull_request:
    paths:
      - examples/**
    types:
      - opened
      - reopened
      - synchronize
      - ready_for_review

env:
  SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}

jobs:
  tests:
    name: Examples
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Setup Python
        uses: ./.github/actions/python_cache/

      - name: Install Haystack
        run: pip install .[all]

      - name: Run
        run: pytest examples/

      - uses: act10ns/slack@v1
        with:
          status: ${{ job.status }}
          channel: '#haystack'
        if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main'

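To reproduce this CI job locally, here is a minimal sketch, not part of the PR itself; it assumes Haystack was installed from the repository root with pip install .[all] and that the repository root is the working directory, mirroring the workflow's steps above:

# Local stand-in for the workflow's "Run" step (illustrative only).
import sys

import pytest

# Same test selection as the CI job: everything under examples/.
sys.exit(pytest.main(["examples/"]))
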
examples/__init__.py (new, empty file)

examples/basic_faq_pipeline.py (new file, 71 lines)

@@ -0,0 +1,71 @@
import logging

logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import EmbeddingRetriever
from haystack.nodes.other.docs2answers import Docs2Answers
from haystack.utils import launch_es, print_answers, fetch_archive_from_http
import pandas as pd
from haystack.pipelines import Pipeline


def basic_faq_pipeline():
    # Start Elasticsearch (requires Docker)
    launch_es()

    # Initialize the DocumentStore; question embeddings are stored in a dedicated field
    document_store = ElasticsearchDocumentStore(
        host="localhost",
        username="",
        password="",
        index="document",
        embedding_field="question_emb",
        embedding_dim=384,
        excluded_meta_data=["question_emb"],
        similarity="cosine",
    )

    retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model="sentence-transformers/all-MiniLM-L6-v2",
        use_gpu=True,
        scale_score=False,
    )

    # Turns retrieved FAQ documents into Answer objects
    doc_to_answers = Docs2Answers()

    # Fetch the FAQ dataset
    doc_dir = "data/basic_faq_pipeline"
    s3_url = "https://core-engineering.s3.eu-central-1.amazonaws.com/public/scripts/small_faq_covid.csv1.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    df = pd.read_csv(f"{doc_dir}/small_faq_covid.csv")

    # Minimal cleaning
    df.fillna(value="", inplace=True)
    df["question"] = df["question"].apply(lambda x: x.strip())
    print(df.head())

    # Get embeddings for our questions from the FAQs
    questions = list(df["question"].values)
    df["question_emb"] = retriever.embed_queries(queries=questions).tolist()
    df = df.rename(columns={"question": "content"})

    # Convert Dataframe to list of dicts and index them in our DocumentStore
    docs_to_index = df.to_dict(orient="records")
    document_store.write_documents(docs_to_index)

    # Initialize a Pipeline (this time without a reader) and ask questions
    pipeline = Pipeline()
    pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
    pipeline.add_node(component=doc_to_answers, name="Docs2Answers", inputs=["Retriever"])

    # Ask a question
    prediction = pipeline.run(query="How is the virus spreading?", params={"Retriever": {"top_k": 10}})

    print_answers(prediction, details="medium")
    return prediction


if __name__ == "__main__":
    basic_faq_pipeline()

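A short usage sketch for the script above, illustrative and not part of the PR; it assumes Docker is available so that launch_es() can start Elasticsearch on localhost:

from examples.basic_faq_pipeline import basic_faq_pipeline

# Indexes the FAQ CSV, then matches the query against the stored
# question embeddings and returns the prediction dict.
prediction = basic_faq_pipeline()

# Each match comes back as a haystack.schema.Answer; the accompanying
# test expects scores between 0 and 1.
for ans in prediction["answers"]:
    print(f"{ans.score:.3f}  {ans.answer[:60]}")
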
examples/basic_qa_pipeline.py (new file, 76 lines)

@@ -0,0 +1,76 @@
import logging
from pathlib import Path

logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

from haystack.document_stores import ElasticsearchDocumentStore
from haystack.utils import fetch_archive_from_http, print_answers, launch_es
from haystack.nodes import FARMReader, BM25Retriever
from haystack.nodes.file_classifier import FileTypeClassifier
from haystack.nodes.preprocessor import PreProcessor
from haystack.nodes.file_converter import TextConverter
from haystack.pipelines import Pipeline


def basic_qa_pipeline():
    # Launch Elasticsearch
    launch_es()

    # Initialize a DocumentStore
    document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")

    # Fetch, pre-process and write documents
    doc_dir = "data/basic_qa_pipeline"
    s3_url = "https://core-engineering.s3.eu-central-1.amazonaws.com/public/scripts/wiki_gameofthrones_txt1.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    file_paths = [p for p in Path(doc_dir).glob("**/*")]
    files_metadata = [{"name": path.name} for path in file_paths]

    # Indexing Pipeline
    indexing_pipeline = Pipeline()

    # Makes sure the file is a TXT file (FileTypeClassifier node)
    classifier = FileTypeClassifier()
    indexing_pipeline.add_node(classifier, name="Classifier", inputs=["File"])

    # Converts a file into text and performs basic cleaning (TextConverter node)
    text_converter = TextConverter(remove_numeric_tables=True)
    indexing_pipeline.add_node(text_converter, name="Text_converter", inputs=["Classifier.output_1"])

    # Pre-processes the text by performing splits and adding metadata to the text (PreProcessor node)
    preprocessor = PreProcessor(
        clean_whitespace=True,
        clean_empty_lines=True,
        split_length=100,
        split_overlap=50,
        split_respect_sentence_boundary=True,
    )
    indexing_pipeline.add_node(preprocessor, name="Preprocessor", inputs=["Text_converter"])

    # Writes the resulting documents into the document store
    indexing_pipeline.add_node(document_store, name="Document_Store", inputs=["Preprocessor"])

    # Then we run it with the documents and their metadata as input
    indexing_pipeline.run(file_paths=file_paths, meta=files_metadata)

    # Initialize Retriever & Reader
    retriever = BM25Retriever(document_store=document_store)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

    # Query Pipeline
    pipeline = Pipeline()
    pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
    pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"])

    prediction = pipeline.run(
        query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
    )

    print_answers(prediction, details="minimum")
    return prediction


if __name__ == "__main__":
    basic_qa_pipeline()

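Likewise, a minimal usage sketch for the QA script, again illustrative and assuming a local Elasticsearch instance:

from examples.basic_qa_pipeline import basic_qa_pipeline

# Indexes the Game of Thrones text files, then runs an extractive QA query.
prediction = basic_qa_pipeline()

# The reader returns extracted answer spans; the retrieved documents are
# included as well, each carrying the file name attached at indexing time.
print(prediction["answers"][0].answer)
for doc in prediction["documents"]:
    print(doc.meta["name"], round(doc.score, 3))
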
examples/test_basic_faq_pipeline.py (new file, 19 lines)

@@ -0,0 +1,19 @@
from examples.basic_faq_pipeline import basic_faq_pipeline

from haystack.schema import Answer, Document


def test_basic_faq_pipeline():
    prediction = basic_faq_pipeline()

    assert prediction is not None
    assert prediction["query"] == "How is the virus spreading?"

    assert len(prediction["answers"]) == 10  # top-k of Retriever
    assert type(prediction["answers"][0]) == Answer
    assert (
        prediction["answers"][0].answer
        == """This virus was first detected in Wuhan City, Hubei Province, China. The first infections were linked to a live animal market, but the virus is now spreading from person-to-person. Its important to note that person-to-person spread can happen on a continuum. Some viruses are highly contagious (like measles), while other viruses are less so.\n\nThe virus that causes COVID-19 seems to be spreading easily and sustainably in the community (“community spread”) in some affected geographic areas. Community spread means people have been infected with the virus in an area, including some who are not sure how or where they became infected.\n\nLearn what is known about the spread of newly emerged coronaviruses."""
    )
    assert prediction["answers"][0].score <= 1
    assert prediction["answers"][0].score >= 0

examples/test_basic_qa_pipeline.py (new file, 23 lines)

@@ -0,0 +1,23 @@
from examples.basic_qa_pipeline import basic_qa_pipeline

from haystack.schema import Answer, Document


def test_basic_qa_pipeline():
    prediction = basic_qa_pipeline()

    assert prediction is not None
    assert prediction["query"] == "Who is the father of Arya Stark?"

    assert len(prediction["answers"]) == 5  # top-k of Reader
    assert type(prediction["answers"][0]) == Answer
    assert prediction["answers"][0].answer == "Ned"
    assert prediction["answers"][0].score <= 1
    assert prediction["answers"][0].score >= 0
    assert prediction["answers"][0].meta["name"] == "43_Arya_Stark.txt"

    assert len(prediction["documents"]) == 10  # top-k of Retriever
    assert type(prediction["documents"][0]) == Document
    assert prediction["documents"][0].score <= 1
    assert prediction["documents"][0].score >= 0
    assert prediction["documents"][0].meta["name"] == "450_Baelor.txt"
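
The assertions in both tests rely on public fields of Haystack's Answer and Document classes. A toy sketch of those fields, for illustration only; the values below are made up to mirror the expected test results:

from haystack.schema import Answer, Document

# Fields exercised by the tests: Answer.answer, Answer.score, Answer.meta,
# Document.score, Document.meta.
ans = Answer(answer="Ned", score=0.93, meta={"name": "43_Arya_Stark.txt"})
print(ans.answer, ans.score, ans.meta["name"])

doc = Document(content="Eddard 'Ned' Stark is the head of House Stark.", meta={"name": "450_Baelor.txt"})
print(doc.meta["name"])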