From c52266e52037bd3757bb74fef3b7d57ffcbe5660 Mon Sep 17 00:00:00 2001
From: Tanay Soni
Date: Thu, 23 Jan 2020 15:18:41 +0100
Subject: [PATCH] Update tutorials (#12)

Co-authored-by: Malte Pietsch
---
 haystack/api/inference.py                     | 22 ++++-------
 tutorials/Tutorial1_Basic_QA_Pipeline.ipynb   | 39 ++++++++++---------
 tutorials/Tutorial1_Basic_QA_Pipeline.py      | 39 ++++++++++---------
 ...Tutorial2_Finetune_a_model_on_your_data.py | 34 +++++++++-------
 4 files changed, 69 insertions(+), 65 deletions(-)

diff --git a/haystack/api/inference.py b/haystack/api/inference.py
index 7768d41ce..1ef31b122 100644
--- a/haystack/api/inference.py
+++ b/haystack/api/inference.py
@@ -4,9 +4,9 @@ from fastapi import FastAPI, HTTPException
 import logging
 from haystack import Finder
-from haystack.database import app
 from haystack.reader.farm import FARMReader
 from haystack.retriever.tfidf import TfidfRetriever
+from haystack.database.sql import SQLDocumentStore
 from pydantic import BaseModel
 from typing import List, Dict
@@ -19,25 +19,19 @@ logger = logging.getLogger(__name__)
 MODELS_DIRS = ["saved_models", "models", "model"]
 USE_GPU = False
 BATCH_SIZE = 16
+DATABASE_URL = "sqlite:///qa.db"
+MODEL_PATHS = ['deepset/bert-base-cased-squad2']
 app = FastAPI(title="Haystack API", version="0.1")
-#############################################
-# Load all models in memory
-#############################################
-model_paths = []
-for model_dir in MODELS_DIRS:
-    path = Path(model_dir)
-    if path.is_dir():
-        models = [f for f in path.iterdir() if f.is_dir()]
-        model_paths.extend(models)
+if len(MODEL_PATHS) == 0:
+    logger.error(f"No model to load. Please specify one via MODEL_PATHS (e.g. ['deepset/bert-base-cased-squad2'])")
-if len(model_paths) == 0:
-    logger.error(f"Could not find any model to load. Checked folders: {MODELS_DIRS}")
+datastore = SQLDocumentStore(url=DATABASE_URL)
+retriever = TfidfRetriever(datastore=datastore)
-retriever = TfidfRetriever()
 FINDERS = {}
-for idx, model_dir in enumerate(model_paths, start=1):
+for idx, model_dir in enumerate(MODEL_PATHS, start=1):
     reader = FARMReader(model_name_or_path=str(model_dir), batch_size=BATCH_SIZE, use_gpu=USE_GPU)
     FINDERS[idx] = Finder(reader, retriever)
     logger.info(f"Initialized Finder (ID={idx}) with model '{model_dir}'")
diff --git a/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb b/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb
index b989a8208..f4839e7ea 100644
--- a/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb
+++ b/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb
@@ -40,11 +40,13 @@
 },
 "outputs": [],
 "source": [
- "from haystack.reader.farm import FARMReader\n",
- "from haystack.retriever.tfidf import TfidfRetriever\n",
 "from haystack import Finder\n",
- "from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http\n",
+ "from haystack.database.sql import SQLDocumentStore\n",
 "from haystack.indexing.cleaning import clean_wiki_text\n",
+ "from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http\n",
+ "from haystack.reader.farm import FARMReader\n",
+ "from haystack.reader.transformers import TransformersReader\n",
+ "from haystack.retriever.tfidf import TfidfRetriever\n",
 "from haystack.utils import print_answers"
 ]
 },
@@ -75,20 +77,21 @@
 }
 ],
 "source": [
- "# Init a database (default: sqllite)\n",
- "from haystack.database import db\n",
- "db.create_all()\n",
- "\n",
 "# Let's first get some documents that we want to query\n",
 "# Here: 517 Wikipedia articles for Game of Thrones\n",
 "doc_dir = \"data/article_txt_got\"\n",
 "s3_url = \"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip\"\n",
 "fetch_archive_from_http(url=s3_url, output_dir=doc_dir)\n",
 "\n",
- "# Now, let's write the docs to our DB. \n",
- "# You can supply a cleaning function that is applied to each doc (e.g. to remove footers)\n",
+ "# The documents can be stored in different types of \"DocumentStores\".\n",
+ "# For dev we suggest a light-weight SQL DB\n",
+ "# For production we suggest elasticsearch\n",
+ "datastore = SQLDocumentStore(url=\"sqlite:///qa.db\")\n",
+ "\n",
+ "# Now, let's write the docs to our DB.\n",
+ "# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)\n",
 "# It must take a str as input, and return a str.\n",
- "write_documents_to_db(document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True)"
+ "write_documents_to_db(datastore=datastore, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True)"
 ]
 },
 {
@@ -119,7 +122,7 @@
 "source": [
 "# A retriever identifies the k most promising chunks of text that might contain the answer for our question\n",
 "# Retrievers use some simple but fast algorithm, here: TF-IDF\n",
- "retriever = TfidfRetriever()"
+ "retriever = TfidfRetriever(datastore=datastore)"
 ]
 },
 {
@@ -143,13 +146,13 @@
 ],
 "source": [
 "# A reader scans the text chunks in detail and extracts the k best answers\n",
- "# Reader use more powerful but slower deep learning models, here: a BERT QA model trained via FARM on Squad 2.0\n",
- "from haystack.indexing.io import fetch_archive_from_http\n",
- "fetch_archive_from_http(url=\"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-models/0.3.0/bert-english-qa-large.tar.gz\", output_dir=\"model\")\n",
- "reader = FARMReader(model_dir=\"model/bert-english-qa-large\", use_gpu=False)\n",
+ "# Readers use more powerful but slower deep learning models\n",
+ "# You can select a local model or any of the QA models published on huggingface's model hub (https://huggingface.co/models)\n",
+ "# here: a medium-sized BERT QA model trained via FARM on Squad 2.0\n",
+ "reader = FARMReader(model_name_or_path=\"deepset/bert-base-cased-squad2\", use_gpu=False)\n",
 "\n",
- "# OR: use alternatively a reader from huggingface's Transformers package\n",
- "# reader = TransformersReader(use_gpu=-1)"
+ "# OR: alternatively use a reader from huggingface's transformers package (https://github.com/huggingface/transformers)\n",
+ "# reader = TransformersReader(model=\"distilbert-base-uncased-distilled-squad\", tokenizer=\"distilbert-base-uncased\", use_gpu=-1)"
 ]
 },
 {
@@ -162,7 +165,7 @@
 },
 "outputs": [],
 "source": [
- "# The Finder sticks together retriever and retriever in a pipeline to answer our actual questions \n",
+ "# The Finder sticks together reader and retriever in a pipeline to answer our actual questions\n",
 "finder = Finder(reader, retriever)"
 ]
 },
diff --git a/tutorials/Tutorial1_Basic_QA_Pipeline.py b/tutorials/Tutorial1_Basic_QA_Pipeline.py
index 31d3b428e..410d118e9 100755
--- a/tutorials/Tutorial1_Basic_QA_Pipeline.py
+++ b/tutorials/Tutorial1_Basic_QA_Pipeline.py
@@ -1,16 +1,14 @@
+from haystack import Finder
+from haystack.database.sql import SQLDocumentStore
+from haystack.indexing.cleaning import clean_wiki_text
+from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
 from haystack.reader.farm import FARMReader
 from haystack.reader.transformers import TransformersReader
 from haystack.retriever.tfidf import TfidfRetriever
-from haystack import Finder
-from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
-from haystack.indexing.cleaning import clean_wiki_text
 from haystack.utils import print_answers
 ## Indexing & cleaning documents
-# Init a database (default: sqllite)
-from haystack.database import db
-db.create_all()
 # Let's first get some documents that we want to query
 # Here: 517 Wikipedia articles for Game of Thrones
@@ -18,25 +16,31 @@ doc_dir = "data/article_txt_got"
 s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
 fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
-# Now, let's write the docs to our DB. 
-# You can supply a cleaning function that is applied to each doc (e.g. to remove footers)
-# It must take a str as input, and return a str.
-write_documents_to_db(document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True)
+# The documents can be stored in different types of "DocumentStores".
+# For dev we suggest a light-weight SQL DB
+# For production we suggest elasticsearch
+datastore = SQLDocumentStore(url="sqlite:///qa.db")
+
+# Now, let's write the docs to our DB.
+# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
+# It must take a str as input, and return a str.
+write_documents_to_db(datastore=datastore, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True)
 ## Initalize Reader, Retriever & Finder
 # A retriever identifies the k most promising chunks of text that might contain the answer for our question
 # Retrievers use some simple but fast algorithm, here: TF-IDF
-retriever = TfidfRetriever()
+retriever = TfidfRetriever(datastore=datastore)
 # A reader scans the text chunks in detail and extracts the k best answers
-# Reader use more powerful but slower deep learning models, here: a BERT QA model trained via FARM on Squad 2.0
-fetch_archive_from_http(url="https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-models/0.3.0/bert-english-qa-large.tar.gz", output_dir="model")
-reader = FARMReader(model_name_or_path="model/bert-english-qa-large", use_gpu=False)
+# Readers use more powerful but slower deep learning models
+# You can select a local model or any of the QA models published on huggingface's model hub (https://huggingface.co/models)
+# here: a medium-sized BERT QA model trained via FARM on Squad 2.0
+reader = FARMReader(model_name_or_path="deepset/bert-base-cased-squad2", use_gpu=False)
-# OR: use alternatively a reader from huggingface's Transformers package
-# reader = TransformersReader(use_gpu=-1)
+# OR: alternatively use a reader from huggingface's transformers package (https://github.com/huggingface/transformers)
+# reader = TransformersReader(model="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)
 # The Finder sticks together retriever and retriever in a pipeline to answer our actual questions
 finder = Finder(reader, retriever)
@@ -50,6 +54,3 @@ prediction = finder.get_answers(question="Who is the father of Arya Stark?", top
 #prediction = finder.get_answers(question="Who is the sister of Sansa?", top_k_reader=5)
 print_answers(prediction, details="minimal")
-
-
-
diff --git a/tutorials/Tutorial2_Finetune_a_model_on_your_data.py b/tutorials/Tutorial2_Finetune_a_model_on_your_data.py
index ff3b1550d..9345cc21c 100755
--- a/tutorials/Tutorial2_Finetune_a_model_on_your_data.py
+++ b/tutorials/Tutorial2_Finetune_a_model_on_your_data.py
@@ -1,9 +1,10 @@
-from haystack.reader.farm import FARMReader
-from haystack.reader.transformers import TransformersReader
-from haystack.retriever.tfidf import TfidfRetriever
+
 from haystack import Finder
-from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
+from haystack.database.sql import SQLDocumentStore
 from haystack.indexing.cleaning import clean_wiki_text
+from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
+from haystack.reader.farm import FARMReader
+from haystack.retriever.tfidf import TfidfRetriever
 from haystack.utils import print_answers
 #### TRAINING #############
@@ -11,26 +12,31 @@ from haystack.utils import print_answers
 reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=False)
 # and fine-tune it on your own custom dataset (should be in SQuAD like format)
-reader.train(data_dir="../data/squad_small", train_filename="train.json", use_gpu=False, n_epochs=1)
+train_data = "PATH/TO_YOUR/TRAIN_DATA"
+reader.train(data_dir=train_data, train_filename="train.json", use_gpu=False, n_epochs=1)
 #### Use it (same as in Tutorial 1) #############
-# Okay, we have a fine-tuned model now. Let's test it on some docs:
-## Let's get some docs for testing (see Tutorial 1 for more explanations)
-from haystack.database import db
-db.create_all()
+## Indexing & cleaning documents
-# Download docs
+# Let's get the data (Game of thrones articles from wikipedia)
 doc_dir = "data/article_txt_got"
 s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
 fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
-# Write docs to our DB.
-write_documents_to_db(document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True)
-# Initialize Finder Pipeline
-retriever = TfidfRetriever()
+# Init Document store & write docs to it
+datastore = SQLDocumentStore(url="sqlite:///qa.db")
+write_documents_to_db(datastore=datastore, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True)
+
+## Initialize Reader, Retriever & Finder
+
+# A retriever identifies the k most promising chunks of text that might contain the answer for our question
+# Retrievers use some simple but fast algorithm, here: TF-IDF
+retriever = TfidfRetriever(datastore=datastore)
+
+# The Finder sticks together reader and retriever in a pipeline to answer our actual questions
 finder = Finder(reader, retriever)
 ## Voilá! Ask a question!
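Note for reviewers: the end-to-end flow that Tutorial 1 arrives at after this patch looks roughly like the sketch below. It only reuses imports, classes, and arguments that appear in the diff above (the question and top_k value are illustrative), so treat it as a sketch rather than an exact excerpt of the tutorial.

# Sketch of the post-patch Tutorial 1 flow; names and arguments are taken from the diff above.
from haystack import Finder
from haystack.database.sql import SQLDocumentStore
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.retriever.tfidf import TfidfRetriever
from haystack.utils import print_answers

# Fetch the sample Game of Thrones articles and index them into a local SQLite-backed store
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
datastore = SQLDocumentStore(url="sqlite:///qa.db")
write_documents_to_db(datastore=datastore, document_dir=doc_dir,
                      clean_func=clean_wiki_text, only_empty_db=True)

# TF-IDF retriever + FARM reader, glued together by a Finder
retriever = TfidfRetriever(datastore=datastore)
reader = FARMReader(model_name_or_path="deepset/bert-base-cased-squad2", use_gpu=False)
finder = Finder(reader, retriever)

# Ask a question (top_k_reader value here is illustrative)
prediction = finder.get_answers(question="Who is the father of Arya Stark?", top_k_reader=5)
print_answers(prediction, details="minimal")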