Mirror of https://github.com/deepset-ai/haystack.git (synced 2025-12-05 03:17:31 +00:00)

Commit d32fd12aed (parent 6f8a015145)

Move and update tutorial. Improve logging
@@ -27,7 +27,7 @@ def write_documents_to_db(document_dir, clean_func=None):
         db.session.add(doc)
         db.session.commit()
         n_docs += 1
-    logger.info(f"Wrote {n_docs} to DB")
+    logger.info(f"Wrote {n_docs} docs to DB")


 def fetch_archive_from_http(url, output_dir, proxies=None):
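For orientation, here is a minimal sketch of how the surrounding write_documents_to_db function might look after this change. The file iteration, the Document model, and the db session are assumptions inferred from the snippet above (stand-ins for the app's SQLAlchemy model and session), not the library's exact code:

```python
import logging
from pathlib import Path

logger = logging.getLogger(__name__)

def write_documents_to_db(document_dir, clean_func=None):
    """Hypothetical sketch: read text files, optionally clean them, persist them."""
    n_docs = 0
    for path in Path(document_dir).glob("**/*.txt"):
        text = path.read_text()
        if clean_func:
            # The cleaning hook takes a str and must return a str.
            text = clean_func(text)
        doc = Document(name=path.name, text=text)  # Document model is assumed
        db.session.add(doc)                        # db session is assumed
        db.session.commit()
        n_docs += 1
    logger.info(f"Wrote {n_docs} docs to DB")
```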
@@ -66,7 +66,7 @@ class TfidfRetriever(BaseRetriever):
                 Paragraph(document_id=doc.id, paragraph_id=p_id, text=(p,))
             )
             p_id += 1
-        logger.info(f"Found {len(documents)} candidates in DB")
+        logger.info(f"Found {len(paragraphs)} candidate passages from {len(documents)} docs in DB")
         return paragraphs

     def retrieve(self, query, candidate_doc_ids=None, top_k=10):
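The retrieve method above takes a free-text query and returns the top-k best-scoring paragraphs. To make the mechanism concrete, here is a self-contained TF-IDF retrieval sketch using scikit-learn rather than Haystack's own implementation; the toy corpus and function name are invented for the example:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Illustrative corpus; in Haystack this would be the paragraphs from the DB.
paragraphs = [
    "Arya Stark is the daughter of Eddard Stark.",
    "Daenerys Targaryen is the Mother of Dragons.",
    "Jon Snow serves in the Night's Watch.",
]

vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(paragraphs)  # one row per paragraph

def retrieve(query, top_k=2):
    # Score the query against every paragraph and return the best matches.
    query_vec = vectorizer.transform([query])
    scores = cosine_similarity(query_vec, matrix).ravel()
    ranked = scores.argsort()[::-1][:top_k]
    return [paragraphs[i] for i in ranked]

print(retrieve("Who is the father of Arya Stark?"))
```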
@@ -2,23 +2,13 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {
     "pycharm": {
      "is_executing": false
     }
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/11/2019 18:43:25 - INFO - root - Using localhost sqlite as the database backend. as Database not configured. Add a qa_config.py file in the Python path with DATABASE_URL set.Continuing with the default sqlite on localhost.\n",
-      "I1111 18:43:25.533496 140652943304512 file_utils.py:39] PyTorch version 1.3.0 available.\n",
-      "I1111 18:43:25.611577 140652943304512 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from farm_haystack.reader.adaptive_model import FARMReader\n",
     "from farm_haystack.retriever.tfidf import TfidfRetriever\n",
@@ -28,66 +18,77 @@
     "from farm_haystack.utils import print_answers"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Task: Question Answering for Game of Thrones\n",
+    "\n",
+    "<img style=\"float: right;\" src=\"https://upload.wikimedia.org/wikipedia/en/d/d8/Game_of_Thrones_title_card.jpg\">\n",
+    "\n",
+    "Question Answering can be used in a variety of use cases. A very common one: using it to navigate through complex knowledge bases or long documents (\"search setting\").\n",
+    "\n",
+    "A \"knowledge base\" could for example be your website, an internal wiki or a collection of financial reports. \n",
+    "In this tutorial we will work on a slightly different domain: \"Game of Thrones\". \n",
+    "\n",
+    "Let's see how we can use a bunch of Wikipedia articles to answer a variety of questions about the \n",
+    "marvellous seven kingdoms... \n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Indexing & cleaning documents"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {
     "pycharm": {
      "is_executing": false
     }
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "I1111 18:43:25.834958 140652943304512 io.py:46] Fetching from https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip to `data/article_txt_got`\n",
-      "\r",
-      "  0%|          | 0/1167348 [00:00<?, ?B/s]\r",
-      "100%|██████████| 1167348/1167348 [00:00<00:00, 13662984.26B/s]\n",
-      "I1111 18:43:26.548232 140652943304512 io.py:31] Wrote 517 to DB\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Init a database (default: sqlite)\n",
     "from farm_haystack.database import db\n",
     "db.create_all()\n",
     "\n",
     "# Let's first get some documents that we want to query\n",
-    "# Here: Wikipedia articles for Game of Thrones\n",
+    "\n",
+    "# Here: 517 Wikipedia articles for Game of Thrones\n",
     "doc_dir = \"data/article_txt_got\"\n",
     "s3_url = \"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip\"\n",
     "\n",
     "fetch_archive_from_http(url=s3_url, output_dir=doc_dir)\n",
     "\n",
-    "# We supply a function to clean the docs and write them afterwards to our DB\n",
+    "# Now, let's write the docs to our DB. \n",
+    "# You can supply a cleaning/transformation function that is applied to each doc (e.g. to remove headers)\n",
+    "# It must take a str as input, and return a str.\n",
+    "# Our 517 documents also get split into 2068 smaller paragraphs here. \n",
     "write_documents_to_db(document_dir=doc_dir, clean_func=clean_wiki_text)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "## Initialize Reader, Retriever & Finder"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {
     "pycharm": {
      "is_executing": false,
      "name": "#%%\n"
     }
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "I1111 18:43:26.589386 140652943304512 tfidf.py:68] Found 517 candidates in DB\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# A retriever identifies the k most promising chunks of text that might contain the answer for our question\n",
     "# Retrievers use some simple but fast algorithm, here: TF-IDF\n",
     "\n",
     "retriever = TfidfRetriever()"
    ]
   },
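The new comments above pin down the clean_func contract: it is applied to each document, takes a str, and returns a str. As a concrete illustration, here is a hypothetical cleaning function; the regex patterns are made up for the sketch and are not what clean_wiki_text actually does:

```python
import re

def clean_custom_text(text: str) -> str:
    # Drop wiki-style section markers such as "== Plot ==" (illustrative pattern).
    text = re.sub(r"^=+.*?=+$", "", text, flags=re.MULTILINE)
    # Collapse the runs of blank lines left behind by the removal.
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()

# Plugged in exactly like clean_wiki_text in the tutorial cell:
# write_documents_to_db(document_dir=doc_dir, clean_func=clean_custom_text)
```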
@@ -117,10 +118,16 @@
    "outputs": [],
    "source": [
     "# The Finder sticks together reader and retriever in a pipeline to answer our actual questions \n",
     "\n",
     "finder = Finder(reader, retriever)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Voilà! Ask a question!"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
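Conceptually, the Finder created above chains the two stages: the retriever first narrows the whole corpus down to top_k_retriever candidate passages with cheap TF-IDF scoring, and only those are handed to the slower neural reader. A rough sketch of that control flow, with the retrieve and predict method names assumed for illustration rather than taken from the library:

```python
class Finder:
    """Minimal sketch of a two-stage QA pipeline; not deepset's implementation."""

    def __init__(self, reader, retriever):
        self.reader = reader
        self.retriever = retriever

    def get_answers(self, question, top_k_reader=10, top_k_retriever=5):
        # Stage 1: fast, shallow scoring selects a few candidate passages.
        candidates = self.retriever.retrieve(question, top_k=top_k_retriever)
        # Stage 2: the expensive neural reader extracts answer spans
        # only from those candidates (predict() signature is assumed).
        return self.reader.predict(question, candidates, top_k=top_k_reader)
```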
@@ -131,11 +138,17 @@
    },
    "outputs": [],
    "source": [
-    "# Voilà! Ask a question!\n",
-    "\n",
-    "#prediction = finder.get_answers(question=\"Who created the Dothraki vocabulary?\", top_k_reader=5)\n",
-    "#prediction = finder.get_answers(question=\"Who is the sister of Sansa?\", top_k_reader=5)\n",
-    "prediction = finder.get_answers(question=\"Who is the father of Arya Stark?\", top_k_reader=3, top_k_retriever=5)"
+    "prediction = finder.get_answers(question=\"Who is the father of Arya Stark?\", top_k_reader=10, top_k_retriever=5)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#prediction = finder.get_answers(question=\"Who created the Dothraki vocabulary?\", top_k_reader=5)\n",
+    "#prediction = finder.get_answers(question=\"Who is the sister of Sansa?\", top_k_reader=5)"
+   ]
+  },
   {
@@ -151,19 +164,6 @@
    "source": [
     "print_answers(prediction, details=\"minimal\")"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "pycharm": {
-     "name": "#%%\n"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "\n"
-   ]
-  }
  ],
  "metadata": {
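print_answers(prediction, details="minimal") pretty-prints the result. If you prefer to post-process it yourself, prediction is a plain dict; the key names below are assumptions based on typical Haystack output and may differ in this version:

```python
# Hypothetical inspection of the prediction dict; key names are assumed.
for answer in prediction["answers"]:
    print(f"answer:  {answer.get('answer')!r}")
    print(f"score:   {answer.get('score')}")
    print(f"context: {answer.get('context')}")
```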
@@ -182,7 +182,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.4"
+   "version": "3.7.3"
   },
   "pycharm": {
    "stem_cell": {