diff --git a/farm_haystack/indexing/io.py b/farm_haystack/indexing/io.py index ea00db2c9..a9dd70c56 100644 --- a/farm_haystack/indexing/io.py +++ b/farm_haystack/indexing/io.py @@ -27,7 +27,7 @@ def write_documents_to_db(document_dir, clean_func=None): db.session.add(doc) db.session.commit() n_docs += 1 - logger.info(f"Wrote {n_docs} to DB") + logger.info(f"Wrote {n_docs} docs to DB") def fetch_archive_from_http(url, output_dir, proxies=None): diff --git a/farm_haystack/retriever/tfidf.py b/farm_haystack/retriever/tfidf.py index a7745557f..d371f84af 100644 --- a/farm_haystack/retriever/tfidf.py +++ b/farm_haystack/retriever/tfidf.py @@ -66,7 +66,7 @@ class TfidfRetriever(BaseRetriever): Paragraph(document_id=doc.id, paragraph_id=p_id, text=(p,)) ) p_id += 1 - logger.info(f"Found {len(documents)} candidates in DB") + logger.info(f"Found {len(paragraphs)} candidate passages from {len(documents)} docs in DB") return paragraphs def retrieve(self, query, candidate_doc_ids=None, top_k=10): diff --git a/tutorial.ipynb b/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb similarity index 66% rename from tutorial.ipynb rename to tutorials/Tutorial1_Basic_QA_Pipeline.ipynb index 365486227..f186af2ab 100644 --- a/tutorial.ipynb +++ b/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb @@ -2,23 +2,13 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "pycharm": { "is_executing": false } }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "11/11/2019 18:43:25 - INFO - root - Using localhost sqlite as the database backend. as Database not configured. Add a qa_config.py file in the Python path with DATABASE_URL set.Continuing with the default sqlite on localhost.\n", - "I1111 18:43:25.533496 140652943304512 file_utils.py:39] PyTorch version 1.3.0 available.\n", - "I1111 18:43:25.611577 140652943304512 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .\n" - ] - } - ], + "outputs": [], "source": [ "from farm_haystack.reader.adaptive_model import FARMReader\n", "from farm_haystack.retriever.tfidf import TfidfRetriever\n", @@ -28,66 +18,77 @@ "from farm_haystack.utils import print_answers" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Task: Question Answering for Game of Thrones\n", + "\n", + "\n", + "\n", + "Question Answering can be used in a variety of use cases. A very common one: Using it to navigate through complex knowledge bases or long documents (\"search setting\").\n", + "\n", + "A \"knowledge base\" could for example be your website, an internal wiki or a collection of financial reports. \n", + "In this tutorial we will work on a slightly different domain: \"Game of Thrones\". \n", + "\n", + "Let's see how we can use a bunch of wikipedia articles to answer a variety of questions about the \n", + "marvellous seven kingdoms... \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Indexing & cleaning documents" + ] + }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "pycharm": { "is_executing": false } }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "I1111 18:43:25.834958 140652943304512 io.py:46] Fetching from https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip to `data/article_txt_got`\n", - "\r", - " 0%| | 0/1167348 [00:00