Mirror of https://github.com/deepset-ai/haystack.git (synced 2025-12-05 03:17:31 +00:00)

Commit d32fd12aed (parent 6f8a015145)

Move and update tutorial. Improve logging
@@ -27,7 +27,7 @@ def write_documents_to_db(document_dir, clean_func=None):
         db.session.add(doc)
         db.session.commit()
         n_docs += 1
-    logger.info(f"Wrote {n_docs} to DB")
+    logger.info(f"Wrote {n_docs} docs to DB")


 def fetch_archive_from_http(url, output_dir, proxies=None):
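For orientation, here is a minimal sketch of how the surrounding write_documents_to_db function might look after this change. The file iteration, the Document model, and the db session are assumptions inferred from the snippet above (stand-ins for the app's SQLAlchemy model and session), not the library's exact code:

```python
import logging
from pathlib import Path

logger = logging.getLogger(__name__)

def write_documents_to_db(document_dir, clean_func=None):
    """Hypothetical sketch: read text files, optionally clean them, persist them."""
    n_docs = 0
    for path in Path(document_dir).glob("**/*.txt"):
        text = path.read_text()
        if clean_func:
            # The cleaning hook takes a str and must return a str.
            text = clean_func(text)
        doc = Document(name=path.name, text=text)  # Document model is assumed
        db.session.add(doc)                        # db session is assumed
        db.session.commit()
        n_docs += 1
    logger.info(f"Wrote {n_docs} docs to DB")
```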
@@ -66,7 +66,7 @@ class TfidfRetriever(BaseRetriever):
                 Paragraph(document_id=doc.id, paragraph_id=p_id, text=(p,))
             )
             p_id += 1
-        logger.info(f"Found {len(documents)} candidates in DB")
+        logger.info(f"Found {len(paragraphs)} candidate passages from {len(documents)} docs in DB")
         return paragraphs

     def retrieve(self, query, candidate_doc_ids=None, top_k=10):
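The retrieve method above takes a free-text query and returns the top-k best-scoring paragraphs. To make the mechanism concrete, here is a self-contained TF-IDF retrieval sketch using scikit-learn rather than Haystack's own implementation; the toy corpus and function name are invented for the example:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Illustrative corpus; in Haystack this would be the paragraphs from the DB.
paragraphs = [
    "Arya Stark is the daughter of Eddard Stark.",
    "Daenerys Targaryen is the Mother of Dragons.",
    "Jon Snow serves in the Night's Watch.",
]

vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(paragraphs)  # one row per paragraph

def retrieve(query, top_k=2):
    # Score the query against every paragraph and return the best matches.
    query_vec = vectorizer.transform([query])
    scores = cosine_similarity(query_vec, matrix).ravel()
    ranked = scores.argsort()[::-1][:top_k]
    return [paragraphs[i] for i in ranked]

print(retrieve("Who is the father of Arya Stark?"))
```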
@@ -2,23 +2,13 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {
     "pycharm": {
      "is_executing": false
     }
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "11/11/2019 18:43:25 - INFO - root - Using localhost sqlite as the database backend. as Database not configured. Add a qa_config.py file in the Python path with DATABASE_URL set.Continuing with the default sqlite on localhost.\n",
-      "I1111 18:43:25.533496 140652943304512 file_utils.py:39] PyTorch version 1.3.0 available.\n",
-      "I1111 18:43:25.611577 140652943304512 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from farm_haystack.reader.adaptive_model import FARMReader\n",
     "from farm_haystack.retriever.tfidf import TfidfRetriever\n",
@@ -28,66 +18,77 @@
     "from farm_haystack.utils import print_answers"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Task: Question Answering for Game of Thrones\n",
+    "\n",
+    "<img style=\"float: right;\" src=\"https://upload.wikimedia.org/wikipedia/en/d/d8/Game_of_Thrones_title_card.jpg\">\n",
+    "\n",
+    "Question Answering can be used in a variety of use cases. A very common one: using it to navigate through complex knowledge bases or long documents (\"search setting\").\n",
+    "\n",
+    "A \"knowledge base\" could for example be your website, an internal wiki or a collection of financial reports. \n",
+    "In this tutorial we will work on a slightly different domain: \"Game of Thrones\". \n",
+    "\n",
+    "Let's see how we can use a bunch of Wikipedia articles to answer a variety of questions about the \n",
+    "marvellous seven kingdoms... \n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Indexing & cleaning documents"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {
     "pycharm": {
      "is_executing": false
     }
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "I1111 18:43:25.834958 140652943304512 io.py:46] Fetching from https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip to `data/article_txt_got`\n",
-      "\r",
-      "  0%|          | 0/1167348 [00:00<?, ?B/s]\r",
-      "100%|██████████| 1167348/1167348 [00:00<00:00, 13662984.26B/s]\n",
-      "I1111 18:43:26.548232 140652943304512 io.py:31] Wrote 517 to DB\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Init a database (default: sqlite)\n",
     "from farm_haystack.database import db\n",
     "db.create_all()\n",
     "\n",
     "# Let's first get some documents that we want to query\n",
-    "# Here: Wikipedia articles for Game of Thrones\n",
+    "\n",
+    "# Here: 517 Wikipedia articles for Game of Thrones\n",
     "doc_dir = \"data/article_txt_got\"\n",
     "s3_url = \"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip\"\n",
     "\n",
     "fetch_archive_from_http(url=s3_url, output_dir=doc_dir)\n",
     "\n",
-    "# We supply a function to clean the docs and write them afterwards to our DB\n",
+    "# Now, let's write the docs to our DB. \n",
+    "# You can supply a cleaning/transformation function that is applied to each doc (e.g. to remove headers)\n",
+    "# It must take a str as input, and return a str.\n",
+    "# Our 517 documents also get split into 2068 smaller paragraphs here. \n",
     "write_documents_to_db(document_dir=doc_dir, clean_func=clean_wiki_text)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "## Initialize Reader, Retriever & Finder"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {
     "pycharm": {
      "is_executing": false,
      "name": "#%%\n"
     }
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "I1111 18:43:26.589386 140652943304512 tfidf.py:68] Found 517 candidates in DB\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# A retriever identifies the k most promising chunks of text that might contain the answer for our question\n",
     "# Retrievers use some simple but fast algorithm, here: TF-IDF\n",
     "\n",
     "retriever = TfidfRetriever()"
    ]
   },
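The new comments above pin down the clean_func contract: it is applied to each document, takes a str, and returns a str. As a concrete illustration, here is a hypothetical cleaning function; the regex patterns are made up for the sketch and are not what clean_wiki_text actually does:

```python
import re

def clean_custom_text(text: str) -> str:
    # Drop wiki-style section markers such as "== Plot ==" (illustrative pattern).
    text = re.sub(r"^=+.*?=+$", "", text, flags=re.MULTILINE)
    # Collapse the runs of blank lines left behind by the removal.
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()

# Plugged in exactly like clean_wiki_text in the tutorial cell:
# write_documents_to_db(document_dir=doc_dir, clean_func=clean_custom_text)
```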
@@ -117,10 +118,16 @@
    "outputs": [],
    "source": [
     "# The Finder sticks together reader and retriever in a pipeline to answer our actual questions \n",
     "\n",
     "finder = Finder(reader, retriever)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Voilà! Ask a question!"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
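Conceptually, the Finder created above chains the two stages: the retriever first narrows the whole corpus down to top_k_retriever candidate passages with cheap TF-IDF scoring, and only those are handed to the slower neural reader. A rough sketch of that control flow, with the retrieve and predict method names assumed for illustration rather than taken from the library:

```python
class Finder:
    """Minimal sketch of a two-stage QA pipeline; not deepset's implementation."""

    def __init__(self, reader, retriever):
        self.reader = reader
        self.retriever = retriever

    def get_answers(self, question, top_k_reader=10, top_k_retriever=5):
        # Stage 1: fast, shallow scoring selects a few candidate passages.
        candidates = self.retriever.retrieve(question, top_k=top_k_retriever)
        # Stage 2: the expensive neural reader extracts answer spans
        # only from those candidates (predict() signature is assumed).
        return self.reader.predict(question, candidates, top_k=top_k_reader)
```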
@@ -131,11 +138,17 @@
    },
    "outputs": [],
    "source": [
-    "# Voilà! Ask a question!\n",
-    "\n",
-    "#prediction = finder.get_answers(question=\"Who created the Dothraki vocabulary?\", top_k_reader=5)\n",
-    "#prediction = finder.get_answers(question=\"Who is the sister of Sansa?\", top_k_reader=5)\n",
-    "prediction = finder.get_answers(question=\"Who is the father of Arya Stark?\", top_k_reader=3, top_k_retriever=5)"
+    "prediction = finder.get_answers(question=\"Who is the father of Arya Stark?\", top_k_reader=10, top_k_retriever=5)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#prediction = finder.get_answers(question=\"Who created the Dothraki vocabulary?\", top_k_reader=5)\n",
+    "#prediction = finder.get_answers(question=\"Who is the sister of Sansa?\", top_k_reader=5)"
+   ]
+  },
   {
@@ -151,19 +164,6 @@
    "source": [
     "print_answers(prediction, details=\"minimal\")"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "pycharm": {
-     "name": "#%%\n"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "\n"
-   ]
-  }
  ],
  "metadata": {
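print_answers(prediction, details="minimal") pretty-prints the result. If you prefer to post-process it yourself, prediction is a plain dict; the key names below are assumptions based on typical Haystack output and may differ in this version:

```python
# Hypothetical inspection of the prediction dict; key names are assumed.
for answer in prediction["answers"]:
    print(f"answer:  {answer.get('answer')!r}")
    print(f"score:   {answer.get('score')}")
    print(f"context: {answer.get('context')}")
```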
@@ -182,7 +182,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.4"
+   "version": "3.7.3"
   },
   "pycharm": {
    "stem_cell": {