Add PDF text extraction (#109)

This commit is contained in:
Tanay Soni 2020-06-08 11:07:19 +02:00 committed by GitHub
parent 479fcb1ace
commit ef9e4f4467
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 421 additions and 62 deletions

View File

@ -1,5 +1,5 @@
language: python
sudo: false
sudo: true
cache: pip
python:
- "3.7"

View File

@ -228,6 +228,11 @@ You will find the Swagger API documentation at http://127.0.0.1:80/docs
.. image:: https://raw.githubusercontent.com/deepset-ai/haystack/master/docs/img/annotation_tool.png
7. Development
7. Indexing PDF files
______________________
Haystack provides a customizable PDF text extraction pipeline with cleaning functions for headers, footers, and tables. It supports complex document layouts, including multi-column text.
8. Development
-------------------
* Unit tests can be executed by running :code:`tox`.
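For illustration, a minimal usage sketch of the converter added in this commit; the class, module path, and constructor options are taken from the diff below, and the sample file path from the test fixtures:

from pathlib import Path

from haystack.indexing.file_converters.pdftotext import PDFToTextConverter

# Configure the converter with the cleaning options described above
# (numeric-table removal, header/footer removal, language validation).
converter = PDFToTextConverter(
    remove_numeric_tables=True,
    remove_header_footer=True,
    valid_languages=["en"],
)

# extract_pages() returns one string per PDF page.
pages = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
print(f"Extracted {len(pages)} pages")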

View File

@ -0,0 +1,44 @@
from abc import abstractmethod
from pathlib import Path
from typing import List, Optional
class BaseConverter:
"""
Base class for implementing file converters that transform input documents to text format for indexing in a database.
"""
def __init__(
self,
remove_numeric_tables: Optional[bool] = None,
remove_header_footer: Optional[bool] = None,
remove_whitespace: Optional[bool] = None,
remove_empty_lines: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
):
"""
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for the reader model if it
lacks table-parsing capability for finding answers. However, tables
may also contain long strings that could be candidate answers,
so rows containing such strings are retained when this option is enabled.
:param remove_header_footer: use a heuristic to remove footers and headers across different pages by searching
for the longest common string. This heuristic uses exact matches and therefore
works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
or similar.
:param remove_whitespace: strip leading and trailing whitespace from each line in the text.
:param remove_empty_lines: collapse runs of consecutive empty lines in the text into a single empty line.
:param valid_languages: validate languages from a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used to check for encoding errors. If the extracted text is
not in one of the valid languages, it is likely the result of an encoding error
producing garbled text.
"""
self.remove_numeric_tables = remove_numeric_tables
self.remove_header_footer = remove_header_footer
self.remove_whitespace = remove_whitespace
self.remove_empty_lines = remove_empty_lines
self.valid_languages = valid_languages
@abstractmethod
def extract_pages(self, file_path: Path) -> List[str]:
pass
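As a sketch of how a concrete converter could subclass this base class, assuming a hypothetical TextFileConverter that is not part of this commit:

from pathlib import Path
from typing import List

from haystack.indexing.file_converters.base import BaseConverter


class TextFileConverter(BaseConverter):  # hypothetical example, not part of this commit
    def extract_pages(self, file_path: Path) -> List[str]:
        # Treat the whole .txt file as a single "page" and reuse the optional
        # whitespace cleaning flag configured on the base class.
        text = Path(file_path).read_text()
        if self.remove_whitespace:
            text = "\n".join(line.strip() for line in text.splitlines())
        return [text]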

View File

@ -0,0 +1,230 @@
import logging
import re
import subprocess
from functools import partial, reduce
from itertools import chain
from pathlib import Path
from typing import List, Optional
import fitz
import langdetect
from haystack.indexing.file_converters.base import BaseConverter
logger = logging.getLogger(__name__)
class PDFToTextConverter(BaseConverter):
def __init__(
self,
remove_numeric_tables: bool = False,
remove_whitespace: Optional[bool] = None,
remove_empty_lines: Optional[bool] = None,
remove_header_footer: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
):
"""
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for the reader model if it
lacks table-parsing capability for finding answers. However, tables
may also contain long strings that could be candidate answers,
so rows containing such strings are retained when this option is enabled.
:param remove_whitespace: strip leading and trailing whitespace from each line in the text.
:param remove_empty_lines: collapse runs of consecutive empty lines in the text into a single empty line.
:param remove_header_footer: use a heuristic to remove footers and headers across different pages by searching
for the longest common string. This heuristic uses exact matches and therefore
works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
or similar.
:param valid_languages: validate languages from a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used to check for encoding errors. If the extracted text is
not in one of the valid languages, it is likely the result of an encoding error
producing garbled text.
"""
verify_installation = subprocess.run("pdftotext -v", shell=True)
if verify_installation.returncode == 127:
raise Exception(
"""pdftotext is not installed. It is part of xpdf or poppler-utils software suite.
Installation on Linux:
wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.02.tar.gz &&
tar -xvf xpdf-tools-linux-4.02.tar.gz && sudo cp xpdf-tools-linux-4.02/bin64/pdftotext /usr/local/bin
Installation on MacOS:
brew install xpdf
You can find more details here: https://www.xpdfreader.com
"""
)
super().__init__(
remove_numeric_tables=remove_numeric_tables,
remove_whitespace=remove_whitespace,
remove_empty_lines=remove_empty_lines,
remove_header_footer=remove_header_footer,
valid_languages=valid_languages,
)
def extract_pages(self, file_path: Path) -> List[str]:
page_count = fitz.open(file_path).pageCount
pages = []
for page_number in range(1, page_count + 1):
# The pdftotext tool provides an option to retain the original physical layout of a PDF page. This behaviour
# can be toggled via the layout param.
# layout=True
# + table structures are retained better
# - multi-column pages (e.g. research papers) get extracted with text from multiple columns on the same line
# layout=False
# + keeps strings in content stream order, hence multi-column layouts work well
# - cells of tables get split across lines
#
# Here, as a "safe" default, layout is turned off.
page = self._extract_page(file_path, page_number, layout=False)
lines = page.splitlines()
cleaned_lines = []
for line in lines:
words = line.split()
digits = [word for word in words if any(i.isdigit() for i in word)]
# remove lines having > 40% of words as digits AND not ending with a period (.)
if self.remove_numeric_tables:
if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
logger.debug(f"Removing line '{line}' from {file_path}")
continue
if self.remove_whitespace:
line = line.strip()
cleaned_lines.append(line)
page = "\n".join(cleaned_lines)
if self.remove_empty_lines:
page = re.sub(r"\n\n+", "\n\n", page)
pages.append(page)
if self.valid_languages:
document_text = "".join(pages)
if not self._validate_language(document_text):
logger.warning(
f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
f"been decoded in the correct text format."
)
if self.remove_header_footer:
pages, header, footer = self.find_and_remove_header_footer(
pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
)
logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")
return pages
def _extract_page(self, file_path: Path, page_number: int, layout: bool):
"""
Extract a page from the pdf file at file_path.
:param file_path: path of the pdf file
:param page_number: page number to extract(starting from 1)
:param layout: whether to retain the original physical layout for a page. If disabled, PDF pages are read in
the content stream order.
"""
if layout:
command = ["pdftotext", "-layout", "-f", str(page_number), "-l", str(page_number), file_path, "-"]
else:
command = ["pdftotext", "-f", str(page_number), "-l", str(page_number), file_path, "-"]
output_page = subprocess.run(command, capture_output=True, shell=False)
page = output_page.stdout.decode(errors="ignore")
return page
def _validate_language(self, text: str):
"""
Validate if the language of the text is one of valid languages.
"""
try:
lang = langdetect.detect(text)
except langdetect.lang_detect_exception.LangDetectException:
lang = None
return lang in self.valid_languages
def _ngram(self, seq: str, n: int):
"""
Return ngrams (of tokens - currently split by whitespace)
:param seq: str, string from which the ngram shall be created
:param n: int, n of ngram
:return: str, ngram as string
"""
# In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
# we add a space here and remove it after creation of the ngrams again (see below)
seq = seq.replace("\n", " \n")
seq = seq.replace("\t", " \t")
seq = seq.split(" ")
ngrams = (
" ".join(seq[i : i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(seq) - n + 1)
)
return ngrams
def _allngram(self, seq: str, min_ngram: int, max_ngram: int):
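# Collect all ngrams of the sequence with lengths from min_ngram up to (but not including) max_ngram
# (or up to the sequence length if max_ngram is not set) into one set, so the per-page sets can be intersected cheaply.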
lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
ngrams = map(partial(self._ngram, seq), lengths)
res = set(chain.from_iterable(ngrams))
return res
def find_longest_common_ngram(self, sequences: List[str], max_ngram: int = 30, min_ngram: int = 3):
"""
Find the longest common ngram across different text sequences (e.g. the start of pages).
All ngrams within the specified length range are considered. Helpful for finding footers, headers, etc.
:param sequences: list[str], list of strings that shall be searched for common n_grams
:param max_ngram: int, maximum length of ngram to consider
:param min_ngram: minimum length of ngram to consider
:return: str, common string of all sections
"""
seqs_ngrams = map(partial(self._allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
intersection = reduce(set.intersection, seqs_ngrams)
try:
longest = max(intersection, key=len)
except ValueError:
# no common sequence found
longest = ""
return longest if longest.strip() else None
def find_and_remove_header_footer(
self, pages: List[str], n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
):
"""
Heuristic to find footers and headers across different pages by searching for the longest common string.
For headers we only search in the first n_chars characters (for footer: last n_chars).
Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
but won't detect "Page 3 of 4" or similar.
:param pages: list of strings, one string per page
:param n_chars: number of first/last characters where the header/footer shall be searched in
:param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
:param n_last_pages_to_ignore: number of last pages to ignore
:return: (cleaned pages, found_header_str, found_footer_str)
"""
# header
start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
found_header = self.find_longest_common_ngram(start_of_pages)
if found_header:
pages = [page.replace(found_header, "") for page in pages]
# footer
end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
found_footer = self.find_longest_common_ngram(end_of_pages)
if found_footer:
pages = [page.replace(found_footer, "") for page in pages]
return pages, found_header, found_footer
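A small illustrative run of the header/footer heuristic above; the page strings are invented for demonstration, and the example assumes the pdftotext installation check in __init__ passes:

converter = PDFToTextConverter(remove_header_footer=True)

pages = [
    "Table of contents ...",  # first page is skipped via n_first_pages_to_ignore
    "Copyright 2019 by XXX\nIntroduction and historical background.",
    "Copyright 2019 by XXX\nExperimental setup details.",
    "Copyright 2019 by XXX\nDiscussion and closing remarks.",
    "Appendix ...",  # last page is skipped via n_last_pages_to_ignore
]

cleaned, header, footer = converter.find_and_remove_header_footer(
    pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
)
# header == "Copyright 2019 by XXX" and is stripped from every page that contains it;
# footer is None because the page endings share no common ngram of length >= 3.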

View File

@ -4,57 +4,53 @@ from farm.data_handler.utils import http_get
import tempfile
import tarfile
import zipfile
from typing import Callable, List, Optional
from haystack.indexing.file_converters.pdftotext import PDFToTextConverter
logger = logging.getLogger(__name__)
def write_documents_to_db(document_store, document_dir, clean_func=None, only_empty_db=False, split_paragraphs=False):
def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None, split_paragraphs: bool = False) -> List[dict]:
"""
Write all text files (.txt) in the sub-directories of the given path to the connected database.
Convert all files (.txt, .pdf) in the sub-directories of the given path to Python dicts that can be written to a
Document Store.
:param document_dir: path for the documents to be written to the database
:param dir_path: path of the directory containing the files to convert
:param clean_func: a custom cleaning function that gets applied to each doc (input: str, output:str)
:param only_empty_db: If true, docs will only be written if db is completely empty.
Useful to avoid indexing the same initial docs again and again.
:param split_paragraphs: split text into paragraphs.
:return: None
"""
file_paths = Path(document_dir).glob("**/*.txt")
# check if db has already docs
if only_empty_db:
n_docs = document_store.get_document_count()
if n_docs > 0:
logger.info(f"Skip writing documents since DB already contains {n_docs} docs ... "
"(Disable `only_empty_db`, if you want to add docs anyway.)")
return None
file_paths = [p for p in Path(dir_path).glob("**/*") if p.is_file()]
if ".pdf" in [p.suffix.lower() for p in file_paths]:
pdf_converter = PDFToTextConverter()
else:
pdf_converter = None
# read and add docs
docs_to_index = []
documents = []
for path in file_paths:
with open(path) as doc:
text = doc.read()
if clean_func:
text = clean_func(text)
if path.suffix.lower() == ".txt":
with open(path) as doc:
text = doc.read()
elif path.suffix.lower() == ".pdf":
pages = pdf_converter.extract_pages(path)
text = "\n".join(pages)
else:
raise Exception(f"Indexing of {path.suffix} files is not currently supported.")
if split_paragraphs:
for para in text.split("\n\n"):
if not para.strip(): # skip empty paragraphs
continue
docs_to_index.append(
{
"name": path.name,
"text": para
}
)
else:
docs_to_index.append(
{
"name": path.name,
"text": text
}
)
document_store.write_documents(docs_to_index)
logger.info(f"Wrote {len(docs_to_index)} docs to DB")
if clean_func:
text = clean_func(text)
if split_paragraphs:
for para in text.split("\n\n"):
if not para.strip(): # skip empty paragraphs
continue
documents.append({"name": path.name, "text": para})
else:
documents.append({"name": path.name, "text": text})
return documents
def fetch_archive_from_http(url, output_dir, proxies=None):
@ -97,3 +93,4 @@ def fetch_archive_from_http(url, output_dir, proxies=None):
archive.extractall(output_dir)
# temp_file gets deleted here
return True
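For context, a short hedged sketch of how the new convert_files_to_dicts helper pairs with a document store (mirroring the updated tests and tutorials later in this diff); the directory path comes from the test samples:

from haystack.database.sql import SQLDocumentStore
from haystack.indexing.utils import convert_files_to_dicts

document_store = SQLDocumentStore()

# Convert every .txt and .pdf file under the directory into dicts, one per paragraph.
dicts = convert_files_to_dicts(dir_path="samples/docs", split_paragraphs=True)

# Index the dicts; this replaces the removed write_documents_to_db helper.
document_store.write_documents(dicts)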

View File

@ -9,4 +9,6 @@ elasticsearch
elastic-apm
tox
coverage
langdetect # for PDF conversions
PyMuPDF # for PDF conversions
# optional: sentence-transformers

View File

@ -1,7 +1,8 @@
import tarfile
import time
import urllib.request
from subprocess import Popen, PIPE, STDOUT
from subprocess import Popen, PIPE, STDOUT, run
import pytest
@ -19,3 +20,19 @@ def elasticsearch_fixture(elasticsearch_dir):
thetarfile.extractall(path=elasticsearch_dir)
es_server = Popen([elasticsearch_dir / "elasticsearch-7.6.1/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT)
time.sleep(30)
@pytest.fixture(scope="session")
def xpdf_fixture():
verify_installation = run(["pdftotext"], shell=True)
if verify_installation.returncode == 127:
commands = """ wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.02.tar.gz &&
tar -xvf xpdf-tools-linux-4.02.tar.gz && sudo cp xpdf-tools-linux-4.02/bin64/pdftotext /usr/local/bin"""
run([commands], shell=True)
verify_installation = run(["pdftotext -v"], shell=True)
if verify_installation.returncode == 127:
raise Exception(
"""pdftotext is not installed. It is part of xpdf or poppler-utils software suite.
You can download for your OS from here: https://www.xpdfreader.com/download.html."""
)

Binary file not shown.

Binary file not shown.

View File

@ -2,12 +2,13 @@ from time import sleep
from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.database.sql import SQLDocumentStore
from haystack.indexing.io import write_documents_to_db
from haystack.indexing.utils import convert_files_to_dicts
def test_sql_write_read():
sql_document_store = SQLDocumentStore()
write_documents_to_db(document_store=sql_document_store, document_dir="samples/docs")
documents = convert_files_to_dicts(dir_path="samples/docs")
sql_document_store.write_documents(documents)
documents = sql_document_store.get_all_documents()
assert len(documents) == 2
doc = sql_document_store.get_document_by_id("1")
@ -17,7 +18,8 @@ def test_sql_write_read():
def test_elasticsearch_write_read(elasticsearch_fixture):
document_store = ElasticsearchDocumentStore()
write_documents_to_db(document_store=document_store, document_dir="samples/docs")
documents = convert_files_to_dicts(dir_path="samples/docs")
document_store.write_documents(documents)
sleep(2) # wait for documents to be available for query
documents = document_store.get_all_documents()
assert len(documents) == 2

View File

@ -2,7 +2,7 @@ def test_module_imports():
from haystack import Finder
from haystack.database.sql import SQLDocumentStore
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.retriever.tfidf import TfidfRetriever
@ -11,7 +11,7 @@ def test_module_imports():
assert Finder is not None
assert SQLDocumentStore is not None
assert clean_wiki_text is not None
assert write_documents_to_db is not None
assert convert_files_to_dicts is not None
assert fetch_archive_from_http is not None
assert FARMReader is not None
assert TransformersReader is not None

View File

@ -0,0 +1,52 @@
import logging
from pathlib import Path
from haystack.indexing.file_converters.pdftotext import PDFToTextConverter
logger = logging.getLogger(__name__)
def test_extract_pages(xpdf_fixture):
converter = PDFToTextConverter()
pages = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
assert len(pages) == 4 # the sample PDF file has four pages.
assert pages[0] != "" # the page 1 of PDF contains text.
assert pages[2] == "" # the page 3 of PDF file is empty.
def test_table_removal(xpdf_fixture):
converter = PDFToTextConverter(remove_numeric_tables=True)
pages = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
# assert numeric rows are removed from the table.
assert "324" not in pages[0]
assert "54x growth" not in pages[0]
assert "$54.35" not in pages[0]
# assert text is retained from the document.
assert "Adobe Systems made the PDF specification available free of charge in 1993." in pages[0]
def test_language_validation(xpdf_fixture, caplog):
converter = PDFToTextConverter(valid_languages=["en"])
pages = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['en']." not in caplog.text
converter = PDFToTextConverter(valid_languages=["de"])
pages = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['de']." in caplog.text
def test_header_footer_removal(xpdf_fixture):
converter = PDFToTextConverter(remove_header_footer=True)
converter_no_removal = PDFToTextConverter(remove_header_footer=False)
pages1 = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf")) # file contains no header/footer
pages2 = converter_no_removal.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf")) # file contains no header/footer
for p1, p2 in zip(pages1, pages2):
assert p1 == p2
pages = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_2.pdf")) # file contains header and footer
for page in pages:
assert "header" not in page
assert "footer" not in page

View File

@ -42,7 +42,7 @@
"source": [
"from haystack import Finder\n",
"from haystack.indexing.cleaning import clean_wiki_text\n",
"from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http\n",
"from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http\n",
"from haystack.reader.farm import FARMReader\n",
"from haystack.reader.transformers import TransformersReader\n",
"from haystack.utils import print_answers"
@ -164,11 +164,13 @@
"s3_url = \"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip\"\n",
"fetch_archive_from_http(url=s3_url, output_dir=doc_dir)\n",
"\n",
"\n",
"# Now, let's write the docs to our DB.\n",
"# Convert files to dicts\n",
"# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)\n",
"# It must take a str as input, and return a str.\n",
"write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True, split_paragraphs=True)"
"dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)\n",
"\n",
"# Now, let's write the dicts containing documents to our DB.\n",
"document_store.write_documents(dicts)"
]
},
{

View File

@ -16,7 +16,7 @@ import time
from haystack import Finder
from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers
@ -69,10 +69,14 @@ doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
# Now, let's write the docs to our DB.
# convert files to dicts containing documents that can be indexed to our datastore
dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
# It must take a str as input, and return a str.
write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True, split_paragraphs=True)
# Now, let's write the docs to our DB.
document_store.write_documents(dicts)
# ## Initialize Retriever, Reader, & Finder
#

View File

@ -35,7 +35,7 @@
"source": [
"from haystack import Finder\n",
"from haystack.indexing.cleaning import clean_wiki_text\n",
"from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http\n",
"from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http\n",
"from haystack.reader.farm import FARMReader\n",
"from haystack.reader.transformers import TransformersReader\n",
"from haystack.utils import print_answers"
@ -110,11 +110,13 @@
"s3_url = \"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip\"\n",
"fetch_archive_from_http(url=s3_url, output_dir=doc_dir)\n",
"\n",
"\n",
"# Now, let's write the docs to our DB.\n",
"# convert files to dicts containing documents that can be indexed to our datastore\n",
"# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)\n",
"# It must take a str as input, and return a str.\n",
"write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True, split_paragraphs=True)"
"dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)\n",
"\n",
"# Now, let's write the docs to our DB.\n",
"document_store.write_documents(dicts)"
]
},
{

View File

@ -11,7 +11,7 @@ from haystack import Finder
from haystack.database.memory import InMemoryDocumentStore
from haystack.database.sql import SQLDocumentStore
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.retriever.tfidf import TfidfRetriever
@ -37,11 +37,13 @@ doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
# Now, let's write the docs to our DB.
# convert files to dicts containing documents that can be indexed to our datastore
dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
# It must take a str as input, and return a str.
write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True, split_paragraphs=True)
# Now, let's write the docs to our DB.
document_store.write_documents(dicts)
# ## Initialize Retriever, Reader, & Finder

View File

@ -1635,7 +1635,7 @@
},
"source": [
"\n",
"from haystack.indexing.io import fetch_archive_from_http\n",
"from haystack.indexing.utils import fetch_archive_from_http\n",
"\n",
"# Download evaluation data, which is a subset of Natural Questions development set containing 50 documents\n",
"doc_dir = \"../data/nq\"\n",

View File

@ -1,5 +1,5 @@
from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.indexing.io import fetch_archive_from_http
from haystack.indexing.utils import fetch_archive_from_http
from haystack.retriever.elasticsearch import ElasticsearchRetriever
from haystack.reader.farm import FARMReader
from haystack.finder import Finder