diff --git a/.travis.yml b/.travis.yml
index 90578ad3d..e9d1ed25b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,5 +1,5 @@
 language: python
-sudo: false
+sudo: true
 cache: pip
 python:
 - "3.7"
diff --git a/README.rst b/README.rst
index 3d1167b25..fc7f7db07 100644
--- a/README.rst
+++ b/README.rst
@@ -228,6 +228,11 @@ You will find the Swagger API documentation at http://127.0.0.1:80/docs
 
 .. image:: https://raw.githubusercontent.com/deepset-ai/haystack/master/docs/img/annotation_tool.png
 
-7. Development
+7. Indexing PDF files
+______________________
+
+Haystack has a customizable PDF text extraction pipeline with cleaning functions for headers, footers, and tables. It supports complex document layouts, including multi-column text.
+
+8. Development
 -------------------
 
 * Unit tests can be executed by running :code:`tox`.
diff --git a/haystack/indexing/file_converters/__init__.py b/haystack/indexing/file_converters/__init__.py
new file mode 100644
index 000000000..e69de29bb
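For context, a minimal sketch of how the converter introduced in this patch is meant to be used (the sample file path is hypothetical, and `pdftotext` must be installed for the converter to initialize):

```python
from pathlib import Path

from haystack.indexing.file_converters.pdftotext import PDFToTextConverter

# Enable the cleaning heuristics described in the README section above.
converter = PDFToTextConverter(
    remove_numeric_tables=True,   # drop mostly-numeric table rows
    remove_header_footer=True,    # strip strings repeated across pages
    valid_languages=["en"],       # warn about likely encoding errors
)
pages = converter.extract_pages(file_path=Path("my_document.pdf"))  # one string per page
```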
diff --git a/haystack/indexing/file_converters/base.py b/haystack/indexing/file_converters/base.py
new file mode 100644
index 000000000..c4121da47
--- /dev/null
+++ b/haystack/indexing/file_converters/base.py
@@ -0,0 +1,45 @@
+from abc import abstractmethod
+from pathlib import Path
+from typing import List, Optional
+
+
+class BaseConverter:
+    """
+    Base class for implementing file converters that transform input documents into text for indexing in a database.
+    """
+
+    def __init__(
+        self,
+        remove_numeric_tables: Optional[bool] = None,
+        remove_header_footer: Optional[bool] = None,
+        remove_whitespace: Optional[bool] = None,
+        remove_empty_lines: Optional[bool] = None,
+        valid_languages: Optional[List[str]] = None,
+    ):
+        """
+        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from tables.
+                                      Tabular structures in documents can be noise for a reader model that has no
+                                      table-parsing capability for finding answers. However, tables may also contain
+                                      long strings that are possible candidates for answers, so rows containing
+                                      strings are retained when this option is enabled.
+        :param remove_header_footer: use a heuristic to remove footers and headers across different pages by searching
+                                     for the longest common string. This heuristic uses exact matches and therefore
+                                     works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
+                                     or similar.
+        :param remove_whitespace: strip whitespace before and after each line in the text.
+        :param remove_empty_lines: collapse runs of more than two consecutive empty lines in the text.
+        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
+                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
+                                This option can be used to add a test for encoding errors. If the extracted text is
+                                not one of the valid languages, it is likely the result of an encoding error that
+                                produced garbled text.
+        """
+        self.remove_numeric_tables = remove_numeric_tables
+        self.remove_header_footer = remove_header_footer
+        self.remove_whitespace = remove_whitespace
+        self.remove_empty_lines = remove_empty_lines
+        self.valid_languages = valid_languages
+
+    @abstractmethod
+    def extract_pages(self, file_path: Path) -> List[str]:
+        pass
diff --git a/haystack/indexing/file_converters/pdftotext.py b/haystack/indexing/file_converters/pdftotext.py
new file mode 100644
index 000000000..2dbc09b7b
--- /dev/null
+++ b/haystack/indexing/file_converters/pdftotext.py
@@ -0,0 +1,230 @@
+import logging
+import re
+import subprocess
+from functools import partial, reduce
+from itertools import chain
+from pathlib import Path
+from typing import List, Optional
+
+import fitz
+import langdetect
+
+from haystack.indexing.file_converters.base import BaseConverter
+
+logger = logging.getLogger(__name__)
+
+
+class PDFToTextConverter(BaseConverter):
+    def __init__(
+        self,
+        remove_numeric_tables: bool = False,
+        remove_whitespace: Optional[bool] = None,
+        remove_empty_lines: Optional[bool] = None,
+        remove_header_footer: Optional[bool] = None,
+        valid_languages: Optional[List[str]] = None,
+    ):
+        """
+        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from tables.
+                                      Tabular structures in documents can be noise for a reader model that has no
+                                      table-parsing capability for finding answers. However, tables may also contain
+                                      long strings that are possible candidates for answers, so rows containing
+                                      strings are retained when this option is enabled.
+        :param remove_whitespace: strip whitespace before and after each line in the text.
+        :param remove_empty_lines: collapse runs of more than two consecutive empty lines in the text.
+        :param remove_header_footer: use a heuristic to remove footers and headers across different pages by searching
+                                     for the longest common string. This heuristic uses exact matches and therefore
+                                     works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
+                                     or similar.
+        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
+                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
+                                This option can be used to add a test for encoding errors. If the extracted text is
+                                not one of the valid languages, it is likely the result of an encoding error that
+                                produced garbled text.
+        """
+        verify_installation = subprocess.run(["pdftotext -v"], shell=True)
+        if verify_installation.returncode == 127:
+            raise Exception(
+                """pdftotext is not installed. It is part of the xpdf or poppler-utils software suite.
+
+                   Installation on Linux:
+                   wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.02.tar.gz &&
+                   tar -xvf xpdf-tools-linux-4.02.tar.gz && sudo cp xpdf-tools-linux-4.02/bin64/pdftotext /usr/local/bin
+
+                   Installation on MacOS:
+                   brew install xpdf
+
+                   You can find more details here: https://www.xpdfreader.com
+                """
+            )
+
+        super().__init__(
+            remove_numeric_tables=remove_numeric_tables,
+            remove_whitespace=remove_whitespace,
+            remove_empty_lines=remove_empty_lines,
+            remove_header_footer=remove_header_footer,
+            valid_languages=valid_languages,
+        )
+
+    def extract_pages(self, file_path: Path) -> List[str]:
+
+        page_count = fitz.open(file_path).pageCount
+
+        pages = []
+        for page_number in range(1, page_count + 1):
+            # The pdftotext tool provides an option to retain the original physical layout of a PDF page. This
+            # behaviour can be toggled with the layout param.
+            #   layout=True
+            #     + table structures are retained better
+            #     - multi-column pages (e.g. research papers) get extracted with text from multiple columns on the same line
+            #   layout=False
+            #     + keeps strings in content stream order, so multi-column layouts work well
+            #     - cells of tables get split across lines
+            #
+            # Here, as a "safe" default, layout is turned off.
+            page = self._extract_page(file_path, page_number, layout=False)
+            lines = page.splitlines()
+            cleaned_lines = []
+            for line in lines:
+                words = line.split()
+                digits = [word for word in words if any(i.isdigit() for i in word)]
+
+                # remove lines having > 40% of words as digits AND not ending with a period (.)
+                if self.remove_numeric_tables:
+                    if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
+                        logger.debug(f"Removing line '{line}' from {file_path}")
+                        continue
+
+                if self.remove_whitespace:
+                    line = line.strip()
+
+                cleaned_lines.append(line)
+
+            page = "\n".join(cleaned_lines)
+
+            if self.remove_empty_lines:
+                page = re.sub(r"\n\n+", "\n\n", page)
+
+            pages.append(page)
+
+        if self.valid_languages:
+            document_text = "".join(pages)
+            if not self._validate_language(document_text):
+                logger.warning(
+                    f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
+                    f"been decoded in the correct text format."
+                )
+
+        if self.remove_header_footer:
+            pages, header, footer = self.find_and_remove_header_footer(
+                pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
+            )
+            logger.info(f"Removed header '{header}' and footer '{footer}' in {file_path}")
+
+        return pages
+
+    def _extract_page(self, file_path: Path, page_number: int, layout: bool):
+        """
+        Extract a single page from the PDF file at file_path.
+
+        :param file_path: path of the PDF file
+        :param page_number: page number to extract (starting from 1)
+        :param layout: whether to retain the original physical layout for a page. If disabled, PDF pages are read in
+                       content stream order.
+        """
+        if layout:
+            command = ["pdftotext", "-layout", "-f", str(page_number), "-l", str(page_number), file_path, "-"]
+        else:
+            command = ["pdftotext", "-f", str(page_number), "-l", str(page_number), file_path, "-"]
+        output_page = subprocess.run(command, capture_output=True, shell=False)
+        page = output_page.stdout.decode(errors="ignore")
+        return page
+
+    def _validate_language(self, text: str):
+        """
+        Validate whether the language of the text is one of the valid languages.
+        """
+        try:
+            lang = langdetect.detect(text)
+        except langdetect.lang_detect_exception.LangDetectException:
+            lang = None
+
+        if lang in self.valid_languages:
+            return True
+        else:
+            return False
+
+    def _ngram(self, seq: str, n: int):
+        """
+        Return ngrams (of tokens - currently split by whitespace).
+
+        :param seq: str, string from which the ngrams shall be created
+        :param n: int, n of ngram
+        :return: generator of ngram strings
+        """
+
+        # In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
+        # we add a space here and remove it after creation of the ngrams again (see below)
+        seq = seq.replace("\n", " \n")
+        seq = seq.replace("\t", " \t")
+
+        seq = seq.split(" ")
+        ngrams = (
+            " ".join(seq[i : i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(seq) - n + 1)
+        )
+
+        return ngrams
+
+    def _allngram(self, seq: str, min_ngram: int, max_ngram: int):
+        lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
+        ngrams = map(partial(self._ngram, seq), lengths)
+        res = set(chain.from_iterable(ngrams))
+        return res
+
+    def find_longest_common_ngram(self, sequences: List[str], max_ngram: int = 30, min_ngram: int = 3):
+        """
+        Find the longest common ngram across different text sequences (e.g. start of pages).
+        Considers all ngrams in the specified range. Helpful for finding footers, headers etc.
+
+        :param sequences: list[str], list of strings that shall be searched for common n_grams
+        :param max_ngram: int, maximum length of ngram to consider
+        :param min_ngram: minimum length of ngram to consider
+        :return: str, common string of all sections
+        """
+
+        seqs_ngrams = map(partial(self._allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
+        intersection = reduce(set.intersection, seqs_ngrams)
+
+        try:
+            longest = max(intersection, key=len)
+        except ValueError:
+            # no common sequence found
+            longest = ""
+        return longest if longest.strip() else None
+
+    def find_and_remove_header_footer(
+        self, pages: List[str], n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
+    ):
+        """
+        Heuristic to find footers and headers across different pages by searching for the longest common string.
+        For headers, we only search within the first n_chars characters (for footers: the last n_chars).
+        Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
+        but won't detect "Page 3 of 4" or similar.
+
+        :param pages: list of strings, one string per page
+        :param n_chars: number of first/last characters in which the header/footer shall be searched
+        :param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
+        :param n_last_pages_to_ignore: number of last pages to ignore
+        :return: (cleaned pages, found_header_str, found_footer_str)
+        """
+
+        # header
+        start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
+        found_header = self.find_longest_common_ngram(start_of_pages)
+        if found_header:
+            pages = [page.replace(found_header, "") for page in pages]
+
+        # footer
+        end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
+        found_footer = self.find_longest_common_ngram(end_of_pages)
+        if found_footer:
+            pages = [page.replace(found_footer, "") for page in pages]
+        return pages, found_header, found_footer
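To make the header/footer heuristic concrete, a small worked example with fabricated page strings (it assumes `pdftotext` is installed so the converter can be constructed, and mirrors the ignore arguments used by `extract_pages`):

```python
from haystack.indexing.file_converters.pdftotext import PDFToTextConverter

converter = PDFToTextConverter()

# Four made-up "pages" sharing an exact header and footer string.
pages = [
    "ACME Annual Report\nTable of contents ...\nCopyright 2019 by ACME",
    "ACME Annual Report\nRevenue grew strongly.\nCopyright 2019 by ACME",
    "ACME Annual Report\nCosts stayed flat.\nCopyright 2019 by ACME",
    "ACME Annual Report\nOutlook remains good.\nCopyright 2019 by ACME",
]

cleaned, header, footer = converter.find_and_remove_header_footer(
    pages, n_chars=30, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
)
# header == "ACME Annual Report"; the footer match includes its leading newline,
# so "\nCopyright 2019 by ACME" is stripped from every page. A variable footer
# like "Page 3 of 4" would not be detected, since only exact ngram matches count.
```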
" - "(Disable `only_empty_db`, if you want to add docs anyway.)") - return None + file_paths = [p for p in Path(dir_path).glob("**/*")] + if ".pdf" in [p.suffix.lower() for p in file_paths]: + pdf_converter = PDFToTextConverter() + else: + pdf_converter = None - # read and add docs - docs_to_index = [] + documents = [] for path in file_paths: - with open(path) as doc: - text = doc.read() - if clean_func: - text = clean_func(text) + if path.suffix.lower() == ".txt": + with open(path) as doc: + text = doc.read() + elif path.suffix.lower() == ".pdf": + pages = pdf_converter.extract_pages(path) + text = "\n".join(pages) + else: + raise Exception(f"Indexing of {path.suffix} files is not currently supported.") - if split_paragraphs: - for para in text.split("\n\n"): - if not para.strip(): # skip empty paragraphs - continue - docs_to_index.append( - { - "name": path.name, - "text": para - } - ) - else: - docs_to_index.append( - { - "name": path.name, - "text": text - } - ) - document_store.write_documents(docs_to_index) - logger.info(f"Wrote {len(docs_to_index)} docs to DB") + if clean_func: + text = clean_func(text) + + if split_paragraphs: + for para in text.split("\n\n"): + if not para.strip(): # skip empty paragraphs + continue + documents.append({"name": path.name, "text": para}) + else: + documents.append({"name": path.name, "text": text}) + + return documents def fetch_archive_from_http(url, output_dir, proxies=None): @@ -97,3 +93,4 @@ def fetch_archive_from_http(url, output_dir, proxies=None): archive.extractall(output_dir) # temp_file gets deleted here return True + diff --git a/requirements.txt b/requirements.txt index c33be74f0..aee6ff538 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,6 @@ elasticsearch elastic-apm tox coverage +langdetect # for PDF conversions +PyMuPDF # for PDF conversions # optional: sentence-transformers diff --git a/test/conftest.py b/test/conftest.py index 14d28c713..ea8dd7e1c 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,7 +1,8 @@ import tarfile import time import urllib.request -from subprocess import Popen, PIPE, STDOUT + +from subprocess import Popen, PIPE, STDOUT, run import pytest @@ -19,3 +20,19 @@ def elasticsearch_fixture(elasticsearch_dir): thetarfile.extractall(path=elasticsearch_dir) es_server = Popen([elasticsearch_dir / "elasticsearch-7.6.1/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT) time.sleep(30) + + +@pytest.fixture(scope="session") +def xpdf_fixture(): + verify_installation = run(["pdftotext"], shell=True) + if verify_installation.returncode == 127: + commands = """ wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.02.tar.gz && + tar -xvf xpdf-tools-linux-4.02.tar.gz && sudo cp xpdf-tools-linux-4.02/bin64/pdftotext /usr/local/bin""" + run([commands], shell=True) + + verify_installation = run(["pdftotext -v"], shell=True) + if verify_installation.returncode == 127: + raise Exception( + """pdftotext is not installed. It is part of xpdf or poppler-utils software suite. 
                   You can download it for your OS from https://www.xpdfreader.com/download.html."""
+            )
diff --git a/test/samples/pdf/sample_pdf_1.pdf b/test/samples/pdf/sample_pdf_1.pdf
new file mode 100644
index 000000000..87259b897
Binary files /dev/null and b/test/samples/pdf/sample_pdf_1.pdf differ
diff --git a/test/samples/pdf/sample_pdf_2.pdf b/test/samples/pdf/sample_pdf_2.pdf
new file mode 100644
index 000000000..6384246e8
Binary files /dev/null and b/test/samples/pdf/sample_pdf_2.pdf differ
diff --git a/test/test_db.py b/test/test_db.py
index 89eba0f37..2a0f3ca42 100644
--- a/test/test_db.py
+++ b/test/test_db.py
@@ -2,12 +2,13 @@ from time import sleep
 
 from haystack.database.elasticsearch import ElasticsearchDocumentStore
 from haystack.database.sql import SQLDocumentStore
-from haystack.indexing.io import write_documents_to_db
+from haystack.indexing.utils import convert_files_to_dicts
 
 
 def test_sql_write_read():
     sql_document_store = SQLDocumentStore()
-    write_documents_to_db(document_store=sql_document_store, document_dir="samples/docs")
+    documents = convert_files_to_dicts(dir_path="samples/docs")
+    sql_document_store.write_documents(documents)
     documents = sql_document_store.get_all_documents()
     assert len(documents) == 2
     doc = sql_document_store.get_document_by_id("1")
@@ -17,7 +18,8 @@ def test_elasticsearch_write_read(elasticsearch_fixture):
     document_store = ElasticsearchDocumentStore()
-    write_documents_to_db(document_store=document_store, document_dir="samples/docs")
+    documents = convert_files_to_dicts(dir_path="samples/docs")
+    document_store.write_documents(documents)
     sleep(2)  # wait for documents to be available for query
     documents = document_store.get_all_documents()
     assert len(documents) == 2
diff --git a/test/test_imports.py b/test/test_imports.py
index e4920132d..67460178c 100644
--- a/test/test_imports.py
+++ b/test/test_imports.py
@@ -2,7 +2,7 @@ def test_module_imports():
     from haystack import Finder
     from haystack.database.sql import SQLDocumentStore
     from haystack.indexing.cleaning import clean_wiki_text
-    from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
+    from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
     from haystack.reader.farm import FARMReader
     from haystack.reader.transformers import TransformersReader
     from haystack.retriever.tfidf import TfidfRetriever
@@ -11,7 +11,7 @@
     assert Finder is not None
     assert SQLDocumentStore is not None
     assert clean_wiki_text is not None
-    assert write_documents_to_db is not None
+    assert convert_files_to_dicts is not None
     assert fetch_archive_from_http is not None
     assert FARMReader is not None
     assert TransformersReader is not None
diff --git a/test/test_pdf_conversion.py b/test/test_pdf_conversion.py
new file mode 100644
index 000000000..130caa5f8
--- /dev/null
+++ b/test/test_pdf_conversion.py
@@ -0,0 +1,52 @@
+import logging
+from pathlib import Path
+
+from haystack.indexing.file_converters.pdftotext import PDFToTextConverter
+
+logger = logging.getLogger(__name__)
+
+
+def test_extract_pages(xpdf_fixture):
+    converter = PDFToTextConverter()
+    pages = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    assert len(pages) == 4  # the sample PDF file has four pages.
+    assert pages[0] != ""  # page 1 of the PDF contains text.
+    assert pages[2] == ""  # page 3 of the PDF file is empty.
+
+
+def test_table_removal(xpdf_fixture):
+    converter = PDFToTextConverter(remove_numeric_tables=True)
+    pages = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+
+    # assert numeric rows are removed from the table.
+    assert "324" not in pages[0]
+    assert "54x growth" not in pages[0]
+    assert "$54.35" not in pages[0]
+
+    # assert text is retained from the document.
+    assert "Adobe Systems made the PDF specification available free of charge in 1993." in pages[0]
+
+
+def test_language_validation(xpdf_fixture, caplog):
+    converter = PDFToTextConverter(valid_languages=["en"])
+    converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['en']." not in caplog.text
+
+    converter = PDFToTextConverter(valid_languages=["de"])
+    converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
+    assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['de']." in caplog.text
+
+
+def test_header_footer_removal(xpdf_fixture):
+    converter = PDFToTextConverter(remove_header_footer=True)
+    converter_no_removal = PDFToTextConverter(remove_header_footer=False)
+
+    pages1 = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
+    pages2 = converter_no_removal.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
+    for p1, p2 in zip(pages1, pages2):
+        assert p1 == p2
+
+    pages = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_2.pdf"))  # file contains header and footer
+    for page in pages:
+        assert "header" not in page
+        assert "footer" not in page
\ No newline at end of file
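For reference, the per-page extraction in `_extract_page` reduces to a single `pdftotext` invocation; a standalone sketch of the equivalent call (using the same sample file as the tests above):

```python
import subprocess
from pathlib import Path

def extract_single_page(file_path: Path, page_number: int, layout: bool = False) -> str:
    # "-f"/"-l" restrict pdftotext to a single page; "-" writes the text to stdout.
    command = ["pdftotext", "-f", str(page_number), "-l", str(page_number), str(file_path), "-"]
    if layout:
        command.insert(1, "-layout")  # retain the physical layout (better for tables)
    output = subprocess.run(command, capture_output=True, shell=False)
    return output.stdout.decode(errors="ignore")

page_text = extract_single_page(Path("samples/pdf/sample_pdf_1.pdf"), page_number=1)
```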
diff --git a/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb b/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb
index 98ae1876c..ef8ca8963 100644
--- a/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb
+++ b/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb
@@ -42,7 +42,7 @@
    "source": [
     "from haystack import Finder\n",
     "from haystack.indexing.cleaning import clean_wiki_text\n",
-    "from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http\n",
+    "from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http\n",
     "from haystack.reader.farm import FARMReader\n",
     "from haystack.reader.transformers import TransformersReader\n",
     "from haystack.utils import print_answers"
@@ -164,11 +164,13 @@
     "s3_url = \"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip\"\n",
     "fetch_archive_from_http(url=s3_url, output_dir=doc_dir)\n",
     "\n",
-    "\n",
-    "# Now, let's write the docs to our DB.\n",
+    "# Convert files to dicts\n",
     "# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)\n",
     "# It must take a str as input, and return a str.\n",
-    "write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True, split_paragraphs=True)"
+    "dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)\n",
+    "\n",
+    "# Now, let's write the dicts containing documents to our DB.\n",
+    "document_store.write_documents(dicts)"
   ]
  },
 {
diff --git a/tutorials/Tutorial1_Basic_QA_Pipeline.py b/tutorials/Tutorial1_Basic_QA_Pipeline.py
index 9a5e4981b..acfe5d1bd 100755
--- a/tutorials/Tutorial1_Basic_QA_Pipeline.py
+++ b/tutorials/Tutorial1_Basic_QA_Pipeline.py
@@ -16,7 +16,7 @@ import time
 from haystack import Finder
 from haystack.database.elasticsearch import ElasticsearchDocumentStore
 from haystack.indexing.cleaning import clean_wiki_text
-from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
+from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
 from haystack.reader.farm import FARMReader
 from haystack.reader.transformers import TransformersReader
 from haystack.utils import print_answers
@@ -69,10 +69,14 @@ doc_dir = "data/article_txt_got"
 s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
 fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
 
-# Now, let's write the docs to our DB.
+# Convert files to dicts containing documents that can be indexed into our datastore.
+dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
 # You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
 # It must take a str as input, and return a str.
-write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True, split_paragraphs=True)
+
+# Now, let's write the docs to our DB.
+document_store.write_documents(dicts)
+
 
 # ## Initalize Retriever, Reader,  & Finder
 #
diff --git a/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.ipynb b/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.ipynb
index 3b5823054..12de56d76 100644
--- a/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.ipynb
+++ b/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.ipynb
@@ -35,7 +35,7 @@
    "source": [
     "from haystack import Finder\n",
     "from haystack.indexing.cleaning import clean_wiki_text\n",
-    "from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http\n",
+    "from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http\n",
     "from haystack.reader.farm import FARMReader\n",
     "from haystack.reader.transformers import TransformersReader\n",
     "from haystack.utils import print_answers"
@@ -110,11 +110,13 @@
     "s3_url = \"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip\"\n",
     "fetch_archive_from_http(url=s3_url, output_dir=doc_dir)\n",
     "\n",
-    "\n",
-    "# Now, let's write the docs to our DB.\n",
+    "# Convert files to dicts containing documents that can be indexed into our datastore\n",
     "# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)\n",
     "# It must take a str as input, and return a str.\n",
-    "write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True, split_paragraphs=True)"
+    "dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)\n",
+    "\n",
+    "# Now, let's write the docs to our DB.\n",
+    "document_store.write_documents(dicts)"
   ]
  },
 {
diff --git a/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.py b/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.py
index c7f642006..eca060a50 100644
--- a/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.py
+++ b/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.py
@@ -11,7 +11,7 @@ from haystack import Finder
 from haystack.database.memory import InMemoryDocumentStore
 from haystack.database.sql import SQLDocumentStore
 from haystack.indexing.cleaning import clean_wiki_text
-from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
+from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
 from haystack.reader.farm import FARMReader
 from haystack.reader.transformers import TransformersReader
 from haystack.retriever.tfidf import TfidfRetriever
@@ -37,11 +37,13 @@ doc_dir = "data/article_txt_got"
 s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
 fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
 
-# Now, let's write the docs to our DB.
+# Convert files to dicts containing documents that can be indexed into our datastore.
+dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
 # You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
 # It must take a str as input, and return a str.
-write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True, split_paragraphs=True)
+# Now, let's write the docs to our DB.
+document_store.write_documents(dicts)
 
 
 # ## Initalize Retriever, Reader,  & Finder
diff --git a/tutorials/Tutorial5_Evaluation.ipynb b/tutorials/Tutorial5_Evaluation.ipynb
index e7c613990..dc887533e 100644
--- a/tutorials/Tutorial5_Evaluation.ipynb
+++ b/tutorials/Tutorial5_Evaluation.ipynb
@@ -1635,7 +1635,7 @@
    "source": [
     "\n",
-    "from haystack.indexing.io import fetch_archive_from_http\n",
+    "from haystack.indexing.utils import fetch_archive_from_http\n",
     "\n",
     "# Download evaluation data, which is a subset of Natural Questions development set containing 50 documents\n",
     "doc_dir = \"../data/nq\"\n",
diff --git a/tutorials/Tutorial5_Evaluation.py b/tutorials/Tutorial5_Evaluation.py
index 9c783cded..6ba9f7814 100644
--- a/tutorials/Tutorial5_Evaluation.py
+++ b/tutorials/Tutorial5_Evaluation.py
@@ -1,5 +1,5 @@
 from haystack.database.elasticsearch import ElasticsearchDocumentStore
-from haystack.indexing.io import fetch_archive_from_http
+from haystack.indexing.utils import fetch_archive_from_http
 from haystack.retriever.elasticsearch import ElasticsearchRetriever
 from haystack.reader.farm import FARMReader
 from haystack.finder import Finder
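Taken together, the tutorials now follow an explicit convert-then-write pattern instead of the old single `write_documents_to_db` call. A minimal end-to-end sketch (the directory path is illustrative; any document store works the same way):

```python
from haystack.database.sql import SQLDocumentStore
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.utils import convert_files_to_dicts

document_store = SQLDocumentStore()

# Step 1: convert .txt and .pdf files into dicts of the form {"name": ..., "text": ...}.
dicts = convert_files_to_dicts(dir_path="data/my_docs", clean_func=clean_wiki_text, split_paragraphs=True)

# Step 2: write the dicts to the document store of your choice.
document_store.write_documents(dicts)
```

Separating conversion from writing means the same dicts can be sent to SQL, Elasticsearch, or in-memory stores without re-reading the source files.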