mirror of https://github.com/deepset-ai/haystack.git
Add PDF text extraction (#109)
This commit is contained in:
parent 479fcb1ace
commit ef9e4f4467
@@ -1,5 +1,5 @@
 language: python
-sudo: false
+sudo: true
 cache: pip
 python:
 - "3.7"
@@ -228,6 +228,11 @@ You will find the Swagger API documentation at http://127.0.0.1:80/docs
 .. image:: https://raw.githubusercontent.com/deepset-ai/haystack/master/docs/img/annotation_tool.png


-7. Development
+7. Indexing PDF files
 ______________________
+
+Haystack has a customizable PDF text extraction pipeline with cleaning functions for headers, footers, and tables. It supports complex document layouts with multi-column text.
+
+8. Development
+-------------------
 * Unit tests can be executed by running :code:`tox`.
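The new README section above describes the converter only at a high level. As a quick, hedged sketch of how it is used (the file path and option values below are made up for illustration; pdftotext, PyMuPDF and langdetect must be installed):

from pathlib import Path

from haystack.indexing.file_converters.pdftotext import PDFToTextConverter

# illustrative path; any local PDF works
converter = PDFToTextConverter(
    remove_numeric_tables=True,
    remove_header_footer=True,
    valid_languages=["en"],
)
pages = converter.extract_pages(file_path=Path("data/my_doc.pdf"))
text = "\n".join(pages)  # one string per page, joined into the full document text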
0  haystack/indexing/file_converters/__init__.py  Normal file
44  haystack/indexing/file_converters/base.py  Normal file
@@ -0,0 +1,44 @@
from abc import abstractmethod
from pathlib import Path


class BaseConverter:
    """
    Base class for implementing file converters that transform input documents to text format for indexing in a database.
    """

    def __init__(
        self,
        remove_numeric_tables: bool = None,
        remove_header_footer: bool = None,
        remove_whitespace: bool = None,
        remove_empty_lines: bool = None,
        valid_languages: [str] = None,
    ):
        """
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for answers.
                                      Rows containing such strings are therefore retained by this option.
        :param remove_header_footer: use a heuristic to remove footers and headers across different pages by searching
                                     for the longest common string. This heuristic uses exact matches and therefore
                                     works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
                                     or similar.
        :param remove_whitespace: strip leading and trailing whitespace from each line in the text.
        :param remove_empty_lines: collapse multiple consecutive empty lines in the text into a single empty line.
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add a check for encoding errors. If the extracted text is
                                not in one of the valid languages, it is likely that an encoding error resulted
                                in garbled text.
        """
        self.remove_numeric_tables = remove_numeric_tables
        self.remove_header_footer = remove_header_footer
        self.remove_whitespace = remove_whitespace
        self.remove_empty_lines = remove_empty_lines
        self.valid_languages = valid_languages

    @abstractmethod
    def extract_pages(self, file_path: Path) -> [str]:
        pass
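BaseConverter above only stores the cleaning options and declares the abstract extract_pages() hook. A minimal sketch of how a subclass is meant to look (the class below is hypothetical and not part of this commit):

from pathlib import Path

from haystack.indexing.file_converters.base import BaseConverter


class PlainTextConverter(BaseConverter):  # hypothetical example for illustration only
    def extract_pages(self, file_path: Path) -> [str]:
        # Treat the whole .txt file as a single "page".
        with open(file_path) as f:
            text = f.read()
        if self.remove_whitespace:
            text = "\n".join(line.strip() for line in text.splitlines())
        return [text]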
230  haystack/indexing/file_converters/pdftotext.py  Normal file
@@ -0,0 +1,230 @@
import logging
import re
import subprocess
from functools import partial, reduce
from itertools import chain
from pathlib import Path

import fitz
import langdetect

from haystack.indexing.file_converters.base import BaseConverter

logger = logging.getLogger(__name__)


class PDFToTextConverter(BaseConverter):
    def __init__(
        self,
        remove_numeric_tables: bool = False,
        remove_whitespace: bool = None,
        remove_empty_lines: bool = None,
        remove_header_footer: bool = None,
        valid_languages: [str] = None,
    ):
        """
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for answers.
                                      Rows containing such strings are therefore retained by this option.
        :param remove_whitespace: strip leading and trailing whitespace from each line in the text.
        :param remove_empty_lines: collapse multiple consecutive empty lines in the text into a single empty line.
        :param remove_header_footer: use a heuristic to remove footers and headers across different pages by searching
                                     for the longest common string. This heuristic uses exact matches and therefore
                                     works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
                                     or similar.
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add a check for encoding errors. If the extracted text is
                                not in one of the valid languages, it is likely that an encoding error resulted
                                in garbled text.
        """
        verify_installation = subprocess.run(["pdftotext -v"], shell=True)
        if verify_installation.returncode == 127:
            raise Exception(
                """pdftotext is not installed. It is part of the xpdf or poppler-utils software suite.

                   Installation on Linux:
                   wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.02.tar.gz &&
                   tar -xvf xpdf-tools-linux-4.02.tar.gz && sudo cp xpdf-tools-linux-4.02/bin64/pdftotext /usr/local/bin

                   Installation on MacOS:
                   brew install xpdf

                   You can find more details here: https://www.xpdfreader.com
                """
            )

        super().__init__(
            remove_numeric_tables=remove_numeric_tables,
            remove_whitespace=remove_whitespace,
            remove_empty_lines=remove_empty_lines,
            remove_header_footer=remove_header_footer,
            valid_languages=valid_languages,
        )

    def extract_pages(self, file_path: Path) -> [str]:

        page_count = fitz.open(file_path).pageCount

        pages = []
        for page_number in range(1, page_count + 1):
            # The pdftotext tool provides an option to retain the original physical layout of a PDF page. This
            # behaviour can be toggled with the layout param.
            # layout=True
            #   + table structures are retained better
            #   - multi-column pages (e.g. research papers) get extracted with text from multiple columns on the same line
            # layout=False
            #   + keeps strings in content stream order, hence multi-column layouts work well
            #   - cells of tables get split across lines
            #
            # Here, as a "safe" default, layout is turned off.
            page = self._extract_page(file_path, page_number, layout=False)
            lines = page.splitlines()
            cleaned_lines = []
            for line in lines:
                words = line.split()
                digits = [word for word in words if any(i.isdigit() for i in word)]

                # remove lines having > 40% of words as digits AND not ending with a period (.)
                if self.remove_numeric_tables:
                    if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
                        logger.debug(f"Removing line '{line}' from {file_path}")
                        continue

                if self.remove_whitespace:
                    line = line.strip()

                cleaned_lines.append(line)

            page = "\n".join(cleaned_lines)

            if self.remove_empty_lines:
                page = re.sub(r"\n\n+", "\n\n", page)

            pages.append(page)
            page_number += 1

        if self.valid_languages:
            document_text = "".join(pages)
            if not self._validate_language(document_text):
                logger.warning(
                    f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
                    f"been decoded in the correct text format."
                )

        if self.remove_header_footer:
            pages, header, footer = self.find_and_remove_header_footer(
                pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
            )
            logger.info(f"Removed header '{header}' and footer '{footer}' in {file_path}")

        return pages

    def _extract_page(self, file_path: Path, page_number: int, layout: bool):
        """
        Extract a page from the PDF file at file_path.

        :param file_path: path of the PDF file
        :param page_number: page number to extract (starting from 1)
        :param layout: whether to retain the original physical layout for a page. If disabled, PDF pages are read in
                       content stream order.
        """
        if layout:
            command = ["pdftotext", "-layout", "-f", str(page_number), "-l", str(page_number), file_path, "-"]
        else:
            command = ["pdftotext", "-f", str(page_number), "-l", str(page_number), file_path, "-"]
        output_page = subprocess.run(command, capture_output=True, shell=False)
        page = output_page.stdout.decode(errors="ignore")
        return page

    def _validate_language(self, text: str):
        """
        Validate whether the language of the text is one of the valid languages.
        """
        try:
            lang = langdetect.detect(text)
        except langdetect.lang_detect_exception.LangDetectException:
            lang = None

        if lang in self.valid_languages:
            return True
        else:
            return False

    def _ngram(self, seq: str, n: int):
        """
        Return ngrams (of tokens, currently split by whitespace).
        :param seq: str, string from which the ngrams shall be created
        :param n: int, n of ngram
        :return: generator of ngram strings
        """

        # In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
        # we add a space here and remove it after creation of the ngrams again (see below)
        seq = seq.replace("\n", " \n")
        seq = seq.replace("\t", " \t")

        seq = seq.split(" ")
        ngrams = (
            " ".join(seq[i : i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(seq) - n + 1)
        )

        return ngrams

    def _allngram(self, seq: str, min_ngram: int, max_ngram: int):
        lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
        ngrams = map(partial(self._ngram, seq), lengths)
        res = set(chain.from_iterable(ngrams))
        return res

    def find_longest_common_ngram(self, sequences: [str], max_ngram: int = 30, min_ngram: int = 3):
        """
        Find the longest common ngram across different text sequences (e.g. start of pages).
        Considers all ngrams within the specified range. Helpful for finding footers, headers, etc.

        :param sequences: list[str], list of strings that shall be searched for common n-grams
        :param max_ngram: int, maximum length of ngram to consider
        :param min_ngram: minimum length of ngram to consider
        :return: str, common string of all sections
        """

        seqs_ngrams = map(partial(self._allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
        intersection = reduce(set.intersection, seqs_ngrams)

        try:
            longest = max(intersection, key=len)
        except ValueError:
            # no common sequence found
            longest = ""
        return longest if longest.strip() else None

    def find_and_remove_header_footer(
        self, pages: [str], n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
    ):
        """
        Heuristic to find footers and headers across different pages by searching for the longest common string.
        For headers, we only search in the first n_chars characters (for footers: the last n_chars).
        Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
        but won't detect "Page 3 of 4" or similar.

        :param pages: list of strings, one string per page
        :param n_chars: number of first/last characters in which the header/footer shall be searched
        :param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
        :param n_last_pages_to_ignore: number of last pages to ignore
        :return: (cleaned pages, found_header_str, found_footer_str)
        """

        # header
        start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
        found_header = self.find_longest_common_ngram(start_of_pages)
        if found_header:
            pages = [page.replace(found_header, "") for page in pages]

        # footer
        end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
        found_footer = self.find_longest_common_ngram(end_of_pages)
        if found_footer:
            pages = [page.replace(found_footer, "") for page in pages]
        return pages, found_header, found_footer
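To make the header/footer heuristic above concrete, here is a small toy run. The page strings are invented, and instantiating the converter assumes pdftotext, PyMuPDF and langdetect are installed, since the constructor checks for pdftotext:

from haystack.indexing.file_converters.pdftotext import PDFToTextConverter

converter = PDFToTextConverter()  # the constructor verifies that pdftotext is available

# invented pages sharing a recurring title line and a recurring copyright line;
# real pages would come from extract_pages()
pages = [
    "Table of Contents",
    "ACME Corporation Annual Report 2019\nRevenue in the first segment grew.\nCopyright 2019 by ACME",
    "ACME Corporation Annual Report 2019\nThe second segment saw other results.\nCopyright 2019 by ACME",
    "Appendix",
]
cleaned_pages, header, footer = converter.find_and_remove_header_footer(
    pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
)
print(header)  # should be the recurring "ACME Corporation Annual Report 2019" line
print(footer)  # should be the recurring copyright line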
@@ -4,57 +4,53 @@ from farm.data_handler.utils import http_get
 import tempfile
 import tarfile
 import zipfile
+from typing import Callable
+from haystack.indexing.file_converters.pdftotext import PDFToTextConverter

 logger = logging.getLogger(__name__)


-def write_documents_to_db(document_store, document_dir, clean_func=None, only_empty_db=False, split_paragraphs=False):
+def convert_files_to_dicts(dir_path: str, clean_func: Callable = None, split_paragraphs: bool = False) -> [dict]:
     """
-    Write all text files(.txt) in the sub-directories of the given path to the connected database.
+    Convert all files(.txt, .pdf) in the sub-directories of the given path to Python dicts that can be written to a
+    Document Store.

-    :param document_dir: path for the documents to be written to the database
+    :param dir_path: path for the documents to be written to the database
     :param clean_func: a custom cleaning function that gets applied to each doc (input: str, output:str)
-    :param only_empty_db: If true, docs will only be written if db is completely empty.
-                          Useful to avoid indexing the same initial docs again and again.
     :param split_paragraphs: split text in paragraphs.
+
+    :return: None
     """
-    file_paths = Path(document_dir).glob("**/*.txt")
-
-    # check if db has already docs
-    if only_empty_db:
-        n_docs = document_store.get_document_count()
-        if n_docs > 0:
-            logger.info(f"Skip writing documents since DB already contains {n_docs} docs ... "
-                        "(Disable `only_empty_db`, if you want to add docs anyway.)")
-            return None
+    file_paths = [p for p in Path(dir_path).glob("**/*")]
+    if ".pdf" in [p.suffix.lower() for p in file_paths]:
+        pdf_converter = PDFToTextConverter()
+    else:
+        pdf_converter = None

     # read and add docs
-    docs_to_index = []
+    documents = []
     for path in file_paths:
-        with open(path) as doc:
-            text = doc.read()
-            if clean_func:
-                text = clean_func(text)
+        if path.suffix.lower() == ".txt":
+            with open(path) as doc:
+                text = doc.read()
+        elif path.suffix.lower() == ".pdf":
+            pages = pdf_converter.extract_pages(path)
+            text = "\n".join(pages)
+        else:
+            raise Exception(f"Indexing of {path.suffix} files is not currently supported.")

-        if split_paragraphs:
-            for para in text.split("\n\n"):
-                if not para.strip():  # skip empty paragraphs
-                    continue
-                docs_to_index.append(
-                    {
-                        "name": path.name,
-                        "text": para
-                    }
-                )
-        else:
-            docs_to_index.append(
-                {
-                    "name": path.name,
-                    "text": text
-                }
-            )
-    document_store.write_documents(docs_to_index)
-    logger.info(f"Wrote {len(docs_to_index)} docs to DB")
+        if clean_func:
+            text = clean_func(text)
+
+        if split_paragraphs:
+            for para in text.split("\n\n"):
+                if not para.strip():  # skip empty paragraphs
+                    continue
+                documents.append({"name": path.name, "text": para})
+        else:
+            documents.append({"name": path.name, "text": text})
+
+    return documents


 def fetch_archive_from_http(url, output_dir, proxies=None):
@@ -97,3 +93,4 @@ def fetch_archive_from_http(url, output_dir, proxies=None):
         archive.extractall(output_dir)
     # temp_file gets deleted here
     return True
+
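For reference, a short sketch of how the new helper is meant to be used downstream (the directory name is illustrative):

from haystack.indexing.utils import convert_files_to_dicts

# "data/my_docs" is a made-up folder containing .txt and/or .pdf files
dicts = convert_files_to_dicts(dir_path="data/my_docs", split_paragraphs=True)
# each entry looks like {"name": "<file name>", "text": "<paragraph or full text>"}
# and can be passed to a DocumentStore, e.g. document_store.write_documents(dicts)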
@@ -9,4 +9,6 @@ elasticsearch
 elastic-apm
 tox
 coverage
+langdetect # for PDF conversions
+PyMuPDF # for PDF conversions
 # optional: sentence-transformers
@@ -1,7 +1,8 @@
 import tarfile
 import time
 import urllib.request
-from subprocess import Popen, PIPE, STDOUT
+
+from subprocess import Popen, PIPE, STDOUT, run

 import pytest

@@ -19,3 +20,19 @@ def elasticsearch_fixture(elasticsearch_dir):
     thetarfile.extractall(path=elasticsearch_dir)
     es_server = Popen([elasticsearch_dir / "elasticsearch-7.6.1/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT)
     time.sleep(30)
+
+
+@pytest.fixture(scope="session")
+def xpdf_fixture():
+    verify_installation = run(["pdftotext"], shell=True)
+    if verify_installation.returncode == 127:
+        commands = """ wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.02.tar.gz &&
+                       tar -xvf xpdf-tools-linux-4.02.tar.gz && sudo cp xpdf-tools-linux-4.02/bin64/pdftotext /usr/local/bin"""
+        run([commands], shell=True)
+
+        verify_installation = run(["pdftotext -v"], shell=True)
+        if verify_installation.returncode == 127:
+            raise Exception(
+                """pdftotext is not installed. It is part of the xpdf or poppler-utils software suite.
+                   You can download it for your OS from here: https://www.xpdfreader.com/download.html."""
+            )
BIN  test/samples/pdf/sample_pdf_1.pdf  Normal file
Binary file not shown.
BIN  test/samples/pdf/sample_pdf_2.pdf  Normal file
Binary file not shown.
@@ -2,12 +2,13 @@ from time import sleep

 from haystack.database.elasticsearch import ElasticsearchDocumentStore
 from haystack.database.sql import SQLDocumentStore
-from haystack.indexing.io import write_documents_to_db
+from haystack.indexing.utils import convert_files_to_dicts


 def test_sql_write_read():
     sql_document_store = SQLDocumentStore()
-    write_documents_to_db(document_store=sql_document_store, document_dir="samples/docs")
+    documents = convert_files_to_dicts(dir_path="samples/docs")
+    sql_document_store.write_documents(documents)
     documents = sql_document_store.get_all_documents()
     assert len(documents) == 2
     doc = sql_document_store.get_document_by_id("1")
@@ -17,7 +18,8 @@ def test_sql_write_read():

 def test_elasticsearch_write_read(elasticsearch_fixture):
     document_store = ElasticsearchDocumentStore()
-    write_documents_to_db(document_store=document_store, document_dir="samples/docs")
+    documents = convert_files_to_dicts(dir_path="samples/docs")
+    document_store.write_documents(documents)
     sleep(2)  # wait for documents to be available for query
     documents = document_store.get_all_documents()
     assert len(documents) == 2
@@ -2,7 +2,7 @@ def test_module_imports():
     from haystack import Finder
     from haystack.database.sql import SQLDocumentStore
     from haystack.indexing.cleaning import clean_wiki_text
-    from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
+    from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
     from haystack.reader.farm import FARMReader
     from haystack.reader.transformers import TransformersReader
     from haystack.retriever.tfidf import TfidfRetriever
@@ -11,7 +11,7 @@ def test_module_imports():
     assert Finder is not None
     assert SQLDocumentStore is not None
     assert clean_wiki_text is not None
-    assert write_documents_to_db is not None
+    assert convert_files_to_dicts is not None
     assert fetch_archive_from_http is not None
     assert FARMReader is not None
     assert TransformersReader is not None
52  test/test_pdf_conversion.py  Normal file
@@ -0,0 +1,52 @@
import logging
from pathlib import Path

from haystack.indexing.file_converters.pdftotext import PDFToTextConverter

logger = logging.getLogger(__name__)


def test_extract_pages(xpdf_fixture):
    converter = PDFToTextConverter()
    pages = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
    assert len(pages) == 4  # the sample PDF file has four pages.
    assert pages[0] != ""  # page 1 of the PDF contains text.
    assert pages[2] == ""  # page 3 of the PDF file is empty.


def test_table_removal(xpdf_fixture):
    converter = PDFToTextConverter(remove_numeric_tables=True)
    pages = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))

    # assert numeric rows are removed from the table.
    assert "324" not in pages[0]
    assert "54x growth" not in pages[0]
    assert "$54.35" not in pages[0]

    # assert text is retained from the document.
    assert "Adobe Systems made the PDF specification available free of charge in 1993." in pages[0]


def test_language_validation(xpdf_fixture, caplog):
    converter = PDFToTextConverter(valid_languages=["en"])
    pages = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
    assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['en']." not in caplog.text

    converter = PDFToTextConverter(valid_languages=["de"])
    pages = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
    assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['de']." in caplog.text


def test_header_footer_removal(xpdf_fixture):
    converter = PDFToTextConverter(remove_header_footer=True)
    converter_no_removal = PDFToTextConverter(remove_header_footer=False)

    pages1 = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
    pages2 = converter_no_removal.extract_pages(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
    for p1, p2 in zip(pages1, pages2):
        assert p1 == p2

    pages = converter.extract_pages(file_path=Path("samples/pdf/sample_pdf_2.pdf"))  # file contains header and footer
    for page in pages:
        assert "header" not in page
        assert "footer" not in page
@@ -42,7 +42,7 @@
    "source": [
     "from haystack import Finder\n",
     "from haystack.indexing.cleaning import clean_wiki_text\n",
-    "from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http\n",
+    "from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http\n",
     "from haystack.reader.farm import FARMReader\n",
     "from haystack.reader.transformers import TransformersReader\n",
     "from haystack.utils import print_answers"
@@ -164,11 +164,13 @@
     "s3_url = \"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip\"\n",
     "fetch_archive_from_http(url=s3_url, output_dir=doc_dir)\n",
     "\n",
-    "\n",
-    "# Now, let's write the docs to our DB.\n",
+    "# Convert files to dicts\n",
     "# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)\n",
     "# It must take a str as input, and return a str.\n",
-    "write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True, split_paragraphs=True)"
+    "dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)\n",
+    "\n",
+    "# Now, let's write the dicts containing documents to our DB.\n",
+    "document_store.write_documents(dicts)"
    ]
   },
   {
@@ -16,7 +16,7 @@ import time
 from haystack import Finder
 from haystack.database.elasticsearch import ElasticsearchDocumentStore
 from haystack.indexing.cleaning import clean_wiki_text
-from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
+from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
 from haystack.reader.farm import FARMReader
 from haystack.reader.transformers import TransformersReader
 from haystack.utils import print_answers
@@ -69,10 +69,14 @@ doc_dir = "data/article_txt_got"
 s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
 fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

-# Now, let's write the docs to our DB.
+# convert files to dicts containing documents that can be indexed to our datastore
+dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
 # You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
 # It must take a str as input, and return a str.
-write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True, split_paragraphs=True)
+
+# Now, let's write the docs to our DB.
+document_store.write_documents(dicts)

+
 # ## Initalize Retriever, Reader, & Finder
 #
@@ -35,7 +35,7 @@
    "source": [
     "from haystack import Finder\n",
     "from haystack.indexing.cleaning import clean_wiki_text\n",
-    "from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http\n",
+    "from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http\n",
     "from haystack.reader.farm import FARMReader\n",
     "from haystack.reader.transformers import TransformersReader\n",
     "from haystack.utils import print_answers"
@@ -110,11 +110,13 @@
     "s3_url = \"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip\"\n",
     "fetch_archive_from_http(url=s3_url, output_dir=doc_dir)\n",
     "\n",
-    "\n",
-    "# Now, let's write the docs to our DB.\n",
+    "# convert files to dicts containing documents that can be indexed to our datastore\n",
     "# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)\n",
     "# It must take a str as input, and return a str.\n",
-    "write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True, split_paragraphs=True)"
+    "dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)\n",
+    "\n",
+    "# Now, let's write the docs to our DB.\n",
+    "document_store.write_documents(dicts)"
    ]
   },
   {
@@ -11,7 +11,7 @@ from haystack import Finder
 from haystack.database.memory import InMemoryDocumentStore
 from haystack.database.sql import SQLDocumentStore
 from haystack.indexing.cleaning import clean_wiki_text
-from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
+from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
 from haystack.reader.farm import FARMReader
 from haystack.reader.transformers import TransformersReader
 from haystack.retriever.tfidf import TfidfRetriever
@@ -37,11 +37,13 @@ doc_dir = "data/article_txt_got"
 s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
 fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

-# Now, let's write the docs to our DB.
+# convert files to dicts containing documents that can be indexed to our datastore
+dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
 # You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
 # It must take a str as input, and return a str.
-write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True, split_paragraphs=True)

+# Now, let's write the docs to our DB.
+document_store.write_documents(dicts)


 # ## Initalize Retriever, Reader, & Finder
@@ -1635,7 +1635,7 @@
   },
   "source": [
    "\n",
-   "from haystack.indexing.io import fetch_archive_from_http\n",
+   "from haystack.indexing.utils import fetch_archive_from_http\n",
    "\n",
    "# Download evaluation data, which is a subset of Natural Questions development set containing 50 documents\n",
    "doc_dir = \"../data/nq\"\n",
@@ -1,5 +1,5 @@
 from haystack.database.elasticsearch import ElasticsearchDocumentStore
-from haystack.indexing.io import fetch_archive_from_http
+from haystack.indexing.utils import fetch_archive_from_http
 from haystack.retriever.elasticsearch import ElasticsearchRetriever
 from haystack.reader.farm import FARMReader
 from haystack.finder import Finder