diff --git a/docs/_src/api/api/preprocessor.md b/docs/_src/api/api/preprocessor.md
index 7d59a4e30..5a889b818 100644
--- a/docs/_src/api/api/preprocessor.md
+++ b/docs/_src/api/api/preprocessor.md
@@ -1,3 +1,22 @@
+
+# Module base
+
+
+## BasePreProcessor Objects
+
+```python
+class BasePreProcessor()
+```
+
+
+#### process
+
+```python
+ | process(document: dict) -> List[dict]
+```
+
+Perform document cleaning and splitting. Takes a single document as input and returns a list of documents.
+
 # Module preprocessor
 
 
@@ -12,7 +31,7 @@ class PreProcessor(BasePreProcessor)
 #### \_\_init\_\_
 
 ```python
- | __init__(clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_stride: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True)
+ | __init__(clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True)
 ```
 
 **Arguments**:
 
@@ -26,10 +45,12 @@ or similar.
 - `split_by`: Unit for splitting the document. Can be "word", "sentence", or "passage". Set to None to disable splitting.
-- `split_length`: Max. number of the above split unit (e.g. words) that are allowed in one document. For instance, if n -> 10 & split_by -> "sentence", then each output document will have 10 sentences.
-- `split_stride`: Length of striding window over the splits. For example, if split_by -> `word`,
-split_length -> 5 & split_stride -> 2, then the splits would be like:
-[w1 w2 w3 w4 w5, w4 w5 w6 w7 w8, w7 w8 w10 w11 w12].
-Set the value to None to disable striding behaviour.
+- `split_length`: Max. number of the above split unit (e.g. words) that are allowed in one document. For instance, if split_length -> 10 & split_by -> "sentence", then each output document will have 10 sentences.
+- `split_overlap`: Word overlap between two adjacent documents after a split.
+Setting this to a positive number essentially enables the sliding window approach.
+For example, if split_by -> `word`,
+split_length -> 5 & split_overlap -> 2, then the splits would be like:
+[w1 w2 w3 w4 w5, w4 w5 w6 w7 w8, w7 w8 w9 w10 w11].
+Set the value to None to ensure there is no overlap among the documents after splitting.
 - `split_respect_sentence_boundary`: Whether to split in partial sentences if split_by -> `word`. If set to True,
 the individual split will always have complete sentences &
 the number of words will be <= split_length.
@@ -52,22 +73,9 @@ and empty lines. Its exact functionality is defined by the parameters passed int
 ```
 
 Perform document splitting on a single document. This method can split on different units, at different lengths,
-with different strides. It can also respect sectence boundaries. Its exact functionality is defined by
+with different overlaps. It can also respect sentence boundaries. Its exact functionality is defined by
 the parameters passed into PreProcessor.__init__(). Takes a single document as input and returns a list of documents.
-
-# Module cleaning
-
-
-#### clean\_wiki\_text
-
-```python
-clean_wiki_text(text: str) -> str
-```
-
-Clean wikipedia text by removing multiple new lines, removing extremely short lines,
-adding paragraph breaks and removing empty paragraphs
-
 # Module utils
 
 
@@ -154,22 +162,16 @@ Fetch an archive (zip or tar.gz) from a url via http and extract content to an o
 bool if anything got fetched
 
-
-# Module base
+
+# Module cleaning
 
-
-## BasePreProcessor Objects
+
+#### clean\_wiki\_text
 
 ```python
-class BasePreProcessor()
+clean_wiki_text(text: str) -> str
 ```
 
-
-#### process
-
-```python
- | process(document: dict) -> List[dict]
-```
-
-Perform document cleaning and splitting. Takes a single document as input and returns a list of documents.
+Clean wikipedia text by removing multiple new lines, removing extremely short lines,
+adding paragraph breaks and removing empty paragraphs
diff --git a/docs/_src/api/api/pydoc-markdown-document-store.yml b/docs/_src/api/api/pydoc-markdown-document-store.yml
index e359c5293..863e7a00d 100644
--- a/docs/_src/api/api/pydoc-markdown-document-store.yml
+++ b/docs/_src/api/api/pydoc-markdown-document-store.yml
@@ -1,6 +1,7 @@
 loaders:
   - type: python
     search_path: [../../../../haystack/document_store]
+    modules: ['base', 'elasticsearch', 'memory', 'sql', 'faiss']
     ignore_when_discovered: ['__init__']
 processor:
   - type: filter
diff --git a/docs/_src/api/api/pydoc-markdown-file-converters.yml b/docs/_src/api/api/pydoc-markdown-file-converters.yml
index 2ec184ed6..574b7be71 100644
--- a/docs/_src/api/api/pydoc-markdown-file-converters.yml
+++ b/docs/_src/api/api/pydoc-markdown-file-converters.yml
@@ -1,6 +1,7 @@
 loaders:
   - type: python
     search_path: [../../../../haystack/file_converter]
+    modules: ['base', 'txt', 'docx', 'tika', 'pdf']
     ignore_when_discovered: ['__init__']
 processor:
   - type: filter
diff --git a/docs/_src/api/api/pydoc-markdown-generator.yml b/docs/_src/api/api/pydoc-markdown-generator.yml
index 8774bf8ca..ce6599a04 100644
--- a/docs/_src/api/api/pydoc-markdown-generator.yml
+++ b/docs/_src/api/api/pydoc-markdown-generator.yml
@@ -1,6 +1,7 @@
 loaders:
   - type: python
     search_path: [../../../../haystack/generator]
+    modules: ['base', 'transformers']
     ignore_when_discovered: ['__init__']
 processor:
   - type: filter
diff --git a/docs/_src/api/api/pydoc-markdown-preprocessor.yml b/docs/_src/api/api/pydoc-markdown-preprocessor.yml
index 973e6dd92..0f21f1da6 100644
--- a/docs/_src/api/api/pydoc-markdown-preprocessor.yml
+++ b/docs/_src/api/api/pydoc-markdown-preprocessor.yml
@@ -1,6 +1,7 @@
 loaders:
   - type: python
     search_path: [../../../../haystack/preprocessor]
+    modules: ['base', 'preprocessor', 'utils', 'cleaning']
     ignore_when_discovered: ['__init__']
 processor:
   - type: filter
diff --git a/docs/_src/api/api/pydoc-markdown-reader.yml b/docs/_src/api/api/pydoc-markdown-reader.yml
index 5cc216326..59a77a836 100644
--- a/docs/_src/api/api/pydoc-markdown-reader.yml
+++ b/docs/_src/api/api/pydoc-markdown-reader.yml
@@ -1,6 +1,7 @@
 loaders:
   - type: python
     search_path: [../../../../haystack/reader]
+    modules: ['base', 'farm', 'transformers']
     ignore_when_discovered: ['__init__']
 processor:
   - type: filter
diff --git a/docs/_src/api/api/pydoc-markdown-retriever.yml b/docs/_src/api/api/pydoc-markdown-retriever.yml
index 4ef2387d7..add60eb26 100644
--- a/docs/_src/api/api/pydoc-markdown-retriever.yml
+++ b/docs/_src/api/api/pydoc-markdown-retriever.yml
@@ -1,6 +1,7 @@
 loaders:
   - type: python
     search_path: [../../../../haystack/retriever]
+    modules: ['base', 'sparse', 'dense']
     ignore_when_discovered: ['__init__']
 processor:
   - type: filter
diff --git a/docs/_src/usage/usage/preprocessing.md b/docs/_src/usage/usage/preprocessing.md
index ee49db7ed..a4c7b3ed5 100644
--- a/docs/_src/usage/usage/preprocessing.md
+++ b/docs/_src/usage/usage/preprocessing.md
@@ -94,12 +94,15 @@ For suggestions on how best to split your documents, see [Optimization](/docs/la
 
 ```python
 doc = converter.convert(file_path=file, meta=None)
-processor = PreProcessor(clean_empty_lines=True,
-                         clean_whitespace=True,
-                         clean_header_footer=True,
-                         split_by="word",
-                         split_length=200,
-                         split_respect_sentence_boundary=True)
-docs = processor.process(d)
+processor = PreProcessor(
+    clean_empty_lines=True,
+    clean_whitespace=True,
+    clean_header_footer=True,
+    split_by="word",
+    split_length=200,
+    split_respect_sentence_boundary=True,
+    split_overlap=0
+)
+docs = processor.process(doc)
 ```
 
@@ -109,3 +112,5 @@ docs = processor.process(d)
 * `split_by` determines what unit the document is split by: `'word'`, `'sentence'` or `'passage'`
 * `split_length` sets a maximum number of `'word'`, `'sentence'` or `'passage'` units per output document
 * `split_respect_sentence_boundary` ensures that document boundaries do not fall in the middle of sentences
+* `split_overlap` sets the amount of overlap between two adjacent documents after a split. Setting this to a positive number essentially enables the sliding window approach.
+
diff --git a/haystack/preprocessor/preprocessor.py b/haystack/preprocessor/preprocessor.py
index 97b15934e..ed43936f3 100644
--- a/haystack/preprocessor/preprocessor.py
+++ b/haystack/preprocessor/preprocessor.py
@@ -21,7 +21,7 @@ class PreProcessor(BasePreProcessor):
         clean_empty_lines: Optional[bool] = True,
         split_by: Optional[str] = "word",
         split_length: Optional[int] = 1000,
-        split_stride: Optional[int] = None,
+        split_overlap: Optional[int] = None,
         split_respect_sentence_boundary: Optional[bool] = True,
     ):
         """
@@ -34,10 +34,12 @@ class PreProcessor(BasePreProcessor):
         :param split_by: Unit for splitting the document. Can be "word", "sentence", or "passage". Set to None to disable splitting.
-        :param split_length: Max. number of the above split unit (e.g. words) that are allowed in one document. For instance, if n -> 10 & split_by -> "sentence", then each output document will have 10 sentences.
-        :param split_stride: Length of striding window over the splits. For example, if split_by -> `word`,
-                             split_length -> 5 & split_stride -> 2, then the splits would be like:
-                             [w1 w2 w3 w4 w5, w4 w5 w6 w7 w8, w7 w8 w10 w11 w12].
-                             Set the value to None to disable striding behaviour.
+        :param split_length: Max. number of the above split unit (e.g. words) that are allowed in one document. For instance, if split_length -> 10 & split_by -> "sentence", then each output document will have 10 sentences.
+        :param split_overlap: Word overlap between two adjacent documents after a split.
+                              Setting this to a positive number essentially enables the sliding window approach.
+                              For example, if split_by -> `word`,
+                              split_length -> 5 & split_overlap -> 2, then the splits would be like:
+                              [w1 w2 w3 w4 w5, w4 w5 w6 w7 w8, w7 w8 w9 w10 w11].
+                              Set the value to None to ensure there is no overlap among the documents after splitting.
         :param split_respect_sentence_boundary: Whether to split in partial sentences if split_by -> `word`. If set to True,
                                                 the individual split will always have complete sentences &
                                                 the number of words will be <= split_length.
@@ -48,7 +50,7 @@ class PreProcessor(BasePreProcessor):
         self.clean_empty_lines = clean_empty_lines
         self.split_by = split_by
         self.split_length = split_length
-        self.split_stride = split_stride
+        self.split_overlap = split_overlap
         self.split_respect_sentence_boundary = split_respect_sentence_boundary
 
     def clean(self, document: dict) -> dict:
@@ -79,7 +81,7 @@ class PreProcessor(BasePreProcessor):
 
     def split(self, document: dict) -> List[dict]:
         """Perform document splitting on a single document. This method can split on different units, at different lengths,
-        with different strides. It can also respect sectence boundaries. Its exact functionality is defined by
+        with different overlaps. It can also respect sentence boundaries. Its exact functionality is defined by
         the parameters passed into PreProcessor.__init__(). Takes a single document as input and returns a list of documents.
         """
         if not self.split_by:
@@ -107,12 +109,12 @@
                 if word_count + current_word_count > self.split_length:
                     list_splits.append(current_slice)
-                    #Enable split_stride with split_by='word' while respecting sentence boundaries.
-                    if self.split_stride:
+                    # Enable split_overlap with split_by='word' while respecting sentence boundaries.
+                    if self.split_overlap:
                         overlap = []
                         w_count = 0
                         for s in current_slice[::-1]:
                             sen_len = len(s.split(" "))
-                            if w_count < self.split_stride:
+                            if w_count < self.split_overlap:
                                 overlap.append(s)
                                 w_count += sen_len
                             else:
@@ -139,8 +141,8 @@
             raise NotImplementedError("PreProcessor only supports 'passage' or 'sentence' split_by options.")
 
-        # concatenate individual elements based on split_length & split_stride
-        if self.split_stride:
-            segments = windowed(elements, n=self.split_length, step=self.split_length - self.split_stride)
+        # concatenate individual elements based on split_length & split_overlap
+        if self.split_overlap:
+            segments = windowed(elements, n=self.split_length, step=self.split_length - self.split_overlap)
         else:
             segments = windowed(elements, n=self.split_length, step=self.split_length)
         text_splits = []
diff --git a/test/test_preprocessor.py b/test/test_preprocessor.py
index 4e0220642..e04fbef89 100644
--- a/test/test_preprocessor.py
+++ b/test/test_preprocessor.py
@@ -21,12 +21,12 @@ in the sentence.
 @pytest.mark.tika
 def test_preprocess_sentence_split():
     document = {"text": TEXT}
-    preprocessor = PreProcessor(split_length=1, split_stride=0, split_by="sentence")
+    preprocessor = PreProcessor(split_length=1, split_overlap=0, split_by="sentence")
     documents = preprocessor.process(document)
     assert len(documents) == 15
 
     preprocessor = PreProcessor(
-        split_length=10, split_stride=0, split_by="sentence"
+        split_length=10, split_overlap=0, split_by="sentence"
     )
     documents = preprocessor.process(document)
     assert len(documents) == 2
@@ -35,11 +35,11 @@
 @pytest.mark.tika
 def test_preprocess_word_split():
     document = {"text": TEXT}
-    preprocessor = PreProcessor(split_length=10, split_stride=0, split_by="word", split_respect_sentence_boundary=False)
+    preprocessor = PreProcessor(split_length=10, split_overlap=0, split_by="word", split_respect_sentence_boundary=False)
     documents = preprocessor.process(document)
     assert len(documents) == 11
 
-    preprocessor = PreProcessor(split_length=15, split_stride=0, split_by="word", split_respect_sentence_boundary=True)
+    preprocessor = PreProcessor(split_length=15, split_overlap=0, split_by="word", split_respect_sentence_boundary=True)
     documents = preprocessor.process(document)
     for i,doc in enumerate(documents):
         if i == 0:
@@ -47,7 +47,7 @@ def test_preprocess_word_split():
             assert len(doc["text"].split(" ")) <= 15 or doc["text"].startswith("This is to trick")
     assert len(documents) == 8
 
-    preprocessor = PreProcessor(split_length=40, split_stride=10, split_by="word", split_respect_sentence_boundary=True)
+    preprocessor = PreProcessor(split_length=40, split_overlap=10, split_by="word", split_respect_sentence_boundary=True)
     documents = preprocessor.process(document)
     assert len(documents) == 5
 
@@ -55,11 +55,11 @@
 @pytest.mark.tika
 def test_preprocess_passage_split():
     document = {"text": TEXT}
-    preprocessor = PreProcessor(split_length=1, split_stride=0, split_by="passage", split_respect_sentence_boundary=False)
+    preprocessor = PreProcessor(split_length=1, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False)
     documents = preprocessor.process(document)
     assert len(documents) == 3
 
-    preprocessor = PreProcessor(split_length=2, split_stride=0, split_by="passage", split_respect_sentence_boundary=False)
+    preprocessor = PreProcessor(split_length=2, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False)
     documents = preprocessor.process(document)
     assert len(documents) == 2
diff --git a/tutorials/Tutorial7_RAG_Generator.ipynb b/tutorials/Tutorial7_RAG_Generator.ipynb
index 29512c3eb..4abb7ac89 100644
--- a/tutorials/Tutorial7_RAG_Generator.ipynb
+++ b/tutorials/Tutorial7_RAG_Generator.ipynb
@@ -5,6 +5,8 @@
    "source": [
     "# Generative QA with \"Retrieval-Augmented Generation\"\n",
     "\n",
+    "EXECUTABLE VERSION: [colab](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial7_RAG_Generator.ipynb)\n",
+    "\n",
     "While extractive QA highlights the span of text that answers a query,\n",
     "generative QA can return a novel text answer that it has composed.\n",
     "In this tutorial, you will learn how to set up a generative system using the\n",
diff --git a/tutorials/Tutorial8_Preprocessing.ipynb b/tutorials/Tutorial8_Preprocessing.ipynb
new file mode 100644
index 000000000..e9ed76c3d
--- /dev/null
+++ b/tutorials/Tutorial8_Preprocessing.ipynb
@@ -0,0 +1,504 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "source": [
+    "# Preprocessing\n",
+    "\n",
+    "EXECUTABLE VERSION: [colab](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial8_Preprocessing.ipynb)\n",
+    "\n",
+    "Haystack includes a suite of tools to extract text from different file types, normalize white space\n",
+    "and split text into smaller pieces to optimize retrieval.\n",
+    "These data preprocessing steps can have a big impact on the system's performance, and effective handling of data is key to getting the most out of Haystack."
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "Ultimately, Haystack expects data to be provided as a list of documents in the following dictionary format:\n",
+    "``` python\n",
+    "docs = [\n",
+    "    {\n",
+    "        'text': DOCUMENT_TEXT_HERE,\n",
+    "        'meta': {'name': DOCUMENT_NAME, ...}\n",
+    "    }, ...\n",
+    "]\n",
+    "```"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "This tutorial will show you all the tools that Haystack provides to help you cast your data into this format."
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "outputs": [],
+   "source": [
+    "# Let's start by installing Haystack\n",
+    "\n",
+    "# Install the latest release of Haystack in your own environment\n",
+    "#! pip install farm-haystack\n",
+    "\n",
+    "# Install the latest master of Haystack\n",
+    "!pip install git+https://github.com/deepset-ai/haystack.git\n",
+    "!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html\n",
+    "!wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.02.tar.gz\n",
+    "!tar -xvf xpdf-tools-linux-4.02.tar.gz && sudo cp xpdf-tools-linux-4.02/bin64/pdftotext /usr/local/bin"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "01/06/2021 14:49:14 - INFO - faiss - Loading faiss with AVX2 support.\n",
+      "01/06/2021 14:49:14 - INFO - faiss - Loading faiss.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Here are the imports we need\n",
+    "\n",
+    "from haystack.file_converter.txt import TextConverter\n",
+    "from haystack.file_converter.pdf import PDFToTextConverter\n",
+    "from haystack.file_converter.docx import DocxToTextConverter\n",
+    "\n",
+    "from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http\n",
+    "from haystack.preprocessor.preprocessor import PreProcessor"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "01/05/2021 12:02:30 - INFO - haystack.preprocessor.utils - Fetching from https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial.zip to `data/preprocessing_tutorial`\n",
+      "100%|██████████| 595119/595119 [00:00<00:00, 5299765.39B/s]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": "True"
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# This fetches some sample files to work with\n",
+    "\n",
+    "doc_dir = \"data/preprocessing_tutorial\"\n",
+    "s3_url = \"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial.zip\"\n",
+    "fetch_archive_from_http(url=s3_url, output_dir=doc_dir)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## Converters\n",
+    "\n",
+    "Haystack's converter classes are designed to help you turn files on your computer into the documents\n",
+    "that can be processed by the Haystack pipeline.\n",
+    "There are file converters for txt, pdf, docx files as well as a converter that is powered by Apache Tika."
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "outputs": [],
+   "source": [
+    "# Here are some examples of how you would use file converters\n",
+    "\n",
+    "converter = TextConverter(remove_numeric_tables=True, valid_languages=[\"en\"])\n",
+    "doc_txt = converter.convert(file_path=\"data/preprocessing_tutorial/classics.txt\", meta=None)\n",
+    "\n",
+    "converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=[\"en\"])\n",
+    "doc_pdf = converter.convert(file_path=\"data/preprocessing_tutorial/bert.pdf\", meta=None)\n",
+    "\n",
+    "converter = DocxToTextConverter(remove_numeric_tables=True, valid_languages=[\"en\"])\n",
+    "doc_docx = converter.convert(file_path=\"data/preprocessing_tutorial/heavy_metal.docx\", meta=None)\n"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "01/06/2021 14:51:06 - INFO - haystack.preprocessor.utils - Converting data/preprocessing_tutorial/heavy_metal.docx\n",
+      "01/06/2021 14:51:06 - INFO - haystack.preprocessor.utils - Converting data/preprocessing_tutorial/bert.pdf\n",
+      "01/06/2021 14:51:07 - INFO - haystack.preprocessor.utils - Converting data/preprocessing_tutorial/classics.txt\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Haystack also has a convenience function that will automatically apply the right converter to each file in a directory.\n",
+    "\n",
+    "all_docs = convert_files_to_dicts(dir_path=\"data/preprocessing_tutorial\")"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## PreProcessor\n",
+    "\n",
+    "The PreProcessor class is designed to help you clean text and split text into sensible units.\n",
+    "File splitting can have a very significant impact on the system's performance and is absolutely mandatory for Dense Passage Retrieval models.\n",
+    "In general, we recommend you split the text from your files into small documents of around 100 words for dense retrieval methods\n",
+    "and no more than 10,000 words for sparse methods.\n",
+    "Have a look at the [Preprocessing](https://haystack.deepset.ai/docs/latest/preprocessingmd)\n",
+    "and [Optimization](https://haystack.deepset.ai/docs/latest/optimizationmd) pages on our website for more details."
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "n_docs_input: 1\n",
+      "n_docs_output: 51\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package punkt to /home/branden/nltk_data...\n",
+      "[nltk_data] Package punkt is already up-to-date!\n"
+     ]
+    }
+   ],
+   "source": [
+    "# This is a default usage of the PreProcessor.\n",
+    "# Here, it performs cleaning of consecutive whitespaces\n",
+    "# and splits a single large document into smaller documents.\n",
+    "# Each document is up to 100 words long and document breaks cannot fall in the middle of sentences\n",
+    "# Note how the single document passed into the PreProcessor gets split into 51 smaller documents\n",
+    "\n",
+    "preprocessor = PreProcessor(\n",
+    "    clean_empty_lines=True,\n",
+    "    clean_whitespace=True,\n",
+    "    clean_header_footer=False,\n",
+    "    split_by=\"word\",\n",
+    "    split_length=100,\n",
+    "    split_respect_sentence_boundary=True\n",
+    ")\n",
+    "docs_default = preprocessor.process(doc_txt)\n",
+    "print(f\"n_docs_input: 1\\nn_docs_output: {len(docs_default)}\")"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## Cleaning\n",
+    "\n",
+    "- `clean_empty_lines` will normalize 3 or more consecutive empty lines to be just two empty lines\n",
+    "- `clean_whitespace` will remove any whitespace at the beginning or end of each line in the text\n",
+    "- `clean_header_footer` will remove any long header or footer texts that are repeated on each page"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## Splitting\n",
+    "By default, the PreProcessor will respect sentence boundaries, meaning that documents will not start or end\n",
+    "midway through a sentence.\n",
+    "This will help reduce the possibility of answer phrases being split between two documents.\n",
+    "This feature can be turned off by setting `split_respect_sentence_boundary=False`."
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "RESPECTING SENTENCE BOUNDARY\n",
+      "End of document: \"...cornerstone of a typical elite European education.\"\n",
+      "\n",
+      "NOT RESPECTING SENTENCE BOUNDARY\n",
+      "End of document: \"...on. In England, for instance, Oxford and Cambridge\"\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package punkt to /home/branden/nltk_data...\n",
+      "[nltk_data] Package punkt is already up-to-date!\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Not respecting sentence boundary vs respecting sentence boundary\n",
+    "\n",
+    "preprocessor_nrsb = PreProcessor(split_respect_sentence_boundary=False)\n",
+    "docs_nrsb = preprocessor_nrsb.process(doc_txt)\n",
+    "\n",
+    "print(\"RESPECTING SENTENCE BOUNDARY\")\n",
+    "end_text = docs_default[0][\"text\"][-50:]\n",
+    "print(\"End of document: \\\"...\" + end_text + \"\\\"\")\n",
+    "print()\n",
+    "print(\"NOT RESPECTING SENTENCE BOUNDARY\")\n",
+    "end_text_nrsb = docs_nrsb[0][\"text\"][-50:]\n",
+    "print(\"End of document: \\\"...\" + end_text_nrsb + \"\\\"\")"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "A commonly used strategy to split long documents, especially in the field of Question Answering,\n",
+    "is the sliding window approach. If `split_length=10` and `split_overlap=3`, your documents will look like this:\n",
+    "\n",
+    "- doc1 = words[0:10]\n",
+    "- doc2 = words[7:17]\n",
+    "- doc3 = words[14:24]\n",
+    "- ...\n",
+    "\n",
+    "You can use this strategy by following the code below."
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "# Sliding window approach\n",
+    "\n",
+    "preprocessor_sliding_window = PreProcessor(\n",
+    "    split_overlap=3,\n",
+    "    split_length=10,\n",
+    "    split_respect_sentence_boundary=False\n",
+    ")\n",
+    "docs_sliding_window = preprocessor_sliding_window.process(doc_txt)\n",
+    "\n",
+    "doc1 = docs_sliding_window[0][\"text\"][:200]\n",
+    "doc2 = docs_sliding_window[1][\"text\"][:100]\n",
+    "doc3 = docs_sliding_window[2][\"text\"][:100]\n",
+    "\n",
+    "print(\"Document 1: \\\"\" + doc1 + \"...\\\"\")\n",
+    "print(\"Document 2: \\\"\" + doc2 + \"...\\\"\")\n",
+    "print(\"Document 3: \\\"\" + doc3 + \"...\\\"\")"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "execution_count": 12,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Document 1: \"Classics or classical studies is the study of classical antiquity,...\"\n",
+      "Document 2: \"of classical antiquity, and in the Western world traditionally refers...\"\n",
+      "Document 3: \"world traditionally refers to the study of Classical Greek and...\"\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package punkt to /home/branden/nltk_data...\n",
+      "[nltk_data] Package punkt is already up-to-date!\n"
+     ]
+    }
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## Bringing it all together"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "01/06/2021 14:56:12 - INFO - haystack.preprocessor.utils - Converting data/preprocessing_tutorial/heavy_metal.docx\n",
+      "01/06/2021 14:56:12 - INFO - haystack.preprocessor.utils - Converting data/preprocessing_tutorial/bert.pdf\n",
+      "01/06/2021 14:56:12 - INFO - haystack.preprocessor.utils - Converting data/preprocessing_tutorial/classics.txt\n",
+      "[nltk_data] Downloading package punkt to /home/branden/nltk_data...\n",
+      "[nltk_data] Package punkt is already up-to-date!\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "n_files_input: 3\n",
+      "n_docs_output: 150\n"
+     ]
+    }
+   ],
+   "source": [
+    "all_docs = convert_files_to_dicts(dir_path=\"data/preprocessing_tutorial\")\n",
+    "preprocessor = PreProcessor(\n",
+    "    clean_empty_lines=True,\n",
+    "    clean_whitespace=True,\n",
+    "    clean_header_footer=False,\n",
+    "    split_by=\"word\",\n",
+    "    split_length=100,\n",
+    "    split_respect_sentence_boundary=True\n",
+    ")\n",
+    "nested_docs = [preprocessor.process(d) for d in all_docs]\n",
+    "docs = [d for x in nested_docs for d in x]\n",
+    "\n",
+    "print(f\"n_files_input: {len(all_docs)}\\nn_docs_output: {len(docs)}\")"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/tutorials/Tutorial8_Preprocessing.py b/tutorials/Tutorial8_Preprocessing.py
new file mode 100644
index 000000000..c3fe7e374
--- /dev/null
+++ b/tutorials/Tutorial8_Preprocessing.py
@@ -0,0 +1,142 @@
+"""
+Preprocessing
+
+Haystack includes a suite of tools to extract text from different file types, normalize white space
+and split text into smaller pieces to optimize retrieval.
+These data preprocessing steps can have a big impact on the system's performance, and effective handling of data is key to getting the most out of Haystack.
+
+Ultimately, Haystack pipelines expect data to be provided as a list of documents in the following dictionary format:
+
+docs = [
+    {
+        'text': DOCUMENT_TEXT_HERE,
+        'meta': {'name': DOCUMENT_NAME, ...}
+    }, ...
+]
+
+This tutorial will show you all the tools that Haystack provides to help you cast your data into the right format.
+"""
+
+# Here are the imports we need
+
+from haystack.file_converter.txt import TextConverter
+from haystack.file_converter.pdf import PDFToTextConverter
+from haystack.file_converter.docx import DocxToTextConverter
+
+from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
+from haystack.preprocessor.preprocessor import PreProcessor
+
+# This fetches some sample files to work with
+
+doc_dir = "data/preprocessing_tutorial"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial.zip"
+fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
+
+"""
+## Converters
+
+Haystack's converter classes are designed to help you turn files on your computer into the documents
+that can be processed by the Haystack pipeline.
+There are file converters for txt, pdf, docx files as well as a converter that is powered by Apache Tika.
+"""
+
+# Here are some examples of how you would use file converters
+
+converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
+doc_txt = converter.convert(file_path="data/preprocessing_tutorial/classics.txt", meta=None)
+
+converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
+doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf", meta=None)
+
+converter = DocxToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
+doc_docx = converter.convert(file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)
+
+# Haystack also has a convenience function that will automatically apply the right converter to each file in a directory.
+
+all_docs = convert_files_to_dicts(dir_path="data/preprocessing_tutorial")
+
+"""
+
+## PreProcessor
+
+The PreProcessor class is designed to help you clean text and split text into sensible units.
+File splitting can have a very significant impact on the system's performance.
+Have a look at the [Preprocessing](https://haystack.deepset.ai/docs/latest/preprocessingmd)
+and [Optimization](https://haystack.deepset.ai/docs/latest/optimizationmd) pages on our website for more details.
+"""
+
+
+# This is a default usage of the PreProcessor.
+# Here, it performs cleaning of consecutive whitespaces
+# and splits a single large document into smaller documents.
+# Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences
+# Note how the single document passed into the PreProcessor gets split into 5 smaller documents
+
+preprocessor = PreProcessor(
+    clean_empty_lines=True,
+    clean_whitespace=True,
+    clean_header_footer=False,
+    split_by="word",
+    split_length=1000,
+    split_respect_sentence_boundary=True
+)
+docs_default = preprocessor.process(doc_txt)
+print(f"n_docs_input: 1\nn_docs_output: {len(docs_default)}")
+
+"""
+## Cleaning
+
+- `clean_empty_lines` will normalize 3 or more consecutive empty lines to be just two empty lines
+- `clean_whitespace` will remove any whitespace at the beginning or end of each line in the text
+- `clean_header_footer` will remove any long header or footer texts that are repeated on each page
+
+## Splitting
+By default, the PreProcessor will respect sentence boundaries, meaning that documents will not start or end
+midway through a sentence.
+This will help reduce the possibility of answer phrases being split between two documents.
+This feature can be turned off by setting `split_respect_sentence_boundary=False`.
+"""
+
+# Not respecting sentence boundary vs respecting sentence boundary
+
+preprocessor_nrsb = PreProcessor(split_respect_sentence_boundary=False)
+docs_nrsb = preprocessor_nrsb.process(doc_txt)
+
+print("RESPECTING SENTENCE BOUNDARY")
+end_text = docs_default[0]["text"][-50:]
+print("End of document: \"..." + end_text + "\"")
+print()
+print("NOT RESPECTING SENTENCE BOUNDARY")
+end_text_nrsb = docs_nrsb[0]["text"][-50:]
+print("End of document: \"..." + end_text_nrsb + "\"")
+
+"""
+A commonly used strategy to split long documents, especially in the field of Question Answering,
+is the sliding window approach. If `split_length=10` and `split_overlap=3`, your documents will look like this:
+
+- doc1 = words[0:10]
+- doc2 = words[7:17]
+- doc3 = words[14:24]
+- ...
+
+You can use this strategy by following the code below.
+""" + +# Sliding window approach + +preprocessor_sliding_window = PreProcessor( + split_overlap=3, + split_length=10, + split_respect_sentence_boundary=False +) +docs_sliding_window = preprocessor_sliding_window.process(doc_txt) + +doc1 = docs_sliding_window[0]["text"][:200] +doc2 = docs_sliding_window[1]["text"][:100] +doc3 = docs_sliding_window[2]["text"][:100] + +print("Document 1: \"" + doc1 + "...\"") +print("Document 2: \"" + doc2 + "...\"") +print("Document 3: \"" + doc3 + "...\"") + +