haystack/test/nodes/test_preprocessor.py

import sys
from pathlib import Path
import os

import pytest

from haystack import Document
from haystack.nodes.file_converter.pdf import PDFToTextConverter
from haystack.nodes.preprocessor.preprocessor import PreProcessor

from ..conftest import SAMPLES_PATH

NLTK_TEST_MODELS = SAMPLES_PATH.absolute() / "preprocessor" / "nltk_models"
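
# For reference only (nothing below executes it): a custom Punkt model like the ones kept
# under NLTK_TEST_MODELS can be trained and pickled with NLTK roughly as follows. This is
# a sketch, not the code that produced the sample models; `training_text` and the file
# name are placeholders:
#
#     import pickle
#     from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
#
#     trainer = PunktTrainer()
#     trainer.train(training_text, finalize=False)
#     trainer.finalize_training()
#     tokenizer = PunktSentenceTokenizer(trainer.get_params())
#     with open("pt.pickle", "wb") as model_file:
#         pickle.dump(tokenizer, model_file, protocol=4)  # protocol 4 keeps Python 3.7 compatibility

# TEXT contains 15 sentences: three paragraphs of five sentences each. The two form feeds
# ("\f") mark page breaks; the first sits on a sentence boundary at the end of paragraph_1,
# while the second falls mid-sentence, and the last paragraph ends with the abbreviation
# "Dr." to trip up naive sentence splitting.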
TEXT = """
This is a sample sentence in paragraph_1. This is a sample sentence in paragraph_1. This is a sample sentence in
paragraph_1. This is a sample sentence in paragraph_1. This is a sample sentence in paragraph_1.\f
This is a sample sentence in paragraph_2. This is a sample sentence in paragraph_2. This is a sample sentence in
paragraph_2. This is a sample sentence in paragraph_2. This is a sample sentence in paragraph_2.
This is a sample sentence in paragraph_3. This is a sample sentence in paragraph_3. This is a sample sentence in
paragraph_3. This is a sample sentence in paragraph_3. This is to trick the test with using an abbreviation\f like Dr.
in the sentence.
"""
LEGAL_TEXT_PT = """
A Lei 9.514/1997, que instituiu a alienação fiduciária de
bens imóveis, é norma especial e posterior ao Código de Defesa do
Consumidor CDC. Em tais circunstâncias, o inadimplemento do
devedor fiduciante enseja a aplicação da regra prevista nos arts. 26 e 27
da lei especial (REsp 1.871.911/SP, rel. Min. Nancy Andrighi, DJe
25/8/2020).
A Emenda Constitucional n. 35 alterou substancialmente esse mecanismo,
ao determinar, na nova redação conferida ao art. 53: § 3º Recebida a
denúncia contra o Senador ou Deputado, por crime ocorrido após a
diplomação, o Supremo Tribunal Federal dará ciência à Casa respectiva, que,
por iniciativa de partido político nela representado e pelo voto da maioria de
seus membros, poderá, até a decisão final, sustar o andamento da ação.
Vale ressaltar, contudo, que existem, antes do encaminhamento ao
Presidente da República, os chamados autógrafos. Os autógrafos ocorrem
com o texto definitivamente aprovado pelo Plenário ou pelas comissões,
quando for o caso. Os autógrafos devem reproduzir com absoluta fidelidade a
redação final aprovada. O projeto aprovado será encaminhado em autógrafos
ao Presidente da República. O tema encontra-se regulamentado pelo art. 200
do RICD e arts. 328 a 331 do RISF.
"""


@pytest.mark.parametrize("split_length_and_results", [(1, 15), (10, 2)])
def test_preprocess_sentence_split(split_length_and_results):
    split_length, expected_documents_count = split_length_and_results
    document = Document(content=TEXT)
    preprocessor = PreProcessor(
        split_length=split_length, split_overlap=0, split_by="sentence", split_respect_sentence_boundary=False
    )
    documents = preprocessor.process(document)
    assert len(documents) == expected_documents_count


@pytest.mark.parametrize("split_length_and_results", [(1, 15), (10, 2)])
def test_preprocess_sentence_split_custom_models_wrong_file_format(split_length_and_results):
    split_length, expected_documents_count = split_length_and_results
    document = Document(content=TEXT)
    # A broken model folder must not crash preprocessing: the load error is logged and the
    # PreProcessor falls back to the default tokenizer, so the expected counts match the
    # plain sentence-split test above.
    preprocessor = PreProcessor(
        split_length=split_length,
        split_overlap=0,
        split_by="sentence",
        split_respect_sentence_boundary=False,
        tokenizer_model_folder=NLTK_TEST_MODELS / "wrong",
        language="en",
    )
    documents = preprocessor.process(document)
    assert len(documents) == expected_documents_count


@pytest.mark.parametrize("split_length_and_results", [(1, 15), (10, 2)])
def test_preprocess_sentence_split_custom_models_non_default_language(split_length_and_results):
    split_length, expected_documents_count = split_length_and_results
    document = Document(content=TEXT)
    preprocessor = PreProcessor(
        split_length=split_length,
        split_overlap=0,
        split_by="sentence",
        split_respect_sentence_boundary=False,
        language="ca",
    )
    documents = preprocessor.process(document)
    assert len(documents) == expected_documents_count


@pytest.mark.parametrize("split_length_and_results", [(1, 8), (8, 1)])
def test_preprocess_sentence_split_custom_models(split_length_and_results):
    split_length, expected_documents_count = split_length_and_results
    document = Document(content=LEGAL_TEXT_PT)
    preprocessor = PreProcessor(
        split_length=split_length,
        split_overlap=0,
        split_by="sentence",
        split_respect_sentence_boundary=False,
        language="pt",
        tokenizer_model_folder=NLTK_TEST_MODELS,
    )
    documents = preprocessor.process(document)
    assert len(documents) == expected_documents_count


def test_preprocess_word_split():
    document = Document(content=TEXT)
    preprocessor = PreProcessor(
        split_length=10, split_overlap=0, split_by="word", split_respect_sentence_boundary=False
    )
    documents = preprocessor.process(document)
    assert len(documents) == 11

    preprocessor = PreProcessor(split_length=15, split_overlap=0, split_by="word", split_respect_sentence_boundary=True)
    documents = preprocessor.process(document)
    # With split_respect_sentence_boundary=True a chunk may exceed split_length when a
    # single sentence is longer than the limit (here: the "This is to trick ... Dr." sentence).
    for i, doc in enumerate(documents):
        if i == 0:
            assert len(doc.content.split(" ")) == 14
        assert len(doc.content.split(" ")) <= 15 or doc.content.startswith("This is to trick")
    assert len(documents) == 8

    preprocessor = PreProcessor(
        split_length=40, split_overlap=10, split_by="word", split_respect_sentence_boundary=True
    )
    documents = preprocessor.process(document)
    assert len(documents) == 5

    preprocessor = PreProcessor(split_length=5, split_overlap=0, split_by="word", split_respect_sentence_boundary=True)
    documents = preprocessor.process(document)
    assert len(documents) == 15


@pytest.mark.parametrize("split_length_and_results", [(1, 3), (2, 2)])
def test_preprocess_passage_split(split_length_and_results):
    split_length, expected_documents_count = split_length_and_results
    document = Document(content=TEXT)
    preprocessor = PreProcessor(
        split_length=split_length, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False
    )
    documents = preprocessor.process(document)
    assert len(documents) == expected_documents_count


@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="FIXME Footer not detected correctly on Windows")
def test_clean_header_footer():
    converter = PDFToTextConverter()
    document = converter.convert(
        file_path=Path(SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf")
    )  # file contains header/footer
    preprocessor = PreProcessor(clean_header_footer=True, split_by=None)
    documents = preprocessor.process(document)

    assert len(documents) == 1
    assert "This is a header." not in documents[0].content
    assert "footer" not in documents[0].content


def test_remove_substrings():
    document = Document(content="This is a header. Some additional text. wiki. Some emoji ✨ 🪲 Weird whitespace\b\b\b.")

    # check that the document contains the substrings we are about to remove
    assert "This is a header." in document.content
    assert "wiki" in document.content
    assert "🪲" in document.content
    assert "whitespace" in document.content
    assert "✨" in document.content

    preprocessor = PreProcessor(remove_substrings=["This is a header.", "wiki", "🪲"])
    documents = preprocessor.process(document)

    assert "This is a header." not in documents[0].content
    assert "wiki" not in documents[0].content
    assert "🪲" not in documents[0].content
    assert "whitespace" in documents[0].content
    assert "✨" in documents[0].content


def test_id_hash_keys_from_pipeline_params():
    document_1 = Document(content="This is a document.", meta={"key": "a"})
    document_2 = Document(content="This is a document.", meta={"key": "b"})
    assert document_1.id == document_2.id

    preprocessor = PreProcessor(split_length=2, split_respect_sentence_boundary=False)
    output, _ = preprocessor.run(documents=[document_1, document_2], id_hash_keys=["content", "meta"])
    documents = output["documents"]
    unique_ids = set(d.id for d in documents)

    # Hashing the meta alongside the content makes the two otherwise identical documents
    # (and their splits) produce distinct ids.
    assert len(documents) == 4
    assert len(unique_ids) == 4


# test_input is a tuple of the parameters for split_length, split_overlap, and
# split_respect_sentence_boundary, plus the expected index in the output list of
# Documents at which the page number changes from 1 to 2
@pytest.mark.parametrize("test_input", [(10, 0, True, 5), (10, 0, False, 4), (10, 5, True, 6), (10, 5, False, 7)])
def test_page_number_extraction(test_input):
    split_length, overlap, resp_sent_boundary, exp_doc_index = test_input
    preprocessor = PreProcessor(
        add_page_number=True,
        split_by="word",
        split_length=split_length,
        split_overlap=overlap,
        split_respect_sentence_boundary=resp_sent_boundary,
    )
    document = Document(content=TEXT)
    documents = preprocessor.process(document)
    for idx, doc in enumerate(documents):
        if idx < exp_doc_index:
            assert doc.meta["page"] == 1
        else:
            assert doc.meta["page"] == 2
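
# Both page-number tests lean on the form feed character ("\f") as the page marker:
# converters such as PDFToTextConverter emit one "\f" per page break, and every "\f"
# preceding a split appears to advance that split's "page" meta field by one, which is
# why an empty page still counts.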


def test_page_number_extraction_on_empty_pages():
    """
    Often "marketing" documents contain pages without text (visuals only). When extracting page numbers,
    these pages should be counted as well to avoid issues when mapping results back to the original document.
    """
    preprocessor = PreProcessor(add_page_number=True, split_by="word", split_length=7, split_overlap=0)
    text_page_one = "This is a text on page one."
    text_page_three = "This is a text on page three."
    # this is what we get from PDFToTextConverter in case of an "empty" page
    document_with_empty_pages = f"{text_page_one}\f\f{text_page_three}"
    document = Document(content=document_with_empty_pages)

    documents = preprocessor.process(document)

    assert documents[0].meta["page"] == 1
    assert documents[1].meta["page"] == 3
    # verify the placeholder for the empty page has been removed
    assert documents[0].content.strip() == text_page_one
    assert documents[1].content.strip() == text_page_three


def test_substitute_page_break():
    # Page breaks at the end of a sentence should be replaced by "[NEW_PAGE]", while page
    # breaks in the middle of a sentence should be left alone.
    result = PreProcessor._substitute_page_breaks(TEXT)
    assert result[211:221] == "[NEW_PAGE]"  # the "\f" closing paragraph_1 sits on a sentence boundary
    assert result[654] == "\f"  # the mid-sentence "\f" before "like Dr." is kept