import os
import sys
from pathlib import Path

import pytest

from haystack import Document
from haystack.nodes.file_converter.pdf import PDFToTextConverter
from haystack.nodes.preprocessor.preprocessor import PreProcessor

from ..conftest import SAMPLES_PATH


NLTK_TEST_MODELS = SAMPLES_PATH.absolute() / "preprocessor" / "nltk_models"
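
# Sample text used by most tests below: three paragraphs of five sentences each,
# two form-feed page breaks ("\f"), and an abbreviation ("Dr.") that a sentence
# tokenizer should not mistake for a sentence boundary.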
TEXT = """
This is a sample sentence in paragraph_1. This is a sample sentence in paragraph_1. This is a sample sentence in
paragraph_1. This is a sample sentence in paragraph_1. This is a sample sentence in paragraph_1.\f

This is a sample sentence in paragraph_2. This is a sample sentence in paragraph_2. This is a sample sentence in
paragraph_2. This is a sample sentence in paragraph_2. This is a sample sentence in paragraph_2.

This is a sample sentence in paragraph_3. This is a sample sentence in paragraph_3. This is a sample sentence in
paragraph_3. This is a sample sentence in paragraph_3. This is to trick the test with using an abbreviation\f like Dr.
in the sentence.
"""
LEGAL_TEXT_PT = """
A Lei nº 9.514/1997, que instituiu a alienação fiduciária de
bens imóveis, é norma especial e posterior ao Código de Defesa do
Consumidor – CDC. Em tais circunstâncias, o inadimplemento do
devedor fiduciante enseja a aplicação da regra prevista nos arts. 26 e 27
da lei especial” (REsp 1.871.911/SP, rel. Min. Nancy Andrighi, DJe
25/8/2020).

A Emenda Constitucional n. 35 alterou substancialmente esse mecanismo,
ao determinar, na nova redação conferida ao art. 53: “§ 3º Recebida a
denúncia contra o Senador ou Deputado, por crime ocorrido após a
diplomação, o Supremo Tribunal Federal dará ciência à Casa respectiva, que,
por iniciativa de partido político nela representado e pelo voto da maioria de
seus membros, poderá, até a decisão final, sustar o andamento da ação”.
Vale ressaltar, contudo, que existem, antes do encaminhamento ao
Presidente da República, os chamados autógrafos. Os autógrafos ocorrem já
com o texto definitivamente aprovado pelo Plenário ou pelas comissões,
quando for o caso. Os autógrafos devem reproduzir com absoluta fidelidade a
redação final aprovada. O projeto aprovado será encaminhado em autógrafos
ao Presidente da República. O tema encontra-se regulamentado pelo art. 200
do RICD e arts. 328 a 331 do RISF.
"""
@pytest.mark.parametrize("split_length_and_results", [(1, 15), (10, 2)])
def test_preprocess_sentence_split(split_length_and_results):
    split_length, expected_documents_count = split_length_and_results

    document = Document(content=TEXT)
    preprocessor = PreProcessor(
        split_length=split_length, split_overlap=0, split_by="sentence", split_respect_sentence_boundary=False
    )
    documents = preprocessor.process(document)
    assert len(documents) == expected_documents_count
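

# A tokenizer_model_folder that does not contain a valid model for the requested
# language is expected to make the PreProcessor fall back to the default
# tokenizer, so the split counts match the plain sentence-split test above.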
@pytest.mark.parametrize("split_length_and_results", [(1, 15), (10, 2)])
def test_preprocess_sentence_split_custom_models_wrong_file_format(split_length_and_results):
    split_length, expected_documents_count = split_length_and_results

    document = Document(content=TEXT)
    preprocessor = PreProcessor(
        split_length=split_length,
        split_overlap=0,
        split_by="sentence",
        split_respect_sentence_boundary=False,
        tokenizer_model_folder=NLTK_TEST_MODELS / "wrong",
        language="en",
    )
    documents = preprocessor.process(document)
    assert len(documents) == expected_documents_count
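

# A non-default language ("ca") without a custom model folder should likewise
# fall back gracefully and split this English text like the default tokenizer.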
@pytest.mark.parametrize("split_length_and_results", [(1, 15), (10, 2)])
def test_preprocess_sentence_split_custom_models_non_default_language(split_length_and_results):
    split_length, expected_documents_count = split_length_and_results

    document = Document(content=TEXT)
    preprocessor = PreProcessor(
        split_length=split_length,
        split_overlap=0,
        split_by="sentence",
        split_respect_sentence_boundary=False,
        language="ca",
    )
    documents = preprocessor.process(document)
    assert len(documents) == expected_documents_count
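

# With the custom Portuguese NLTK models from NLTK_TEST_MODELS, the legal text
# is split into 8 sentences.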
@pytest.mark.parametrize("split_length_and_results", [(1, 8), (8, 1)])
def test_preprocess_sentence_split_custom_models(split_length_and_results):
    split_length, expected_documents_count = split_length_and_results

    document = Document(content=LEGAL_TEXT_PT)
    preprocessor = PreProcessor(
        split_length=split_length,
        split_overlap=0,
        split_by="sentence",
        split_respect_sentence_boundary=False,
        language="pt",
        tokenizer_model_folder=NLTK_TEST_MODELS,
    )
    documents = preprocessor.process(document)
    assert len(documents) == expected_documents_count
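

# Splitting by word, with and without respecting sentence boundaries and with
# overlap. When sentence boundaries are respected, a document may hold fewer
# words than split_length so that no sentence is cut in half.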
def test_preprocess_word_split():
    document = Document(content=TEXT)
    preprocessor = PreProcessor(
        split_length=10, split_overlap=0, split_by="word", split_respect_sentence_boundary=False
    )
    documents = preprocessor.process(document)
    assert len(documents) == 11

    preprocessor = PreProcessor(split_length=15, split_overlap=0, split_by="word", split_respect_sentence_boundary=True)
    documents = preprocessor.process(document)
    for i, doc in enumerate(documents):
        if i == 0:
            assert len(doc.content.split(" ")) == 14
        assert len(doc.content.split(" ")) <= 15 or doc.content.startswith("This is to trick")
    assert len(documents) == 8

    preprocessor = PreProcessor(
        split_length=40, split_overlap=10, split_by="word", split_respect_sentence_boundary=True
    )
    documents = preprocessor.process(document)
    assert len(documents) == 5

    preprocessor = PreProcessor(split_length=5, split_overlap=0, split_by="word", split_respect_sentence_boundary=True)
    documents = preprocessor.process(document)
    assert len(documents) == 15
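

# Splitting by passage: TEXT contains three paragraphs separated by blank lines.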
@pytest.mark.parametrize("split_length_and_results", [(1, 3), (2, 2)])
def test_preprocess_passage_split(split_length_and_results):
    split_length, expected_documents_count = split_length_and_results

    document = Document(content=TEXT)
    preprocessor = PreProcessor(
        split_length=split_length, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False
    )
    documents = preprocessor.process(document)
    assert len(documents) == expected_documents_count
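

# The sample PDF contains a header and a footer, which clean_header_footer=True
# should detect and strip from the extracted text.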
@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="FIXME Footer not detected correctly on Windows")
def test_clean_header_footer():
    converter = PDFToTextConverter()
    document = converter.convert(
        file_path=Path(SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf")
    )  # file contains header/footer

    preprocessor = PreProcessor(clean_header_footer=True, split_by=None)
    documents = preprocessor.process(document)

    assert len(documents) == 1
    assert "This is a header." not in documents[0].content
    assert "footer" not in documents[0].content
def test_remove_substrings():
    document = Document(content="This is a header. Some additional text. wiki. Some emoji ✨ 🪲 Weird whitespace\b\b\b.")

    # check that the document contains the substrings we are about to remove
    assert "This is a header." in document.content
    assert "wiki" in document.content
    assert "🪲" in document.content
    assert "whitespace" in document.content
    assert "✨" in document.content

    preprocessor = PreProcessor(remove_substrings=["This is a header.", "wiki", "🪲"])
    documents = preprocessor.process(document)

    assert "This is a header." not in documents[0].content
    assert "wiki" not in documents[0].content
    assert "🪲" not in documents[0].content
    assert "whitespace" in documents[0].content
    assert "✨" in documents[0].content
def test_id_hash_keys_from_pipeline_params():
    document_1 = Document(content="This is a document.", meta={"key": "a"})
    document_2 = Document(content="This is a document.", meta={"key": "b"})
    assert document_1.id == document_2.id

    preprocessor = PreProcessor(split_length=2, split_respect_sentence_boundary=False)
    output, _ = preprocessor.run(documents=[document_1, document_2], id_hash_keys=["content", "meta"])
    documents = output["documents"]
    unique_ids = set(d.id for d in documents)

    assert len(documents) == 4
    assert len(unique_ids) == 4


# test_input is a tuple consisting of the parameters for split_length, split_overlap
# and split_respect_sentence_boundary, and the expected index in the output list of
# Documents where the page number changes from 1 to 2
@pytest.mark.parametrize("test_input", [(10, 0, True, 5), (10, 0, False, 4), (10, 5, True, 6), (10, 5, False, 7)])
def test_page_number_extraction(test_input):
    split_length, overlap, resp_sent_boundary, exp_doc_index = test_input
    preprocessor = PreProcessor(
        add_page_number=True,
        split_by="word",
        split_length=split_length,
        split_overlap=overlap,
        split_respect_sentence_boundary=resp_sent_boundary,
    )
    document = Document(content=TEXT)
    documents = preprocessor.process(document)
    for idx, doc in enumerate(documents):
        if idx < exp_doc_index:
            assert doc.meta["page"] == 1
        else:
            assert doc.meta["page"] == 2


def test_substitute_page_break():
    # Page breaks at the end of a sentence should be replaced by "[NEW_PAGE]",
    # while page breaks in the middle of a sentence should not be replaced.
    result = PreProcessor._substitute_page_breaks(TEXT)
    assert result[211:221] == "[NEW_PAGE]"
    assert result[654] == "\f"