haystack/test/nodes/test_file_converter.py

import os
import subprocess

import pytest

from haystack.nodes import (
    MarkdownConverter,
    DocxToTextConverter,
    PDFToTextConverter,
    PDFToTextOCRConverter,
    TikaConverter,
    AzureConverter,
    ParsrConverter,
)

from ..conftest import SAMPLES_PATH


@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter, PDFToTextOCRConverter])
def test_convert(Converter):
    converter = Converter()
    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")[0]["documents"][0]
    pages = document.content.split("\f")

    assert (
        len(pages) != 1 and pages[0] != ""
    ), f'{type(converter).__name__} did return a single empty page indicating a potential issue with your installed poppler version. Try installing via "conda install -c conda-forge poppler" and check test_pdftoppm_command_format()'

    assert len(pages) == 4  # the sample PDF file has four pages.
    assert pages[0] != ""  # the page 1 of PDF contains text.
    assert pages[2] == ""  # the page 3 of PDF file is empty.
    # assert text is retained from the document.
    # As whitespace can differ (\n," ", etc.), we standardize all to simple whitespace
    page_standard_whitespace = " ".join(pages[0].split())
    assert "Adobe Systems made the PDF specification available free of charge in 1993." in page_standard_whitespace


def test_pdftoppm_command_format():
    # Haystack's PDFToTextOCRConverter uses pdf2image, which calls pdftoppm internally.
    # Some installations of pdftoppm are incompatible with Haystack and won't raise an error but just return empty converted documents
    # This test runs pdftoppm directly to check whether pdftoppm accepts the command format that pdf2image uses in Haystack
    proc = subprocess.Popen(
        ["pdftoppm", f"{SAMPLES_PATH}/pdf/sample_pdf_1.pdf"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    out, err = proc.communicate()
    # If usage info of pdftoppm is sent to stderr then it's because Haystack's pdf2image uses an incompatible command format
    assert (
        not err
    ), 'Your installation of poppler is incompatible with Haystack. Try installing via "conda install -c conda-forge poppler"'


@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_encoding(Converter):
    converter = Converter()

    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf")[0]["documents"][0]
    assert "ɪ" in document.content

    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf", encoding="Latin1")[0]["documents"][0]
    assert "ɪ" not in document.content


@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_ligatures(Converter):
    converter = Converter()

    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf")[0]["documents"][0]
    assert "ﬀ" not in document.content
    assert "ɪ" in document.content

    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf", known_ligatures={})[0]["documents"][
        0
    ]
    assert "ﬀ" in document.content
    assert "ɪ" in document.content

    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf", known_ligatures={"ɪ": "i"})[0][
        "documents"
    ][0]
    assert "ﬀ" in document.content
    assert "ɪ" not in document.content


@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_table_removal(Converter):
    converter = Converter(remove_numeric_tables=True)
    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")[0]
    pages = document.content.split("\f")
    # assert numeric rows are removed from the table.
    assert "324" not in pages[0]
    assert "54x growth" not in pages[0]


@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_language_validation(Converter, caplog):
    converter = Converter(valid_languages=["en"])
    converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")
    assert "samples/pdf/sample_pdf_1.pdf is not one of ['en']." not in caplog.text

    converter = Converter(valid_languages=["de"])
    converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")
    assert "samples/pdf/sample_pdf_1.pdf is not one of ['de']." in caplog.text


def test_docx_converter():
    converter = DocxToTextConverter()
    document = converter.convert(file_path=SAMPLES_PATH / "docx" / "sample_docx.docx")[0]
    assert document.content.startswith("Sample Docx File")


def test_markdown_converter():
    converter = MarkdownConverter()
    document = converter.convert(file_path=SAMPLES_PATH / "markdown" / "sample.md")[0]
    assert document.content.startswith("What to build with Haystack")


def test_azure_converter():
    # Check if Form Recognizer endpoint and credential key in environment variables
    if "AZURE_FORMRECOGNIZER_ENDPOINT" in os.environ and "AZURE_FORMRECOGNIZER_KEY" in os.environ:
        converter = AzureConverter(
            endpoint=os.environ["AZURE_FORMRECOGNIZER_ENDPOINT"],
            credential_key=os.environ["AZURE_FORMRECOGNIZER_KEY"],
            save_json=True,
        )

        docs = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")
        assert len(docs) == 2
        assert docs[0].content_type == "table"
        assert docs[0].content.shape[0] == 4  # number of rows
        assert docs[0].content.shape[1] == 5  # number of columns, Form Recognizer assumes there are 5 columns
        assert list(docs[0].content.columns) == ["", "Column 1", "", "Column 2", "Column 3"]
        assert list(docs[0].content.iloc[3]) == ["D", "$54.35", "", "$6345.", ""]
        assert (
            docs[0].meta["preceding_context"] == "specification. These proprietary technologies are not "
            "standardized and their\nspecification is published only on "
            "Adobe's website. Many of them are also not\nsupported by "
            "popular third-party implementations of PDF."
        )
        assert docs[0].meta["following_context"] == ""

        assert docs[1].content_type == "text"
        assert docs[1].content.startswith("A sample PDF file")


def test_parsr_converter():
    converter = ParsrConverter()

    docs = converter.convert(file_path=str((SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf").absolute()))
    assert len(docs) == 2
    assert docs[0].content_type == "table"
    assert docs[0].content.shape[0] == 4  # number of rows
    assert docs[0].content.shape[1] == 4
    assert list(docs[0].content.columns) == ["", "Column 1", "Column 2", "Column 3"]
    assert list(docs[0].content.iloc[3]) == ["D", "$54.35", "$6345.", ""]
    assert (
        docs[0].meta["preceding_context"] == "speciﬁcation. These proprietary technologies are not "
        "standardized and their\nspeciﬁcation is published only on "
        "Adobe's website. Many of them are also not\nsupported by popular "
        "third-party implementations of PDF."
    )
    assert docs[0].meta["following_context"] == ""

    assert docs[1].content_type == "text"
    assert docs[1].content.startswith("A sample PDF ﬁle")
    assert docs[1].content.endswith("Page 4 of Sample PDF\n… the page 3 is empty.")
-												Add AzureConverter to support table parsing from documents (#1813)

* Add FormRecognizerConverter

* Change signature of convert method + change return type of all converters

* Adapt preprocessing util to new return type of converters

* Parametrize number of lines used for surrounding context of table

* Change name from FormRecognizerConverter to AzureConverter

* Set version of azure-ai-formrecognizer package

* Change tutorial 8 based on new return type of converters

* Add tests

* Add latest docstring and tutorial changes

* Fix typo

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
											
										
										
											2021-11-29 18:44:20 +01:00
+								import os
-												Remove encoding option from PDFToTextOCRConverter (#2553)

* remove encoding option from PDFToTextOCRConverter

* Update Documentation & Code Style

* add unused 'encoding' param to PDFToTextOCRConverter

* Update Documentation & Code Style

* call run instead of convert to use ligature replacing

* Update Documentation & Code Style

* add text to check installed poppler version

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-05-24 11:31:32 +02:00
+								import subprocess
-												Add PDF text extraction (#109)


											
										
										
											2020-06-08 11:07:19 +02:00
-												Add Tika Converter (#314)

											
										
										
											2020-08-17 11:21:09 +02:00
+								import pytest
-												Revert "Add Tika Converter (#314)"

This reverts commit 5ef59b1901da6d51bfa085683321a243228d4fc9.

											
										
										
											2020-08-17 11:13:52 +02:00
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								from haystack.nodes import (
 								    MarkdownConverter,
 								    DocxToTextConverter,
 								    PDFToTextConverter,
 								    PDFToTextOCRConverter,
 								    TikaConverter,
 								    AzureConverter,
 								    ParsrConverter,
 								)
-												Add PDF text extraction (#109)


											
										
										
											2020-06-08 11:07:19 +02:00
-												[CI refactoring] Categorize tests into folders (#2554)

* Categorize tests into folders

* Fix linux_ci.yml and an import

* Wrong path
											
										
										
											2022-05-17 10:55:53 +02:00
+								from ..conftest import SAMPLES_PATH
-												Improve dependency management (#1994)

* Fist attempt at using setup.cfg for dependency management

* Trying the new package on the CI and in Docker too

* Add composite extras_require

* Add the safe_import function for document store imports and add some try-catch statements on rest_api and ui imports

* Fix bug on class import and rephrase error message

* Introduce typing for optional modules and add type: ignore in sparse.py

* Include importlib_metadata backport for py3.7

* Add colab group to extra_requires

* Fix pillow version

* Fix grpcio

* Separate out the crawler as another extra

* Make paths relative in rest_api and ui

* Update the test matrix in the CI

* Add try catch statements around the optional imports too to account for direct imports

* Never mix direct deps with self-references and add ES deps to the base install

* Refactor several paths in tests to make them insensitive to the execution path

* Include tstadel review and re-introduce Milvus1 in the tests suite, to fix

* Wrap pdf conversion utils into safe_import

* Update some tutorials and rever Milvus1 as default for now, see #2067

* Fix mypy config


Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-01-26 18:12:55 +01:00
-												[RAG] Integrate "Retrieval-Augmented Generation" with Haystack (#484)

* Adding dummy generator implementation

* Adding tutorial to try the model

* Committing current non working code

* Committing current update where we need to call generate function directly and need to convert embedding to tensor way

* Addressing review comments.

* Refactoring finder, and implementing rag_generator class.

* Refined the implementation of RAGGenerator and now it is in clean shape

* Renaming RAGGenerator to RAGenerator

* Reverting change from finder.py and addressing review comments

* Remove support for RagSequenceForGeneration

* Utilizing embed_passage function from DensePassageRetriever

* Adding sample test data to verify generator output

* Updating testing script

* Updating testing script

* Fixing bug related to top_k

* Updating latest farm dependency

* Comment out farm dependency

* Reverting changes from TransformersReader

* Adding transformers dataset to compare transformers and haystack generator implementation

* Using generator_encoder instead of question_encoder to generate context_input_ids

* Adding workaround to install FARM dependency from master branch

* Removing unnecessary changes

* Fixing generator test

* Removing transformers datasets

* Fixing generator test

* Some cleanup and updating TODO comments

* Adding tutorial notebook

* Updating tutorials with comments

* Explicitly passing token model in RAG test

* Addressing review comments

* Fixing notebook

* Refactoring tests to reduce memory footprint

* Split generator tests in separate ci step and before running it reclaim memory by terminating containers

* Moving tika dependent test to separate dir

* Remove unwanted code

* Brining reader under session scope

* Farm is now session object hence restoring changes from default value

* Updating assert for pdf converter

* Dummy commit to trigger CI flow

* REducing memory footprint required for generator tests

* Fixing mypy issues

* Marking test with tika and elasticsearch markers. Reverting changes in CI and pytest splits

* reducing changes

* Fixing CI

* changing elastic search ci

* Fixing test error

* Disabling return of embedding

* Marking generator test as well

* Refactoring tutorials

* Increasing ES memory to 750M

* Trying another fix for ES CI

* Reverting CI changes

* Splitting tests in CI

* Generator and non-generator markers split

* Adding pytest.ini to add markers and enable strict-markers option

* Reducing elastic search container memory

* Simplifying generator test by using documents with embedding directly

* Bump up farm to 0.5.0
											
										
										
											2020-10-30 18:06:02 +01:00
+								@pytest.mark.tika
-												Remove encoding option from PDFToTextOCRConverter (#2553)

* remove encoding option from PDFToTextOCRConverter

* Update Documentation & Code Style

* add unused 'encoding' param to PDFToTextOCRConverter

* Update Documentation & Code Style

* call run instead of convert to use ligature replacing

* Update Documentation & Code Style

* add text to check installed poppler version

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-05-24 11:31:32 +02:00
+								@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter, PDFToTextOCRConverter])
-												Add CI for windows runner (#1458)

* Feat: Removing use of temp file while downloading archive from url along with adding CI for windows and mac platform

* Windows CI by default installing pytorch gpu hence updating CI to pick cpu version

* fixing mac cache build issue

* updating windows pip install command for torch

* another attempt

* updating ci

* Adding sudo

* fixing ls failure on windows

* another attempt to fix build issue

* Saving env variable of test files

* Adding debug log

* Github action differ on windows

* adding debug

* anohter attempt

* Windows have different ways to receive env

* fixing template

* minor fx

* Adding debug

* Removing use of json

* Adding back fromJson

* addin toJson

* removing print

* anohter attempt

* disabling parallel run at least for testing

* installing docker for mac runner

* correcting docker install command

* Linux dockers are not suported in windows

* Removing mac changes

* Upgrading pytorch

* using lts pytorch

* Separating win and ubuntu

* Install java 11

* enabling linux container env

* docker cli command

* docker cli command

* start elastic service

* List all service

* correcting service name

* Attempt to fix multiple test run

* convert to json

* another attempt to check

* Updating build cache step

* attempt

* Add tika

* Separating windows CI

* Changing CI name

* Skipping test which does not work in windows

* Skipping tests for windows

* create cleanup function in conftest

* adding skipif marker on tests

* Run windows PR on only push to master

* Addressing review comments

* Enabling windows ci for this PR

* Tika init is being called when importing tika function

* handling tika import issue

* handling tika import issue in test

* Fixing import issue

* removing tika fixure

* Removing fixture from tests

* Disable windows ci on pull request

* Add back extra pytorch install step

Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
											
										
										
											2021-10-29 13:52:28 +05:30
+								def test_convert(Converter):
-												Add Tika Converter (#314)

											
										
										
											2020-08-17 11:21:09 +02:00
+								    converter = Converter()
-												Remove encoding option from PDFToTextOCRConverter (#2553)

* remove encoding option from PDFToTextOCRConverter

* Update Documentation & Code Style

* add unused 'encoding' param to PDFToTextOCRConverter

* Update Documentation & Code Style

* call run instead of convert to use ligature replacing

* Update Documentation & Code Style

* add text to check installed poppler version

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-05-24 11:31:32 +02:00
+								    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")[0]["documents"][0]
-												Change return types of indexing pipeline nodes (#2342)

* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils to functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Docuemnt in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-29 13:53:35 +02:00
+								    pages = document.content.split("\f")
-												Remove encoding option from PDFToTextOCRConverter (#2553)

* remove encoding option from PDFToTextOCRConverter

* Update Documentation & Code Style

* add unused 'encoding' param to PDFToTextOCRConverter

* Update Documentation & Code Style

* call run instead of convert to use ligature replacing

* Update Documentation & Code Style

* add text to check installed poppler version

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-05-24 11:31:32 +02:00
 								    assert (
 								        len(pages) != 1 and pages[0] != ""
 								    ), f'{type(converter).__name__} did return a single empty page indicating a potential issue with your installed poppler version. Try installing via "conda install -c conda-forge poppler" and check test_pdftoppm_command_format()'
-												Add PDF text extraction (#109)


											
										
										
											2020-06-08 11:07:19 +02:00
+								    assert len(pages) == 4  # the sample PDF file has four pages.
 								    assert pages[0] != ""  # the page 1 of PDF contains text.
 								    assert pages[2] == ""  # the page 3 of PDF file is empty.
-												Add ImageToTextConverter and PDFToTextOCRConverter that utilize OCR (#1349)

* add image.py converter

* add PDFtoImageConverter

* add init to PDFtoImageConverter and classes to __init__

* update imagetotext pipeline

* update imagetotext pipeline

* update imagetotext pipeline

* update imagetotext pipeline

* update imagetotext pipeline

* update imagetotext pipeline

* update imagetotext pipeline

* revert change in base.py in file_conv

* Update base.py

* Update pdf.py

* add ocr file_converter testcase & update dockerfile

* fix tesseract exception message typo

* fix _image_to_text doctstring

* add tesseract installation to CI

* add tesseract installation to CI

* add content test for PDF OCR converter

* update PDFToTextOCRConverter constructor doctsring

* replace image files with tmp paths for image.py convert

* replace image files with tmp paths for image.py convert

* Update README.md

Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
											
										
										
											2021-09-01 16:42:25 +02:00
+								    # assert text is retained from the document.
 								    # As whitespace can differ (\n," ", etc.), we standardize all to simple whitespace
 								    page_standard_whitespace = " ".join(pages[0].split())
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    assert "Adobe Systems made the PDF specification available free of charge in 1993." in page_standard_whitespace
-												Add PDF text extraction (#109)


											
										
										
											2020-06-08 11:07:19 +02:00
-												Remove encoding option from PDFToTextOCRConverter (#2553)

* remove encoding option from PDFToTextOCRConverter

* Update Documentation & Code Style

* add unused 'encoding' param to PDFToTextOCRConverter

* Update Documentation & Code Style

* call run instead of convert to use ligature replacing

* Update Documentation & Code Style

* add text to check installed poppler version

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-05-24 11:31:32 +02:00
+								def test_pdftoppm_command_format():
 								    # Haystack's PDFToTextOCRConverter uses pdf2image, which calls pdftoppm internally.
 								    # Some installations of pdftoppm are incompatible with Haystack and won't raise an error but just return empty converted documents
 								    # This test runs pdftoppm directly to check whether pdftoppm accepts the command format that pdf2image uses in Haystack
 								    proc = subprocess.Popen(
 								        ["pdftoppm", f"{SAMPLES_PATH}/pdf/sample_pdf_1.pdf"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
 								    )
 								    out, err = proc.communicate()
 								    # If usage info of pdftoppm is sent to stderr then it's because Haystack's pdf2image uses an incompatible command format
 								    assert (
 								        not err
 								    ), 'Your installation of poppler is incompatible with Haystack. Try installing via "conda install -c conda-forge poppler"'
 								@pytest.mark.parametrize("Converter", [PDFToTextConverter])
-												Change default encoding for `PDFToTextConverter` from `Latin 1` to `UTF-8` (#2420)

* Change default encoding for PDFToTextConverter

* Update Documentation & Code Style

* Improve docstring

* Update Documentation & Code Style

* Add list of ligatures to ignore and add the possibility to modify such list at need

* Add docstring

* Add tests

* Rename parameter

* Update Documentation & Code Style

* Move implementation into the base converter to make mypy happier

* Update Documentation & Code Style

* mypy and pylint

* mypy

* move encoding parameter to init of PDFToTextConverter

* Update Documentation & Code Style

* make utf8 default and fix mypy

* Update Documentation & Code Style

* Update Documentation & Code Style

* remove note on encoding in tutorial8

* Update Documentation & Code Style

* skip OCRConverter and test converter.run

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Julian Risch <julian.risch@deepset.ai>
											
										
										
											2022-05-04 17:01:45 +02:00
+								def test_pdf_encoding(Converter):
 								    converter = Converter()
 								    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf")[0]["documents"][0]
 								    assert "ɪ" in document.content
 								    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf", encoding="Latin1")[0]["documents"][0]
 								    assert "ɪ" not in document.content
-												Remove encoding option from PDFToTextOCRConverter (#2553)

* remove encoding option from PDFToTextOCRConverter

* Update Documentation & Code Style

* add unused 'encoding' param to PDFToTextOCRConverter

* Update Documentation & Code Style

* call run instead of convert to use ligature replacing

* Update Documentation & Code Style

* add text to check installed poppler version

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-05-24 11:31:32 +02:00
+								@pytest.mark.parametrize("Converter", [PDFToTextConverter])
-												Change default encoding for `PDFToTextConverter` from `Latin 1` to `UTF-8` (#2420)

* Change default encoding for PDFToTextConverter

* Update Documentation & Code Style

* Improve docstring

* Update Documentation & Code Style

* Add list of ligatures to ignore and add the possibility to modify such list at need

* Add docstring

* Add tests

* Rename parameter

* Update Documentation & Code Style

* Move implementation into the base converter to make mypy happier

* Update Documentation & Code Style

* mypy and pylint

* mypy

* move encoding parameter to init of PDFToTextConverter

* Update Documentation & Code Style

* make utf8 default and fix mypy

* Update Documentation & Code Style

* Update Documentation & Code Style

* remove note on encoding in tutorial8

* Update Documentation & Code Style

* skip OCRConverter and test converter.run

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Julian Risch <julian.risch@deepset.ai>
											
										
										
											2022-05-04 17:01:45 +02:00
+								def test_pdf_ligatures(Converter):
 								    converter = Converter()
 								    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf")[0]["documents"][0]
 								    assert "ﬀ" not in document.content
 								    assert "ɪ" in document.content
 								    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf", known_ligatures={})[0]["documents"][
 
 								    ]
 								    assert "ﬀ" in document.content
 								    assert "ɪ" in document.content
 								    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf", known_ligatures={"ɪ": "i"})[0][
 								        "documents"
 								    ][0]
 								    assert "ﬀ" in document.content
 								    assert "ɪ" not in document.content
-												[RAG] Integrate "Retrieval-Augmented Generation" with Haystack (#484)

* Adding dummy generator implementation

* Adding tutorial to try the model

* Committing current non working code

* Committing current update where we need to call generate function directly and need to convert embedding to tensor way

* Addressing review comments.

* Refactoring finder, and implementing rag_generator class.

* Refined the implementation of RAGGenerator and now it is in clean shape

* Renaming RAGGenerator to RAGenerator

* Reverting change from finder.py and addressing review comments

* Remove support for RagSequenceForGeneration

* Utilizing embed_passage function from DensePassageRetriever

* Adding sample test data to verify generator output

* Updating testing script

* Updating testing script

* Fixing bug related to top_k

* Updating latest farm dependency

* Comment out farm dependency

* Reverting changes from TransformersReader

* Adding transformers dataset to compare transformers and haystack generator implementation

* Using generator_encoder instead of question_encoder to generate context_input_ids

* Adding workaround to install FARM dependency from master branch

* Removing unnecessary changes

* Fixing generator test

* Removing transformers datasets

* Fixing generator test

* Some cleanup and updating TODO comments

* Adding tutorial notebook

* Updating tutorials with comments

* Explicitly passing token model in RAG test

* Addressing review comments

* Fixing notebook

* Refactoring tests to reduce memory footprint

* Split generator tests in separate ci step and before running it reclaim memory by terminating containers

* Moving tika dependent test to separate dir

* Remove unwanted code

* Brining reader under session scope

* Farm is now session object hence restoring changes from default value

* Updating assert for pdf converter

* Dummy commit to trigger CI flow

* REducing memory footprint required for generator tests

* Fixing mypy issues

* Marking test with tika and elasticsearch markers. Reverting changes in CI and pytest splits

* reducing changes

* Fixing CI

* changing elastic search ci

* Fixing test error

* Disabling return of embedding

* Marking generator test as well

* Refactoring tutorials

* Increasing ES memory to 750M

* Trying another fix for ES CI

* Reverting CI changes

* Splitting tests in CI

* Generator and non-generator markers split

* Adding pytest.ini to add markers and enable strict-markers option

* Reducing elastic search container memory

* Simplifying generator test by using documents with embedding directly

* Bump up farm to 0.5.0
											
										
										
											2020-10-30 18:06:02 +01:00
+								@pytest.mark.tika
-												Add Tika Converter (#314)

											
										
										
											2020-08-17 11:21:09 +02:00
+								@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
-												Add CI for windows runner (#1458)

* Feat: Removing use of temp file while downloading archive from url along with adding CI for windows and mac platform

* Windows CI by default installing pytorch gpu hence updating CI to pick cpu version

* fixing mac cache build issue

* updating windows pip install command for torch

* another attempt

* updating ci

* Adding sudo

* fixing ls failure on windows

* another attempt to fix build issue

* Saving env variable of test files

* Adding debug log

* Github action differ on windows

* adding debug

* anohter attempt

* Windows have different ways to receive env

* fixing template

* minor fx

* Adding debug

* Removing use of json

* Adding back fromJson

* addin toJson

* removing print

* anohter attempt

* disabling parallel run at least for testing

* installing docker for mac runner

* correcting docker install command

* Linux dockers are not suported in windows

* Removing mac changes

* Upgrading pytorch

* using lts pytorch

* Separating win and ubuntu

* Install java 11

* enabling linux container env

* docker cli command

* docker cli command

* start elastic service

* List all service

* correcting service name

* Attempt to fix multiple test run

* convert to json

* another attempt to check

* Updating build cache step

* attempt

* Add tika

* Separating windows CI

* Changing CI name

* Skipping test which does not work in windows

* Skipping tests for windows

* create cleanup function in conftest

* adding skipif marker on tests

* Run windows PR on only push to master

* Addressing review comments

* Enabling windows ci for this PR

* Tika init is being called when importing tika function

* handling tika import issue

* handling tika import issue in test

* Fixing import issue

* removing tika fixure

* Removing fixture from tests

* Disable windows ci on pull request

* Add back extra pytorch install step

Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
											
										
										
											2021-10-29 13:52:28 +05:30
+								def test_table_removal(Converter):
-												Add Tika Converter (#314)

											
										
										
											2020-08-17 11:21:09 +02:00
+								    converter = Converter(remove_numeric_tables=True)
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")[0]
-												Change return types of indexing pipeline nodes (#2342)

* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils to functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Docuemnt in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-29 13:53:35 +02:00
+								    pages = document.content.split("\f")
-												Add PDF text extraction (#109)


											
										
										
											2020-06-08 11:07:19 +02:00
+								    # assert numeric rows are removed from the table.
 								    assert "324" not in pages[0]
 								    assert "54x growth" not in pages[0]
-												[RAG] Integrate "Retrieval-Augmented Generation" with Haystack (#484)

* Adding dummy generator implementation

* Adding tutorial to try the model

* Committing current non working code

* Committing current update where we need to call generate function directly and need to convert embedding to tensor way

* Addressing review comments.

* Refactoring finder, and implementing rag_generator class.

* Refined the implementation of RAGGenerator and now it is in clean shape

* Renaming RAGGenerator to RAGenerator

* Reverting change from finder.py and addressing review comments

* Remove support for RagSequenceForGeneration

* Utilizing embed_passage function from DensePassageRetriever

* Adding sample test data to verify generator output

* Updating testing script

* Updating testing script

* Fixing bug related to top_k

* Updating latest farm dependency

* Comment out farm dependency

* Reverting changes from TransformersReader

* Adding transformers dataset to compare transformers and haystack generator implementation

* Using generator_encoder instead of question_encoder to generate context_input_ids

* Adding workaround to install FARM dependency from master branch

* Removing unnecessary changes

* Fixing generator test

* Removing transformers datasets

* Fixing generator test

* Some cleanup and updating TODO comments

* Adding tutorial notebook

* Updating tutorials with comments

* Explicitly passing token model in RAG test

* Addressing review comments

* Fixing notebook

* Refactoring tests to reduce memory footprint

* Split generator tests in separate ci step and before running it reclaim memory by terminating containers

* Moving tika dependent test to separate dir

* Remove unwanted code

* Brining reader under session scope

* Farm is now session object hence restoring changes from default value

* Updating assert for pdf converter

* Dummy commit to trigger CI flow

* REducing memory footprint required for generator tests

* Fixing mypy issues

* Marking test with tika and elasticsearch markers. Reverting changes in CI and pytest splits

* reducing changes

* Fixing CI

* changing elastic search ci

* Fixing test error

* Disabling return of embedding

* Marking generator test as well

* Refactoring tutorials

* Increasing ES memory to 750M

* Trying another fix for ES CI

* Reverting CI changes

* Splitting tests in CI

* Generator and non-generator markers split

* Adding pytest.ini to add markers and enable strict-markers option

* Reducing elastic search container memory

* Simplifying generator test by using documents with embedding directly

* Bump up farm to 0.5.0
											
										
										
											2020-10-30 18:06:02 +01:00
+								@pytest.mark.tika
-												Add Tika Converter (#314)

											
										
										
											2020-08-17 11:21:09 +02:00
+								@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
-												Add CI for windows runner (#1458)

* Feat: Removing use of temp file while downloading archive from url along with adding CI for windows and mac platform

* Windows CI by default installing pytorch gpu hence updating CI to pick cpu version

* fixing mac cache build issue

* updating windows pip install command for torch

* another attempt

* updating ci

* Adding sudo

* fixing ls failure on windows

* another attempt to fix build issue

* Saving env variable of test files

* Adding debug log

* Github action differ on windows

* adding debug

* anohter attempt

* Windows have different ways to receive env

* fixing template

* minor fx

* Adding debug

* Removing use of json

* Adding back fromJson

* addin toJson

* removing print

* anohter attempt

* disabling parallel run at least for testing

* installing docker for mac runner

* correcting docker install command

* Linux dockers are not suported in windows

* Removing mac changes

* Upgrading pytorch

* using lts pytorch

* Separating win and ubuntu

* Install java 11

* enabling linux container env

* docker cli command

* docker cli command

* start elastic service

* List all service

* correcting service name

* Attempt to fix multiple test run

* convert to json

* another attempt to check

* Updating build cache step

* attempt

* Add tika

* Separating windows CI

* Changing CI name

* Skipping test which does not work in windows

* Skipping tests for windows

* create cleanup function in conftest

* adding skipif marker on tests

* Run windows PR on only push to master

* Addressing review comments

* Enabling windows ci for this PR

* Tika init is being called when importing tika function

* handling tika import issue

* handling tika import issue in test

* Fixing import issue

* removing tika fixure

* Removing fixture from tests

* Disable windows ci on pull request

* Add back extra pytorch install step

Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
											
										
										
											2021-10-29 13:52:28 +05:30
+								def test_language_validation(Converter, caplog):
-												Add Tika Converter (#314)

											
										
										
											2020-08-17 11:21:09 +02:00
+								    converter = Converter(valid_languages=["en"])
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")
 								    assert "samples/pdf/sample_pdf_1.pdf is not one of ['en']." not in caplog.text
-												Add PDF text extraction (#109)


											
										
										
											2020-06-08 11:07:19 +02:00
-												Add Tika Converter (#314)

											
										
										
											2020-08-17 11:21:09 +02:00
+								    converter = Converter(valid_languages=["de"])
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")
 								    assert "samples/pdf/sample_pdf_1.pdf is not one of ['de']." in caplog.text
-												Add PDF text extraction (#109)


											
										
										
											2020-06-08 11:07:19 +02:00
-												Revamp CI (#825)


											
										
										
											2021-02-12 13:38:54 +01:00
+								def test_docx_converter():
 								    converter = DocxToTextConverter()
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    document = converter.convert(file_path=SAMPLES_PATH / "docx" / "sample_docx.docx")[0]
-												Change return types of indexing pipeline nodes (#2342)

* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils to functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Docuemnt in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-29 13:53:35 +02:00
+								    assert document.content.startswith("Sample Docx File")
-												Add Markdown file convertor (#875)


											
										
										
											2021-03-23 16:31:26 +01:00
 								def test_markdown_converter():
 								    converter = MarkdownConverter()
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    document = converter.convert(file_path=SAMPLES_PATH / "markdown" / "sample.md")[0]
-												Change return types of indexing pipeline nodes (#2342)

* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils to functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Docuemnt in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-29 13:53:35 +02:00
+								    assert document.content.startswith("What to build with Haystack")
-												Add AzureConverter to support table parsing from documents (#1813)

* Add FormRecognizerConverter

* Change signature of convert method + change return type of all converters

* Adapt preprocessing util to new return type of converters

* Parametrize number of lines used for surrounding context of table

* Change name from FormRecognizerConverter to AzureConverter

* Set version of azure-ai-formrecognizer package

* Change tutorial 8 based on new return type of converters

* Add tests

* Add latest docstring and tutorial changes

* Fix typo

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
											
										
										
											2021-11-29 18:44:20 +01:00
 								def test_azure_converter():
 								    # Check if Form Recognizer endpoint and credential key in environment variables
 								    if "AZURE_FORMRECOGNIZER_ENDPOINT" in os.environ and "AZURE_FORMRECOGNIZER_KEY" in os.environ:
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								        converter = AzureConverter(
 								            endpoint=os.environ["AZURE_FORMRECOGNIZER_ENDPOINT"],
 								            credential_key=os.environ["AZURE_FORMRECOGNIZER_KEY"],
 								            save_json=True,
 								        )
-												Add AzureConverter to support table parsing from documents (#1813)

* Add FormRecognizerConverter

* Change signature of convert method + change return type of all converters

* Adapt preprocessing util to new return type of converters

* Parametrize number of lines used for surrounding context of table

* Change name from FormRecognizerConverter to AzureConverter

* Set version of azure-ai-formrecognizer package

* Change tutorial 8 based on new return type of converters

* Add tests

* Add latest docstring and tutorial changes

* Fix typo

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
											
										
										
											2021-11-29 18:44:20 +01:00
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								        docs = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")
-												Add AzureConverter to support table parsing from documents (#1813)

* Add FormRecognizerConverter

* Change signature of convert method + change return type of all converters

* Adapt preprocessing util to new return type of converters

* Parametrize number of lines used for surrounding context of table

* Change name from FormRecognizerConverter to AzureConverter

* Set version of azure-ai-formrecognizer package

* Change tutorial 8 based on new return type of converters

* Add tests

* Add latest docstring and tutorial changes

* Fix typo

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
											
										
										
											2021-11-29 18:44:20 +01:00
+								        assert len(docs) == 2
-												Change return types of indexing pipeline nodes (#2342)

* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils to functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Docuemnt in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-29 13:53:35 +02:00
+								        assert docs[0].content_type == "table"
 								        assert docs[0].content.shape[0] == 4  # number of rows
 								        assert docs[0].content.shape[1] == 5  # number of columns, Form Recognizer assumes there are 5 columns
 								        assert list(docs[0].content.columns) == ["", "Column 1", "", "Column 2", "Column 3"]
 								        assert list(docs[0].content.iloc[3]) == ["D", "$54.35", "", "$6345.", ""]
-												Fix surrounding context extraction in `ParsrConverter`  (#2162)

* Fix surrounding context extraction

* Update Documentation & Code Style

* Unify Parsr and Azure + add test

* Update Documentation & Code Style

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-24 14:58:36 +01:00
+								        assert (
-												Change return types of indexing pipeline nodes (#2342)

* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils to functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Docuemnt in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-29 13:53:35 +02:00
+								            docs[0].meta["preceding_context"] == "specification. These proprietary technologies are not "
-												Fix surrounding context extraction in `ParsrConverter`  (#2162)

* Fix surrounding context extraction

* Update Documentation & Code Style

* Unify Parsr and Azure + add test

* Update Documentation & Code Style

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-24 14:58:36 +01:00
+								            "standardized and their\nspecification is published only on "
 								            "Adobe's website. Many of them are also not\nsupported by "
 								            "popular third-party implementations of PDF."
 								        )
-												Change return types of indexing pipeline nodes (#2342)

* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils to functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Docuemnt in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-29 13:53:35 +02:00
+								        assert docs[0].meta["following_context"] == ""
-												Add AzureConverter to support table parsing from documents (#1813)

* Add FormRecognizerConverter

* Change signature of convert method + change return type of all converters

* Adapt preprocessing util to new return type of converters

* Parametrize number of lines used for surrounding context of table

* Change name from FormRecognizerConverter to AzureConverter

* Set version of azure-ai-formrecognizer package

* Change tutorial 8 based on new return type of converters

* Add tests

* Add latest docstring and tutorial changes

* Fix typo

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
											
										
										
											2021-11-29 18:44:20 +01:00
-												Change return types of indexing pipeline nodes (#2342)

* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils to functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Docuemnt in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-29 13:53:35 +02:00
+								        assert docs[1].content_type == "text"
 								        assert docs[1].content.startswith("A sample PDF file")
-												Add ParsrConverter (#1931)

* Add ParsrConverter

* Fix typing error + add Parsr to Linux CI

* Fix valid_language for all converters + fix context generation for ParsrConverter

* Remove ParsrConverter test from WindowsCI

* Add latest docstring and tutorial changes

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2021-12-30 10:15:11 +01:00
 								def test_parsr_converter():
 								    converter = ParsrConverter()
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    docs = converter.convert(file_path=str((SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf").absolute()))
-												Add ParsrConverter (#1931)

* Add ParsrConverter

* Fix typing error + add Parsr to Linux CI

* Fix valid_language for all converters + fix context generation for ParsrConverter

* Remove ParsrConverter test from WindowsCI

* Add latest docstring and tutorial changes

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2021-12-30 10:15:11 +01:00
+								    assert len(docs) == 2
-												Change return types of indexing pipeline nodes (#2342)

* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils to functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Docuemnt in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-29 13:53:35 +02:00
+								    assert docs[0].content_type == "table"
 								    assert docs[0].content.shape[0] == 4  # number of rows
 								    assert docs[0].content.shape[1] == 4
 								    assert list(docs[0].content.columns) == ["", "Column 1", "Column 2", "Column 3"]
 								    assert list(docs[0].content.iloc[3]) == ["D", "$54.35", "$6345.", ""]
-												Fix surrounding context extraction in `ParsrConverter`  (#2162)

* Fix surrounding context extraction

* Update Documentation & Code Style

* Unify Parsr and Azure + add test

* Update Documentation & Code Style

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-24 14:58:36 +01:00
+								    assert (
-												Change return types of indexing pipeline nodes (#2342)

* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils to functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Docuemnt in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-29 13:53:35 +02:00
+								        docs[0].meta["preceding_context"] == "speciﬁcation. These proprietary technologies are not "
-												Fix surrounding context extraction in `ParsrConverter`  (#2162)

* Fix surrounding context extraction

* Update Documentation & Code Style

* Unify Parsr and Azure + add test

* Update Documentation & Code Style

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-24 14:58:36 +01:00
+								        "standardized and their\nspeciﬁcation is published only on "
 								        "Adobe's website. Many of them are also not\nsupported by popular "
 								        "third-party implementations of PDF."
 								    )
-												Change return types of indexing pipeline nodes (#2342)

* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils to functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Docuemnt in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-29 13:53:35 +02:00
+								    assert docs[0].meta["following_context"] == ""
-												Add ParsrConverter (#1931)

* Add ParsrConverter

* Fix typing error + add Parsr to Linux CI

* Fix valid_language for all converters + fix context generation for ParsrConverter

* Remove ParsrConverter test from WindowsCI

* Add latest docstring and tutorial changes

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2021-12-30 10:15:11 +01:00
-												Change return types of indexing pipeline nodes (#2342)

* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils to functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Docuemnt in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-29 13:53:35 +02:00
+								    assert docs[1].content_type == "text"
 								    assert docs[1].content.startswith("A sample PDF ﬁle")
 								    assert docs[1].content.endswith("Page 4 of Sample PDF\n… the page 3 is empty.")