haystack/test/nodes/test_file_converter.py

from typing import List

import os
import sys
from pathlib import Path
import subprocess
import csv

import pytest

from haystack import Document
from haystack.nodes import (
    MarkdownConverter,
    DocxToTextConverter,
    PDFToTextConverter,
    PDFToTextOCRConverter,
    TikaConverter,
    AzureConverter,
    ParsrConverter,
    TextConverter,
    CsvTextConverter,
    PreProcessor,
)

from ..conftest import SAMPLES_PATH


@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter, PDFToTextOCRConverter])
def test_convert(Converter):
    """Convert the four-page sample PDF and check page count, empty page and extracted text.

    The first assertion only targets the "single empty page" outcome that its message
    describes; every other mismatch is reported by the more specific assertions below it.
    """
    converter = Converter()
    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")[0]["documents"][0]
    pages = document.content.split("\f")

    # Exactly one page that is empty is the case the poppler hint is about. Comparing against
    # [""] keeps the hint from being printed for unrelated failures (e.g. a wrong page count).
    assert pages != [
        ""
    ], f'{type(converter).__name__} did return a single empty page indicating a potential issue with your installed poppler version. Try installing via "conda install -c conda-forge poppler" and check test_pdftoppm_command_format()'

    assert len(pages) == 4  # the sample PDF file has four pages.
    assert pages[0] != ""  # the page 1 of PDF contains text.
    assert pages[2] == ""  # the page 3 of PDF file is empty.
    # assert text is retained from the document.
    # As whitespace can differ (\n," ", etc.), we standardize all to simple whitespace
    page_standard_whitespace = " ".join(pages[0].split())
    assert "Adobe Systems made the PDF specification available free of charge in 1993." in page_standard_whitespace


# Marked as integration because it uses poppler, which is not installed in the unit tests suite
@pytest.mark.integration
@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Poppler not installed on Windows CI")
def test_pdftoppm_command_format():
    """Invoke ``pdftoppm`` on a sample PDF and require that nothing is written to stderr."""
    # PDFToTextOCRConverter relies on pdf2image, which in turn calls pdftoppm.
    # Some pdftoppm installations reject the command format without raising an error and
    # just yield empty conversions, so the binary is exercised directly here.
    completed = subprocess.run(
        ["pdftoppm", f"{SAMPLES_PATH}/pdf/sample_pdf_1.pdf"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # Usage info on stderr means the command format is not accepted by this pdftoppm build.
    assert (
        not completed.stderr
    ), 'Your installation of poppler is incompatible with Haystack. Try installing via "conda install -c conda-forge poppler"'


@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_command_whitespaces(Converter):
    """Convert a PDF whose file name contains spaces and check a known character is extracted."""
    pdf_path = SAMPLES_PATH / "pdf" / "sample pdf file with spaces on file name.pdf"
    outcome = Converter().run(file_paths=pdf_path)[0]

    first_document = outcome["documents"][0]
    assert "ɪ" in first_document.content


@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_encoding(Converter):
    """Check that "ɪ" is extracted by default but not when ``encoding="Latin1"`` is requested."""
    converter = Converter()
    pdf_path = SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf"

    default_doc = converter.run(file_paths=pdf_path)[0]["documents"][0]
    assert "ɪ" in default_doc.content

    latin1_doc = converter.run(file_paths=pdf_path, encoding="Latin1")[0]["documents"][0]
    assert "ɪ" not in latin1_doc.content


@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_layout(Converter):
    """With ``keep_physical_layout=True`` the converted text starts with the second test sentence."""
    pdf_path = SAMPLES_PATH / "pdf" / "sample_pdf_3.pdf"
    documents = Converter(keep_physical_layout=True).convert(file_path=pdf_path)

    text = str(documents[0].content)
    assert text.startswith("This is the second test sentence.")


@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_ligatures(Converter):
    """Check which of "ﬀ" and "ɪ" survive conversion for different ``known_ligatures`` values."""
    converter = Converter()
    pdf_path = SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf"

    # No explicit mapping: "ﬀ" is gone from the output, "ɪ" is kept.
    default_doc = converter.run(file_paths=pdf_path)[0]["documents"][0]
    assert "ﬀ" not in default_doc.content
    assert "ɪ" in default_doc.content

    # Empty mapping: both characters remain in the output.
    untouched_doc = converter.run(file_paths=pdf_path, known_ligatures={})[0]["documents"][0]
    assert "ﬀ" in untouched_doc.content
    assert "ɪ" in untouched_doc.content

    # Custom mapping for "ɪ" only: "ﬀ" remains, "ɪ" is gone.
    custom_doc = converter.run(file_paths=pdf_path, known_ligatures={"ɪ": "i"})[0]["documents"][0]
    assert "ﬀ" in custom_doc.content
    assert "ɪ" not in custom_doc.content


@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_page_range(Converter):
    """Convert the sample PDF from ``start_page=2`` and check the form-feed separated pages."""
    pdf_path = SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf"
    document = Converter().convert(file_path=pdf_path, start_page=2)[0]
    first, second, third = document.content.split("\f")[:3]

    # The sample PDF has four pages; the skipped first page must still be counted.
    assert len(document.content.split("\f")) == 4
    assert first == ""  # page 1 was skipped
    assert second != ""  # page 2 has content
    assert third == ""  # page 3 is empty


@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_page_range_numbers(Converter):
    """After converting from ``start_page=2``, the second PreProcessor split must report page 4."""
    pdf_path = SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf"
    document = Converter().convert(file_path=pdf_path, start_page=2)[0]

    splitter = PreProcessor(
        split_by="word",
        split_length=5,
        split_overlap=0,
        split_respect_sentence_boundary=False,
        add_page_number=True,
    )
    chunks = splitter.process([document])

    assert chunks[1].meta["page"] == 4


@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_table_removal(Converter):
    """With ``remove_numeric_tables=True`` the numeric table rows are absent from the first page."""
    pdf_path = SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf"
    document = Converter(remove_numeric_tables=True).convert(file_path=pdf_path)[0]
    first_page = document.content.split("\f")[0]

    # Values taken from the numeric rows of the table in the sample PDF.
    for numeric_snippet in ("324", "54x growth"):
        assert numeric_snippet not in first_page


@pytest.mark.tika
@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
def test_language_validation(Converter, caplog):
    """The language warning must be logged for ``["de"]`` but not for ``["en"]``."""
    pdf_path = SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf"

    Converter(valid_languages=["en"]).convert(file_path=pdf_path)
    assert "sample_pdf_1.pdf is not one of ['en']." not in caplog.text

    Converter(valid_languages=["de"]).convert(file_path=pdf_path)
    assert "sample_pdf_1.pdf is not one of ['de']." in caplog.text


def test_docx_converter():
    """The converted sample DOCX must start with its title line."""
    docx_path = SAMPLES_PATH / "docx" / "sample_docx.docx"
    documents = DocxToTextConverter().convert(file_path=docx_path)
    assert documents[0].content.startswith("Sample Docx File")


def test_markdown_converter():
    """Check the start of the converted sample Markdown and that the git clone line is absent."""
    md_path = SAMPLES_PATH / "markdown" / "sample.md"
    text = MarkdownConverter().convert(file_path=md_path)[0].content

    assert text.startswith("\nWhat to build with Haystack")
    assert "# git clone https://github.com/deepset-ai/haystack.git" not in text


def test_markdown_converter_headline_extraction():
    """Check text, level and ``start_idx`` of the headlines extracted from the sample Markdown."""
    md_path = SAMPLES_PATH / "markdown" / "sample.md"
    expected_headlines = [
        ("What to build with Haystack", 1),
        ("Core Features", 1),
        ("Quick Demo", 1),
        ("2nd level headline for testing purposes", 2),
        ("3rd level headline for testing purposes", 3),
    ]

    converter = MarkdownConverter(extract_headlines=True, remove_code_snippets=False)
    document = converter.convert(file_path=md_path)[0]
    headlines = document.meta["headlines"]

    # The number of extracted headlines must be right before comparing them pairwise.
    assert len(headlines) == 5
    for found, (wanted_text, wanted_level) in zip(headlines, expected_headlines):
        # Headline text and nesting level
        assert found["headline"] == wanted_text
        assert found["level"] == wanted_level

        # start_idx must point at the headline inside the document content
        begin = found["start_idx"]
        end = begin + len(found["headline"])
        assert found["headline"] == document.content[begin:end]


def test_markdown_converter_frontmatter_to_meta():
    """With ``add_frontmatter_to_meta=True`` the ``type`` and ``date`` values land in the meta."""
    md_path = SAMPLES_PATH / "markdown" / "sample.md"
    meta = MarkdownConverter(add_frontmatter_to_meta=True).convert(file_path=md_path)[0].meta

    assert meta["type"] == "intro"
    assert meta["date"] == "1.1.2023"


def test_markdown_converter_remove_code_snippets():
    """With ``remove_code_snippets=False`` the content starts with the pip install snippet."""
    md_path = SAMPLES_PATH / "markdown" / "sample.md"
    documents = MarkdownConverter(remove_code_snippets=False).convert(file_path=md_path)
    assert documents[0].content.startswith("pip install farm-haystack")


def test_azure_converter():
    """Convert the sample PDF with AzureConverter and check the table and text documents.

    The test needs the ``AZURE_FORMRECOGNIZER_ENDPOINT`` and ``AZURE_FORMRECOGNIZER_KEY``
    environment variables. When either is missing the test is reported as skipped instead
    of silently passing without having run any assertion.
    """
    required_vars = ("AZURE_FORMRECOGNIZER_ENDPOINT", "AZURE_FORMRECOGNIZER_KEY")
    if any(name not in os.environ for name in required_vars):
        pytest.skip("Azure Form Recognizer endpoint and credential key are not set in the environment")

    converter = AzureConverter(
        endpoint=os.environ["AZURE_FORMRECOGNIZER_ENDPOINT"],
        credential_key=os.environ["AZURE_FORMRECOGNIZER_KEY"],
        save_json=True,
    )

    docs = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")
    assert len(docs) == 2
    assert docs[0].content_type == "table"
    assert docs[0].content.shape[0] == 4  # number of rows
    assert docs[0].content.shape[1] == 5  # number of columns, Form Recognizer assumes there are 5 columns
    assert list(docs[0].content.columns) == ["", "Column 1", "", "Column 2", "Column 3"]
    assert list(docs[0].content.iloc[3]) == ["D", "$54.35", "", "$6345.", ""]
    assert (
        docs[0].meta["preceding_context"] == "specification. These proprietary technologies are not "
        "standardized and their\nspecification is published only on "
        "Adobe's website. Many of them are also not\nsupported by "
        "popular third-party implementations of PDF."
    )
    assert docs[0].meta["following_context"] == ""
    assert docs[0].meta["page"] == 1

    assert docs[1].content_type == "text"
    assert docs[1].content.startswith("A sample PDF file")


@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Parsr not running on Windows CI")
def test_parsr_converter():
    """Convert the sample PDF with ParsrConverter and check the table and text documents."""
    pdf_path = (SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf").absolute()
    docs = ParsrConverter().convert(file_path=str(pdf_path))
    assert len(docs) == 2

    table_doc, text_doc = docs[0], docs[1]

    # First document: the table
    assert table_doc.content_type == "table"
    assert table_doc.content.shape[0] == 4  # number of rows
    assert table_doc.content.shape[1] == 4
    assert list(table_doc.content.columns) == ["", "Column 1", "Column 2", "Column 3"]
    assert list(table_doc.content.iloc[3]) == ["D", "$54.35", "$6345.", ""]
    expected_preceding_context = (
        "speciﬁcation. These proprietary technologies are not "
        "standardized and their\nspeciﬁcation is published only on "
        "Adobe's website. Many of them are also not\nsupported by popular "
        "third-party implementations of PDF."
    )
    assert table_doc.meta["preceding_context"] == expected_preceding_context
    assert table_doc.meta["following_context"] == ""
    assert table_doc.meta["page"] == 1

    # Second document: the running text
    assert text_doc.content_type == "text"
    assert text_doc.content.startswith("A sample PDF ﬁle")
    assert text_doc.content.endswith("Page 4 of Sample PDF\n… the page 3 is empty.")


@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Parsr not running on Windows CI")
def test_parsr_converter_headline_extraction():
    """Compare the headlines ParsrConverter stores in each document's meta with the expected ones.

    Headlines are compared pairwise via ``zip``, so only as many entries as both sides
    provide are checked.
    """
    pdf_path = (SAMPLES_PATH / "pdf" / "sample_pdf_4.pdf").absolute()
    # One list of (headline, level) pairs per expected document, in document order.
    expected_headlines = [
        [("Lorem ipsum", 1), ("Cras fringilla ipsum magna, in fringilla dui commodo\na.", 2)],
        [
            ("Lorem ipsum", 1),
            ("Lorem ipsum dolor sit amet, consectetur adipiscing\nelit. Nunc ac faucibus odio.", 2),
            ("Cras fringilla ipsum magna, in fringilla dui commodo\na.", 2),
            ("Lorem ipsum dolor sit amet, consectetur adipiscing\nelit.", 2),
            ("Maecenas mauris lectus, lobortis et purus mattis, blandit\ndictum tellus.", 2),
            ("In eleifend velit vitae libero sollicitudin euismod.", 2),
        ],
    ]

    docs = ParsrConverter().convert(file_path=str(pdf_path))
    assert len(docs) == 2

    for doc, wanted in zip(docs, expected_headlines):
        for found, (wanted_text, wanted_level) in zip(doc.meta["headlines"], wanted):
            # Headline text and nesting level
            assert found["headline"] == wanted_text
            assert found["level"] == wanted_level

            # start_idx is only checked against the content of text documents
            if doc.content_type != "text":
                continue
            begin = found["start_idx"]
            end = begin + len(found["headline"])
            assert found["headline"] == doc.content[begin:end]


def test_id_hash_keys_from_pipeline_params():
    """The same file with different meta must yield distinct ids when hashing content and meta."""
    doc_path = SAMPLES_PATH / "docs" / "doc_1.txt"
    metas = [{"key": "a"}, {"key": "b"}]

    result = TextConverter().run(file_paths=[doc_path, doc_path], meta=metas, id_hash_keys=["content", "meta"])
    output, _ = result
    documents = output["documents"]

    assert len(documents) == 2
    assert len({document.id for document in documents}) == 2


def write_as_csv(data: List[List[str]], file_path: Path):
    """Write ``data`` to ``file_path`` as CSV, one inner list per row.

    :param data: Rows to write; each inner list holds the cell values of one row.
    :param file_path: Destination file. It is created or overwritten.
    """
    # newline="" is what the csv module asks for: the writer emits its own "\r\n" line endings,
    # and newline translation would otherwise turn them into "\r\r\n" on Windows.
    # The encoding is pinned so the file content does not depend on the platform locale.
    with open(file_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerows(data)


@pytest.mark.integration
def test_csv_to_document_with_qa_headers(tmp_path):
    """A CSV with ``question``/``answer`` headers yields one Document per data row."""
    question = "What is Haystack ?"
    answer = "Haystack is an NLP Framework to use transformers in your Applications."
    csv_path = tmp_path / "csv_qa_with_headers.csv"

    node = CsvTextConverter()
    write_as_csv([["question", "answer"], [question, answer]], csv_path)

    output, edge = node.run(file_paths=csv_path)
    assert edge == "output_1"
    assert "documents" in output
    documents = output["documents"]
    assert len(documents) == 1

    # The question becomes the content, the answer is stored in the meta.
    doc = documents[0]
    assert isinstance(doc, Document)
    assert doc.content == "What is Haystack ?"
    assert doc.meta["answer"] == "Haystack is an NLP Framework to use transformers in your Applications."


@pytest.mark.integration
def test_csv_to_document_with_wrong_qa_headers(tmp_path):
    """A CSV where both header names are wrong must be rejected with a ValueError."""
    node = CsvTextConverter()
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    header = ["wrong", "headers"]
    data_row = ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."]
    write_as_csv([header, data_row], csv_path)

    expected_error = "The CSV must contain two columns named 'question' and 'answer'"
    with pytest.raises(ValueError, match=expected_error):
        node.run(file_paths=csv_path)


@pytest.mark.integration
def test_csv_to_document_with_one_wrong_qa_headers(tmp_path):
    """A CSV with the headers ``wrong``/``answers`` must be rejected with a ValueError."""
    node = CsvTextConverter()
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    header = ["wrong", "answers"]
    data_row = ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."]
    write_as_csv([header, data_row], csv_path)

    expected_error = "The CSV must contain two columns named 'question' and 'answer'"
    with pytest.raises(ValueError, match=expected_error):
        node.run(file_paths=csv_path)


@pytest.mark.integration
def test_csv_to_document_with_another_wrong_qa_headers(tmp_path):
    """A CSV with a correct ``question`` header but a wrong second header must raise a ValueError."""
    node = CsvTextConverter()
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    header = ["question", "wrong"]
    data_row = ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."]
    write_as_csv([header, data_row], csv_path)

    expected_error = "The CSV must contain two columns named 'question' and 'answer'"
    with pytest.raises(ValueError, match=expected_error):
        node.run(file_paths=csv_path)


@pytest.mark.integration
def test_csv_to_document_with_one_column(tmp_path):
    """A CSV that only has a ``question`` column must be rejected with a ValueError."""
    node = CsvTextConverter()
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    header = ["question"]
    data_row = ["What is Haystack ?"]
    write_as_csv([header, data_row], csv_path)

    expected_error = "The CSV must contain two columns named 'question' and 'answer'"
    with pytest.raises(ValueError, match=expected_error):
        node.run(file_paths=csv_path)


@pytest.mark.integration
def test_csv_to_document_with_three_columns(tmp_path):
    """A CSV with an extra ``notes`` column next to ``question``/``answer`` must raise a ValueError."""
    node = CsvTextConverter()
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    header = ["question", "answer", "notes"]
    data_row = [
        "What is Haystack ?",
        "Haystack is an NLP Framework to use transformers in your Applications.",
        "verified",
    ]
    write_as_csv([header, data_row], csv_path)

    expected_error = "The CSV must contain two columns named 'question' and 'answer'"
    with pytest.raises(ValueError, match=expected_error):
        node.run(file_paths=csv_path)


@pytest.mark.integration
def test_csv_to_document_many_files(tmp_path):
    """Convert five single-row CSV files in one ``run`` call and check the resulting Documents.

    Each file holds one question/answer pair prefixed with its index; the output is
    expected to contain one Document per file, in the order the paths were passed.
    """
    # A single converter handles all files; there is no need to build one per file.
    node = CsvTextConverter()

    csv_paths = []
    for i in range(5):
        csv_path = tmp_path / f"{i}_csv_qa_with_headers.csv"
        csv_paths.append(csv_path)
        rows = [
            ["question", "answer"],
            [
                f"{i}. What is Haystack ?",
                f"{i}. Haystack is an NLP Framework to use transformers in your Applications.",
            ],
        ]
        write_as_csv(rows, csv_path)

    output, edge = node.run(file_paths=csv_paths)
    assert edge == "output_1"
    assert "documents" in output
    assert len(output["documents"]) == 5

    for i, doc in enumerate(output["documents"]):
        assert isinstance(doc, Document)
        assert doc.content == f"{i}. What is Haystack ?"
        assert doc.meta["answer"] == f"{i}. Haystack is an NLP Framework to use transformers in your Applications."
-												feat: Add `CsvTextConverter` (#3587)

* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline

Fixes #3550, allow user to build full FAQ using YAML pipeline description and with CSV import and indexing.

* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline

Fix linter issues mypy and pylint.

* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline

Fix linter issues mypy.

* implement proposal's feedback

* tidy up for merge

* use BaseConverter

* use BaseConverter

* pylint

* black

* Revert "black"

This reverts commit e1c45cb1848408bd52a630328750cb67c8eb7110.

* black

* add check for column names

* add check for column names

* add tests

* fix tests

* address lists of paths

* typo

* remove duplicate line

Co-authored-by: ZanSara <sarazanzo94@gmail.com>
											
										
										
											2023-01-23 15:56:36 +01:00
+								from typing import List
-												Add AzureConverter to support table parsing from documents (#1813)

* Add FormRecognizerConverter

* Change signature of convert method + change return type of all converters

* Adapt preprocessing util to new return type of converters

* Parametrize number of lines used for surrounding context of table

* Change name from FormRecognizerConverter to AzureConverter

* Set version of azure-ai-formrecognizer package

* Change tutorial 8 based on new return type of converters

* Add tests

* Add latest docstring and tutorial changes

* Fix typo

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
											
										
										
											2021-11-29 18:44:20 +01:00
+								import os
-												[CI Refactoring] Workflow refactoring (#2576)

* Unify CI tests (from #2466)

* Update Documentation & Code Style

* Change folder names

* Fix markers list

* Remove marker 'slow', replaced with 'integration'

* Soften children check

* Start ES first so it has time to boot while Python is setup

* Run the full workflow

* Try to make pip upgrade on Windows

* Set KG tests as integration

* Update Documentation & Code Style

* typo

* faster pylint

* Make Pylint use the cache

* filter diff files for pylint

* debug pylint statement

* revert pylint changes

* Remove path from asserted log (fails on Windows)

* Skip preprocessor test on Windows

* Tackling Windows specific failures

* Fix pytest command for windows suites

* Remove \ from command

* Move poppler test into integration

* Skip opensearch test on windows

* Add tolerance in reader sas score for Windows

* Another pytorch approx

* Raise time limit for unit tests :(

* Skip poppler test on Windows CI

* Specify to pull with FF only in docs check

* temporarily run the docs check immediately

* Allow merge commit for now

* Try without fetch depth

* Accelerating test

* Accelerating test

* Add repository and ref alongside fetch-depth

* Separate out code&docs check from tests

* Use setup-python cache

* Delete custom action

* Remove the pull step in the docs check, will find a way to run on bot commits

* Add requirements.txt in .github for caching

* Actually install dependencies

* Change deps group for pylint

* Unclear why the requirements.txt is still required :/

* Fix the code check python setup

* Install all deps for pylint

* Make the autoformat check depend on tests and doc updates workflows

* Try installing dependencies in another order

* Try again to install the deps

* quoting the paths

* Ad back the requirements

* Try again to install rest_api and ui

* Change deps group

* Duplicate haystack install line

* See if the cache is the problem

* Disable also in mypy, who knows

* split the install step

* Split install step everywhere

* Revert "Separate out code&docs check from tests"

This reverts commit 1cd59b15ffc5b984e1d642dcbf4c8ccc2bb6c9bd.

* Add back the action

* Proactive support for audio (see text2speech branch)

* Fix label generator tests

* Remove install of libsndfile1 on win temporarily

* exclude audio tests on win

* install ffmpeg for integration tests

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-06-07 09:23:03 +02:00
+								import sys
 								from pathlib import Path
-												Remove encoding option from PDFToTextOCRConverter (#2553)

* remove encoding option from PDFToTextOCRConverter

* Update Documentation & Code Style

* add unused 'encoding' param to PDFToTextOCRConverter

* Update Documentation & Code Style

* call run instead of convert to use ligature replacing

* Update Documentation & Code Style

* add text to check installed poppler version

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-05-24 11:31:32 +02:00
+								import subprocess
-												feat: Add `CsvTextConverter` (#3587)

* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline

Fixes #3550, allow user to build full FAQ using YAML pipeline description and with CSV import and indexing.

* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline

Fix linter issues mypy and pylint.

* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline

Fix linter issues mypy.

* implement proposal's feedback

* tidy up for merge

* use BaseConverter

* use BaseConverter

* pylint

* black

* Revert "black"

This reverts commit e1c45cb1848408bd52a630328750cb67c8eb7110.

* black

* add check for column names

* add check for column names

* add tests

* fix tests

* address lists of paths

* typo

* remove duplicate line

Co-authored-by: ZanSara <sarazanzo94@gmail.com>
											
										
										
											2023-01-23 15:56:36 +01:00
+								import csv
-												Add PDF text extraction (#109)


											
										
										
											2020-06-08 11:07:19 +02:00
-												Add Tika Converter (#314)

											
										
										
											2020-08-17 11:21:09 +02:00
+								import pytest
-												Revert "Add Tika Converter (#314)"

This reverts commit 5ef59b1901da6d51bfa085683321a243228d4fc9.

											
										
										
											2020-08-17 11:13:52 +02:00
-												feat: Add `CsvTextConverter` (#3587)

* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline

Fixes #3550, allow user to build full FAQ using YAML pipeline description and with CSV import and indexing.

* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline

Fix linter issues mypy and pylint.

* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline

Fix linter issues mypy.

* implement proposal's feedback

* tidy up for merge

* use BaseConverter

* use BaseConverter

* pylint

* black

* Revert "black"

This reverts commit e1c45cb1848408bd52a630328750cb67c8eb7110.

* black

* add check for column names

* add check for column names

* add tests

* fix tests

* address lists of paths

* typo

* remove duplicate line

Co-authored-by: ZanSara <sarazanzo94@gmail.com>
											
										
										
											2023-01-23 15:56:36 +01:00
+								from haystack import Document
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								from haystack.nodes import (
 								    MarkdownConverter,
 								    DocxToTextConverter,
 								    PDFToTextConverter,
 								    PDFToTextOCRConverter,
 								    TikaConverter,
 								    AzureConverter,
 								    ParsrConverter,
-												Fix using id_hash_keys as pipeline params (#2717)

* Fix using id_hash_keys as pipeline params

* Update Documentation & Code Style

* add tests

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-06-24 09:55:09 +02:00
+								    TextConverter,
-												feat: Add `CsvTextConverter` (#3587)

* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline

Fixes #3550, allow user to build full FAQ using YAML pipeline description and with CSV import and indexing.

* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline

Fix linter issues mypy and pylint.

* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline

Fix linter issues mypy.

* implement proposal's feedback

* tidy up for merge

* use BaseConverter

* use BaseConverter

* pylint

* black

* Revert "black"

This reverts commit e1c45cb1848408bd52a630328750cb67c8eb7110.

* black

* add check for column names

* add check for column names

* add tests

* fix tests

* address lists of paths

* typo

* remove duplicate line

Co-authored-by: ZanSara <sarazanzo94@gmail.com>
											
										
										
											2023-01-23 15:56:36 +01:00
+								    CsvTextConverter,
-												feat: Add page range support to PDF converters. (#3965)

* feat: add start and eng page to PDF converters

* docs: add missing docstrings

* refactor: change list set up, add docstrings and comment

* fix: add missing parameter

* tests: add page range basic test

* tests: test correct page numbers

* tests: remove OCR page range test
*Poppler and Tesseract not installed on CI

* fix: remove mobile change error
											
										
										
											2023-01-30 10:09:22 -03:00
+								    PreProcessor,
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								)
-												Add PDF text extraction (#109)


											
										
										
											2020-06-08 11:07:19 +02:00
-												[CI refactoring] Categorize tests into folders (#2554)

* Categorize tests into folders

* Fix linux_ci.yml and an import

* Wrong path
											
										
										
											2022-05-17 10:55:53 +02:00
+								from ..conftest import SAMPLES_PATH
-												Improve dependency management (#1994)

* Fist attempt at using setup.cfg for dependency management

* Trying the new package on the CI and in Docker too

* Add composite extras_require

* Add the safe_import function for document store imports and add some try-catch statements on rest_api and ui imports

* Fix bug on class import and rephrase error message

* Introduce typing for optional modules and add type: ignore in sparse.py

* Include importlib_metadata backport for py3.7

* Add colab group to extra_requires

* Fix pillow version

* Fix grpcio

* Separate out the crawler as another extra

* Make paths relative in rest_api and ui

* Update the test matrix in the CI

* Add try catch statements around the optional imports too to account for direct imports

* Never mix direct deps with self-references and add ES deps to the base install

* Refactor several paths in tests to make them insensitive to the execution path

* Include tstadel review and re-introduce Milvus1 in the tests suite, to fix

* Wrap pdf conversion utils into safe_import

* Update some tutorials and rever Milvus1 as default for now, see #2067

* Fix mypy config


Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-01-26 18:12:55 +01:00
-												[RAG] Integrate "Retrieval-Augmented Generation" with Haystack (#484)

* Adding dummy generator implementation

* Adding tutorial to try the model

* Committing current non working code

* Committing current update where we need to call generate function directly and need to convert embedding to tensor way

* Addressing review comments.

* Refactoring finder, and implementing rag_generator class.

* Refined the implementation of RAGGenerator and now it is in clean shape

* Renaming RAGGenerator to RAGenerator

* Reverting change from finder.py and addressing review comments

* Remove support for RagSequenceForGeneration

* Utilizing embed_passage function from DensePassageRetriever

* Adding sample test data to verify generator output

* Updating testing script

* Updating testing script

* Fixing bug related to top_k

* Updating latest farm dependency

* Comment out farm dependency

* Reverting changes from TransformersReader

* Adding transformers dataset to compare transformers and haystack generator implementation

* Using generator_encoder instead of question_encoder to generate context_input_ids

* Adding workaround to install FARM dependency from master branch

* Removing unnecessary changes

* Fixing generator test

* Removing transformers datasets

* Fixing generator test

* Some cleanup and updating TODO comments

* Adding tutorial notebook

* Updating tutorials with comments

* Explicitly passing token model in RAG test

* Addressing review comments

* Fixing notebook

* Refactoring tests to reduce memory footprint

* Split generator tests in separate ci step and before running it reclaim memory by terminating containers

* Moving tika dependent test to separate dir

* Remove unwanted code

* Bringing reader under session scope

* Farm is now session object hence restoring changes from default value

* Updating assert for pdf converter

* Dummy commit to trigger CI flow

* Reducing memory footprint required for generator tests

* Fixing mypy issues

* Marking test with tika and elasticsearch markers. Reverting changes in CI and pytest splits

* reducing changes

* Fixing CI

* changing elastic search ci

* Fixing test error

* Disabling return of embedding

* Marking generator test as well

* Refactoring tutorials

* Increasing ES memory to 750M

* Trying another fix for ES CI

* Reverting CI changes

* Splitting tests in CI

* Generator and non-generator markers split

* Adding pytest.ini to add markers and enable strict-markers option

* Reducing elastic search container memory

* Simplifying generator test by using documents with embedding directly

* Bump up farm to 0.5.0
											
										
										
											2020-10-30 18:06:02 +01:00
+								@pytest.mark.tika
-												Remove encoding option from PDFToTextOCRConverter (#2553)

* remove encoding option from PDFToTextOCRConverter

* Update Documentation & Code Style

* add unused 'encoding' param to PDFToTextOCRConverter

* Update Documentation & Code Style

* call run instead of convert to use ligature replacing

* Update Documentation & Code Style

* add text to check installed poppler version

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-05-24 11:31:32 +02:00
+								@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter, PDFToTextOCRConverter])
-												Add CI for windows runner (#1458)

* Feat: Removing use of temp file while downloading archive from url along with adding CI for windows and mac platform

* Windows CI by default installing pytorch gpu hence updating CI to pick cpu version

* fixing mac cache build issue

* updating windows pip install command for torch

* another attempt

* updating ci

* Adding sudo

* fixing ls failure on windows

* another attempt to fix build issue

* Saving env variable of test files

* Adding debug log

* Github action differ on windows

* adding debug

* another attempt

* Windows have different ways to receive env

* fixing template

* minor fix

* Adding debug

* Removing use of json

* Adding back fromJson

* adding toJson

* removing print

* another attempt

* disabling parallel run at least for testing

* installing docker for mac runner

* correcting docker install command

* Linux dockers are not supported in windows

* Removing mac changes

* Upgrading pytorch

* using lts pytorch

* Separating win and ubuntu

* Install java 11

* enabling linux container env

* docker cli command

* docker cli command

* start elastic service

* List all service

* correcting service name

* Attempt to fix multiple test run

* convert to json

* another attempt to check

* Updating build cache step

* attempt

* Add tika

* Separating windows CI

* Changing CI name

* Skipping test which does not work in windows

* Skipping tests for windows

* create cleanup function in conftest

* adding skipif marker on tests

* Run windows PR on only push to master

* Addressing review comments

* Enabling windows ci for this PR

* Tika init is being called when importing tika function

* handling tika import issue

* handling tika import issue in test

* Fixing import issue

* removing tika fixture

* Removing fixture from tests

* Disable windows ci on pull request

* Add back extra pytorch install step

Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
											
										
										
											2021-10-29 13:52:28 +05:30
+								def test_convert(Converter):
-												Add Tika Converter (#314)

											
										
										
											2020-08-17 11:21:09 +02:00
+								    converter = Converter()
-												Remove encoding option from PDFToTextOCRConverter (#2553)

* remove encoding option from PDFToTextOCRConverter

* Update Documentation & Code Style

* add unused 'encoding' param to PDFToTextOCRConverter

* Update Documentation & Code Style

* call run instead of convert to use ligature replacing

* Update Documentation & Code Style

* add text to check installed poppler version

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-05-24 11:31:32 +02:00
+								    document = converter.run(file_paths=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")[0]["documents"][0]
-												Change return types of indexing pipeline nodes (#2342)

* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils to functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Document in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-29 13:53:35 +02:00
+								    pages = document.content.split("\f")
-												Remove encoding option from PDFToTextOCRConverter (#2553)

* remove encoding option from PDFToTextOCRConverter

* Update Documentation & Code Style

* add unused 'encoding' param to PDFToTextOCRConverter

* Update Documentation & Code Style

* call run instead of convert to use ligature replacing

* Update Documentation & Code Style

* add text to check installed poppler version

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-05-24 11:31:32 +02:00
 								    assert (
 								        len(pages) != 1 and pages[0] != ""
 								    ), f'{type(converter).__name__} did return a single empty page indicating a potential issue with your installed poppler version. Try installing via "conda install -c conda-forge poppler" and check test_pdftoppm_command_format()'
-												Add PDF text extraction (#109)


											
										
										
											2020-06-08 11:07:19 +02:00
+								    assert len(pages) == 4  # the sample PDF file has four pages.
 								    assert pages[0] != ""  # the page 1 of PDF contains text.
 								    assert pages[2] == ""  # the page 3 of PDF file is empty.
-												Add ImageToTextConverter and PDFToTextOCRConverter that utilize OCR (#1349)

* add image.py converter

* add PDFtoImageConverter

* add init to PDFtoImageConverter and classes to __init__

* update imagetotext pipeline

* update imagetotext pipeline

* update imagetotext pipeline

* update imagetotext pipeline

* update imagetotext pipeline

* update imagetotext pipeline

* update imagetotext pipeline

* revert change in base.py in file_conv

* Update base.py

* Update pdf.py

* add ocr file_converter testcase & update dockerfile

* fix tesseract exception message typo

* fix _image_to_text docstring

* add tesseract installation to CI

* add tesseract installation to CI

* add content test for PDF OCR converter

* update PDFToTextOCRConverter constructor docstring

* replace image files with tmp paths for image.py convert

* replace image files with tmp paths for image.py convert

* Update README.md

Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
											
										
										
											2021-09-01 16:42:25 +02:00
+								    # assert text is retained from the document.
 								    # As whitespace can differ (\n," ", etc.), we standardize all to simple whitespace
 								    page_standard_whitespace = " ".join(pages[0].split())
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    assert "Adobe Systems made the PDF specification available free of charge in 1993." in page_standard_whitespace
-												Add PDF text extraction (#109)


											
										
										
											2020-06-08 11:07:19 +02:00
-												[CI Refactoring] Workflow refactoring (#2576)

* Unify CI tests (from #2466)

* Update Documentation & Code Style

* Change folder names

* Fix markers list

* Remove marker 'slow', replaced with 'integration'

* Soften children check

* Start ES first so it has time to boot while Python is setup

* Run the full workflow

* Try to make pip upgrade on Windows

* Set KG tests as integration

* Update Documentation & Code Style

* typo

* faster pylint

* Make Pylint use the cache

* filter diff files for pylint

* debug pylint statement

* revert pylint changes

* Remove path from asserted log (fails on Windows)

* Skip preprocessor test on Windows

* Tackling Windows specific failures

* Fix pytest command for windows suites

* Remove \ from command

* Move poppler test into integration

* Skip opensearch test on windows

* Add tolerance in reader sas score for Windows

* Another pytorch approx

* Raise time limit for unit tests :(

* Skip poppler test on Windows CI

* Specify to pull with FF only in docs check

* temporarily run the docs check immediately

* Allow merge commit for now

* Try without fetch depth

* Accelerating test

* Accelerating test

* Add repository and ref alongside fetch-depth

* Separate out code&docs check from tests

* Use setup-python cache

* Delete custom action

* Remove the pull step in the docs check, will find a way to run on bot commits

* Add requirements.txt in .github for caching

* Actually install dependencies

* Change deps group for pylint

* Unclear why the requirements.txt is still required :/

* Fix the code check python setup

* Install all deps for pylint

* Make the autoformat check depend on tests and doc updates workflows

* Try installing dependencies in another order

* Try again to install the deps

* quoting the paths

* Add back the requirements

* Try again to install rest_api and ui

* Change deps group

* Duplicate haystack install line

* See if the cache is the problem

* Disable also in mypy, who knows

* split the install step

* Split install step everywhere

* Revert "Separate out code&docs check from tests"

This reverts commit 1cd59b15ffc5b984e1d642dcbf4c8ccc2bb6c9bd.

* Add back the action

* Proactive support for audio (see text2speech branch)

* Fix label generator tests

* Remove install of libsndfile1 on win temporarily

* exclude audio tests on win

* install ffmpeg for integration tests

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-06-07 09:23:03 +02:00
+								# Marked as integration because it uses poppler, which is not installed in the unit tests suite
 								@pytest.mark.integration
 								@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Poppler not installed on Windows CI")
-												Remove encoding option from PDFToTextOCRConverter (#2553)

* remove encoding option from PDFToTextOCRConverter

* Update Documentation & Code Style

* add unused 'encoding' param to PDFToTextOCRConverter

* Update Documentation & Code Style

* call run instead of convert to use ligature replacing

* Update Documentation & Code Style

* add text to check installed poppler version

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-05-24 11:31:32 +02:00
def test_pdftoppm_command_format():
    """Verify that the installed pdftoppm accepts the bare ``pdftoppm <file>`` invocation.

    Haystack's PDFToTextOCRConverter relies on pdf2image, which calls pdftoppm internally.
    Some pdftoppm installations are incompatible: they raise no error and simply yield
    empty converted documents. Running pdftoppm directly exposes that situation.
    """
    sample_pdf = f"{SAMPLES_PATH}/pdf/sample_pdf_1.pdf"
    completed = subprocess.run(["pdftoppm", sample_pdf], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # Anything on stderr (typically pdftoppm's usage text) means the command format was rejected.
    stderr_output = completed.stderr
    assert (
        not stderr_output
    ), 'Your installation of poppler is incompatible with Haystack. Try installing via "conda install -c conda-forge poppler"'
-												Prevent `PDFToTextConverter` from failing on PDFs with spaces in their names (#2786)

* Change split logic to list

* Fix wrong parameter for run

* Fix mypy error

* Fix layout/raw parameter

* Add test for filename with whitespaces on PDFToText

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-07-11 08:30:33 -03:00
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_command_whitespaces(Converter):
    """Run the converter on a PDF whose file name contains spaces and check its content."""
    pdf_with_spaces = SAMPLES_PATH / "pdf" / "sample pdf file with spaces on file name.pdf"
    run_output = Converter().run(file_paths=pdf_with_spaces)
    first_document = run_output[0]["documents"][0]
    # The sample is expected to contain this non-ASCII character after conversion.
    assert "ɪ" in first_document.content
-												Remove encoding option from PDFToTextOCRConverter (#2553)

* remove encoding option from PDFToTextOCRConverter

* Update Documentation & Code Style

* add unused 'encoding' param to PDFToTextOCRConverter

* Update Documentation & Code Style

* call run instead of convert to use ligature replacing

* Update Documentation & Code Style

* add text to check installed poppler version

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-05-24 11:31:32 +02:00
+								@pytest.mark.parametrize("Converter", [PDFToTextConverter])
-												Change default encoding for `PDFToTextConverter` from `Latin 1` to `UTF-8` (#2420)

* Change default encoding for PDFToTextConverter

* Update Documentation & Code Style

* Improve docstring

* Update Documentation & Code Style

* Add list of ligatures to ignore and add the possibility to modify such list at need

* Add docstring

* Add tests

* Rename parameter

* Update Documentation & Code Style

* Move implementation into the base converter to make mypy happier

* Update Documentation & Code Style

* mypy and pylint

* mypy

* move encoding parameter to init of PDFToTextConverter

* Update Documentation & Code Style

* make utf8 default and fix mypy

* Update Documentation & Code Style

* Update Documentation & Code Style

* remove note on encoding in tutorial8

* Update Documentation & Code Style

* skip OCRConverter and test converter.run

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Julian Risch <julian.risch@deepset.ai>
											
										
										
											2022-05-04 17:01:45 +02:00
def test_pdf_encoding(Converter):
    """Compare conversion of ``sample_pdf_2.pdf`` with the default encoding and with Latin1.

    With default settings the character "ɪ" must be present in the converted text;
    when ``encoding="Latin1"`` is passed to ``run`` it must be absent.
    """
    sample_pdf = SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf"
    pdf_converter = Converter()

    default_result = pdf_converter.run(file_paths=sample_pdf)
    default_document = default_result[0]["documents"][0]
    assert "ɪ" in default_document.content

    latin1_result = pdf_converter.run(file_paths=sample_pdf, encoding="Latin1")
    latin1_document = latin1_result[0]["documents"][0]
    assert "ɪ" not in latin1_document.content
-												feat: add public layout-base extraction support on PDFToTextConverter (#3137)

* feat(PDFToTextConverter): add option to get text in physical layout order

* test: add physical layout extraction test to PDFToTextConverter

* refactor: change layout parameter attribution places

* docs: manually trigger pre-commits

* docs: generate new docs to comply with pydoc-markdown style
											
										
										
											2022-09-13 11:55:21 -03:00
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_pdf_layout(Converter):
    """Convert ``sample_pdf_3.pdf`` with ``keep_physical_layout=True`` and check the leading text."""
    sample_pdf = SAMPLES_PATH / "pdf" / "sample_pdf_3.pdf"
    layout_converter = Converter(keep_physical_layout=True)
    converted_documents = layout_converter.convert(file_path=sample_pdf)
    extracted_text = str(converted_documents[0].content)
    # With the physical layout kept, the output must begin with this sentence.
    assert extracted_text.startswith("This is the second test sentence.")
-												Remove encoding option from PDFToTextOCRConverter (#2553)

* remove encoding option from PDFToTextOCRConverter

* Update Documentation & Code Style

* add unused 'encoding' param to PDFToTextOCRConverter

* Update Documentation & Code Style

* call run instead of convert to use ligature replacing

* Update Documentation & Code Style

* add text to check installed poppler version

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-05-24 11:31:32 +02:00
+								@pytest.mark.parametrize("Converter", [PDFToTextConverter])
-												Change default encoding for `PDFToTextConverter` from `Latin 1` to `UTF-8` (#2420)

* Change default encoding for PDFToTextConverter

* Update Documentation & Code Style

* Improve docstring

* Update Documentation & Code Style

* Add list of ligatures to ignore and add the possibility to modify such list at need

* Add docstring

* Add tests

* Rename parameter

* Update Documentation & Code Style

* Move implementation into the base converter to make mypy happier

* Update Documentation & Code Style

* mypy and pylint

* mypy

* move encoding parameter to init of PDFToTextConverter

* Update Documentation & Code Style

* make utf8 default and fix mypy

* Update Documentation & Code Style

* Update Documentation & Code Style

* remove note on encoding in tutorial8

* Update Documentation & Code Style

* skip OCRConverter and test converter.run

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Julian Risch <julian.risch@deepset.ai>
											
										
										
											2022-05-04 17:01:45 +02:00
def test_pdf_ligatures(Converter):
    """Check how ``known_ligatures`` affects the text produced by ``converter.run``.

    Three configurations are exercised on ``sample_pdf_2.pdf``:

    * default settings: "ﬀ" is absent from the output, "ɪ" is present;
    * ``known_ligatures={}``: both "ﬀ" and "ɪ" are present;
    * ``known_ligatures={"ɪ": "i"}``: "ﬀ" is present, "ɪ" is absent.
    """
    converter = Converter()
    sample_pdf = SAMPLES_PATH / "pdf" / "sample_pdf_2.pdf"

    document = converter.run(file_paths=sample_pdf)[0]["documents"][0]
    assert "ﬀ" not in document.content
    assert "ɪ" in document.content

    # The index into "documents" had been lost here, leaving an empty subscript
    # (a syntax error); the first document is selected like in the other calls.
    document = converter.run(file_paths=sample_pdf, known_ligatures={})[0]["documents"][0]
    assert "ﬀ" in document.content
    assert "ɪ" in document.content

    document = converter.run(file_paths=sample_pdf, known_ligatures={"ɪ": "i"})[0]["documents"][0]
    assert "ﬀ" in document.content
    assert "ɪ" not in document.content
-												feat: Add page range support to PDF converters. (#3965)

* feat: add start and end page to PDF converters

* docs: add missing docstrings

* refactor: change list set up, add docstrings and comment

* fix: add missing parameter

* tests: add page range basic test

* tests: test correct page numbers

* tests: remove OCR page range test
*Poppler and Tesseract not installed on CI

* fix: remove mobile change error
											
										
										
											2023-01-30 10:09:22 -03:00
@pytest.mark.parametrize("Converter", [PDFToTextConverter])
def test_page_range(Converter):
    """Convert ``sample_pdf_1.pdf`` starting at page 2 and check the resulting page layout."""
    sample_pdf = SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf"
    converted_documents = Converter().convert(file_path=sample_pdf, start_page=2)
    pages = converted_documents[0].content.split("\f")

    # The sample PDF has four pages; the first one is skipped, but the page count
    # must still be the full four so that page numbering stays correct.
    assert len(pages) == 4
    first_page, second_page, third_page = pages[:3]
    assert first_page == ""  # page 1 was skipped.
    assert second_page != ""  # page 2 is not empty.
    assert third_page == ""  # page 3 is empty.
 								@pytest.mark.parametrize("Converter", [PDFToTextConverter])
 								def test_page_range_numbers(Converter):
 								    converter = Converter()
 								    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf", start_page=2)[0]
 								    preprocessor = PreProcessor(
 								        split_by="word", split_length=5, split_overlap=0, split_respect_sentence_boundary=False, add_page_number=True
 								    )
 								    documents = preprocessor.process([document])
 								    assert documents[1].meta["page"] == 4
-												[RAG] Integrate "Retrieval-Augmented Generation" with Haystack (#484)

* Adding dummy generator implementation

* Adding tutorial to try the model

* Committing current non working code

* Committing current update where we need to call generate function directly and need to convert embedding to tensor way

* Addressing review comments.

* Refactoring finder, and implementing rag_generator class.

* Refined the implementation of RAGGenerator and now it is in clean shape

* Renaming RAGGenerator to RAGenerator

* Reverting change from finder.py and addressing review comments

* Remove support for RagSequenceForGeneration

* Utilizing embed_passage function from DensePassageRetriever

* Adding sample test data to verify generator output

* Updating testing script

* Updating testing script

* Fixing bug related to top_k

* Updating latest farm dependency

* Comment out farm dependency

* Reverting changes from TransformersReader

* Adding transformers dataset to compare transformers and haystack generator implementation

* Using generator_encoder instead of question_encoder to generate context_input_ids

* Adding workaround to install FARM dependency from master branch

* Removing unnecessary changes

* Fixing generator test

* Removing transformers datasets

* Fixing generator test

* Some cleanup and updating TODO comments

* Adding tutorial notebook

* Updating tutorials with comments

* Explicitly passing token model in RAG test

* Addressing review comments

* Fixing notebook

* Refactoring tests to reduce memory footprint

* Split generator tests in separate ci step and before running it reclaim memory by terminating containers

* Moving tika dependent test to separate dir

* Remove unwanted code

* Bringing reader under session scope

* Farm is now session object hence restoring changes from default value

* Updating assert for pdf converter

* Dummy commit to trigger CI flow

* Reducing memory footprint required for generator tests

* Fixing mypy issues

* Marking test with tika and elasticsearch markers. Reverting changes in CI and pytest splits

* reducing changes

* Fixing CI

* changing elastic search ci

* Fixing test error

* Disabling return of embedding

* Marking generator test as well

* Refactoring tutorials

* Increasing ES memory to 750M

* Trying another fix for ES CI

* Reverting CI changes

* Splitting tests in CI

* Generator and non-generator markers split

* Adding pytest.ini to add markers and enable strict-markers option

* Reducing elastic search container memory

* Simplifying generator test by using documents with embedding directly

* Bump up farm to 0.5.0
											
										
										
											2020-10-30 18:06:02 +01:00
+								@pytest.mark.tika
-												Add Tika Converter (#314)

											
										
										
											2020-08-17 11:21:09 +02:00
+								@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
-												Add CI for windows runner (#1458)

* Feat: Removing use of temp file while downloading archive from url along with adding CI for windows and mac platform

* Windows CI by default installing pytorch gpu hence updating CI to pick cpu version

* fixing mac cache build issue

* updating windows pip install command for torch

* another attempt

* updating ci

* Adding sudo

* fixing ls failure on windows

* another attempt to fix build issue

* Saving env variable of test files

* Adding debug log

* Github action differ on windows

* adding debug

* another attempt

* Windows have different ways to receive env

* fixing template

* minor fix

* Adding debug

* Removing use of json

* Adding back fromJson

* adding toJson

* removing print

* another attempt

* disabling parallel run at least for testing

* installing docker for mac runner

* correcting docker install command

* Linux dockers are not supported in windows

* Removing mac changes

* Upgrading pytorch

* using lts pytorch

* Separating win and ubuntu

* Install java 11

* enabling linux container env

* docker cli command

* docker cli command

* start elastic service

* List all service

* correcting service name

* Attempt to fix multiple test run

* convert to json

* another attempt to check

* Updating build cache step

* attempt

* Add tika

* Separating windows CI

* Changing CI name

* Skipping test which does not work in windows

* Skipping tests for windows

* create cleanup function in conftest

* adding skipif marker on tests

* Run windows PR on only push to master

* Addressing review comments

* Enabling windows ci for this PR

* Tika init is being called when importing tika function

* handling tika import issue

* handling tika import issue in test

* Fixing import issue

* removing tika fixture

* Removing fixture from tests

* Disable windows ci on pull request

* Add back extra pytorch install step

Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
											
										
										
											2021-10-29 13:52:28 +05:30
+								def test_table_removal(Converter):
-												Add Tika Converter (#314)

											
										
										
											2020-08-17 11:21:09 +02:00
+								    converter = Converter(remove_numeric_tables=True)
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    document = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")[0]
-												Change return types of indexing pipeline nodes (#2342)

* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils to functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Document in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-29 13:53:35 +02:00
+								    pages = document.content.split("\f")
-												Add PDF text extraction (#109)


											
										
										
											2020-06-08 11:07:19 +02:00
+								    # assert numeric rows are removed from the table.
 								    assert "324" not in pages[0]
 								    assert "54x growth" not in pages[0]
-												[RAG] Integrate "Retrieval-Augmented Generation" with Haystack (#484)

* Adding dummy generator implementation

* Adding tutorial to try the model

* Committing current non working code

* Committing current update where we need to call generate function directly and need to convert embedding to tensor way

* Addressing review comments.

* Refactoring finder, and implementing rag_generator class.

* Refined the implementation of RAGGenerator and now it is in clean shape

* Renaming RAGGenerator to RAGenerator

* Reverting change from finder.py and addressing review comments

* Remove support for RagSequenceForGeneration

* Utilizing embed_passage function from DensePassageRetriever

* Adding sample test data to verify generator output

* Updating testing script

* Updating testing script

* Fixing bug related to top_k

* Updating latest farm dependency

* Comment out farm dependency

* Reverting changes from TransformersReader

* Adding transformers dataset to compare transformers and haystack generator implementation

* Using generator_encoder instead of question_encoder to generate context_input_ids

* Adding workaround to install FARM dependency from master branch

* Removing unnecessary changes

* Fixing generator test

* Removing transformers datasets

* Fixing generator test

* Some cleanup and updating TODO comments

* Adding tutorial notebook

* Updating tutorials with comments

* Explicitly passing token model in RAG test

* Addressing review comments

* Fixing notebook

* Refactoring tests to reduce memory footprint

* Split generator tests in separate ci step and before running it reclaim memory by terminating containers

* Moving tika dependent test to separate dir

* Remove unwanted code

* Bringing reader under session scope

* Farm is now session object hence restoring changes from default value

* Updating assert for pdf converter

* Dummy commit to trigger CI flow

* Reducing memory footprint required for generator tests

* Fixing mypy issues

* Marking test with tika and elasticsearch markers. Reverting changes in CI and pytest splits

* reducing changes

* Fixing CI

* changing elastic search ci

* Fixing test error

* Disabling return of embedding

* Marking generator test as well

* Refactoring tutorials

* Increasing ES memory to 750M

* Trying another fix for ES CI

* Reverting CI changes

* Splitting tests in CI

* Generator and non-generator markers split

* Adding pytest.ini to add markers and enable strict-markers option

* Reducing elastic search container memory

* Simplifying generator test by using documents with embedding directly

* Bump up farm to 0.5.0
											
										
										
											2020-10-30 18:06:02 +01:00
+								@pytest.mark.tika
-												Add Tika Converter (#314)

											
										
										
											2020-08-17 11:21:09 +02:00
+								@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
-												Add CI for windows runner (#1458)

* Feat: Removing use of temp file while downloading archive from url along with adding CI for windows and mac platform

* Windows CI by default installing pytorch gpu hence updating CI to pick cpu version

* fixing mac cache build issue

* updating windows pip install command for torch

* another attempt

* updating ci

* Adding sudo

* fixing ls failure on windows

* another attempt to fix build issue

* Saving env variable of test files

* Adding debug log

* Github action differ on windows

* adding debug

* another attempt

* Windows have different ways to receive env

* fixing template

* minor fix

* Adding debug

* Removing use of json

* Adding back fromJson

* adding toJson

* removing print

* another attempt

* disabling parallel run at least for testing

* installing docker for mac runner

* correcting docker install command

* Linux dockers are not supported in windows

* Removing mac changes

* Upgrading pytorch

* using lts pytorch

* Separating win and ubuntu

* Install java 11

* enabling linux container env

* docker cli command

* docker cli command

* start elastic service

* List all service

* correcting service name

* Attempt to fix multiple test run

* convert to json

* another attempt to check

* Updating build cache step

* attempt

* Add tika

* Separating windows CI

* Changing CI name

* Skipping test which does not work in windows

* Skipping tests for windows

* create cleanup function in conftest

* adding skipif marker on tests

* Run windows PR on only push to master

* Addressing review comments

* Enabling windows ci for this PR

* Tika init is being called when importing tika function

* handling tika import issue

* handling tika import issue in test

* Fixing import issue

* removing tika fixture

* Removing fixture from tests

* Disable windows ci on pull request

* Add back extra pytorch install step

Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
											
										
										
											2021-10-29 13:52:28 +05:30
+								def test_language_validation(Converter, caplog):
-												Add Tika Converter (#314)

											
										
										
											2020-08-17 11:21:09 +02:00
+								    converter = Converter(valid_languages=["en"])
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")
-												[CI Refactoring] Workflow refactoring (#2576)

* Unify CI tests (from #2466)

* Update Documentation & Code Style

* Change folder names

* Fix markers list

* Remove marker 'slow', replaced with 'integration'

* Soften children check

* Start ES first so it has time to boot while Python is setup

* Run the full workflow

* Try to make pip upgrade on Windows

* Set KG tests as integration

* Update Documentation & Code Style

* typo

* faster pylint

* Make Pylint use the cache

* filter diff files for pylint

* debug pylint statement

* revert pylint changes

* Remove path from asserted log (fails on Windows)

* Skip preprocessor test on Windows

* Tackling Windows specific failures

* Fix pytest command for windows suites

* Remove \ from command

* Move poppler test into integration

* Skip opensearch test on windows

* Add tolerance in reader sas score for Windows

* Another pytorch approx

* Raise time limit for unit tests :(

* Skip poppler test on Windows CI

* Specify to pull with FF only in docs check

* temporarily run the docs check immediately

* Allow merge commit for now

* Try without fetch depth

* Accelerating test

* Accelerating test

* Add repository and ref alongside fetch-depth

* Separate out code&docs check from tests

* Use setup-python cache

* Delete custom action

* Remove the pull step in the docs check, will find a way to run on bot commits

* Add requirements.txt in .github for caching

* Actually install dependencies

* Change deps group for pylint

* Unclear why the requirements.txt is still required :/

* Fix the code check python setup

* Install all deps for pylint

* Make the autoformat check depend on tests and doc updates workflows

* Try installing dependencies in another order

* Try again to install the deps

* quoting the paths

* Add back the requirements

* Try again to install rest_api and ui

* Change deps group

* Duplicate haystack install line

* See if the cache is the problem

* Disable also in mypy, who knows

* split the install step

* Split install step everywhere

* Revert "Separate out code&docs check from tests"

This reverts commit 1cd59b15ffc5b984e1d642dcbf4c8ccc2bb6c9bd.

* Add back the action

* Proactive support for audio (see text2speech branch)

* Fix label generator tests

* Remove install of libsndfile1 on win temporarily

* exclude audio tests on win

* install ffmpeg for integration tests

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-06-07 09:23:03 +02:00
+								    assert "sample_pdf_1.pdf is not one of ['en']." not in caplog.text
-												Add PDF text extraction (#109)


											
										
										
											2020-06-08 11:07:19 +02:00
-												Add Tika Converter (#314)

											
										
										
											2020-08-17 11:21:09 +02:00
+								    converter = Converter(valid_languages=["de"])
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")
-												[CI Refactoring] Workflow refactoring (#2576)

* Unify CI tests (from #2466)

* Update Documentation & Code Style

* Change folder names

* Fix markers list

* Remove marker 'slow', replaced with 'integration'

* Soften children check

* Start ES first so it has time to boot while Python is setup

* Run the full workflow

* Try to make pip upgrade on Windows

* Set KG tests as integration

* Update Documentation & Code Style

* typo

* faster pylint

* Make Pylint use the cache

* filter diff files for pylint

* debug pylint statement

* revert pylint changes

* Remove path from asserted log (fails on Windows)

* Skip preprocessor test on Windows

* Tackling Windows specific failures

* Fix pytest command for windows suites

* Remove \ from command

* Move poppler test into integration

* Skip opensearch test on windows

* Add tolerance in reader sas score for Windows

* Another pytorch approx

* Raise time limit for unit tests :(

* Skip poppler test on Windows CI

* Specify to pull with FF only in docs check

* temporarily run the docs check immediately

* Allow merge commit for now

* Try without fetch depth

* Accelerating test

* Accelerating test

* Add repository and ref alongside fetch-depth

* Separate out code&docs check from tests

* Use setup-python cache

* Delete custom action

* Remove the pull step in the docs check, will find a way to run on bot commits

* Add requirements.txt in .github for caching

* Actually install dependencies

* Change deps group for pylint

* Unclear why the requirements.txt is still required :/

* Fix the code check python setup

* Install all deps for pylint

* Make the autoformat check depend on tests and doc updates workflows

* Try installing dependencies in another order

* Try again to install the deps

* quoting the paths

* Add back the requirements

* Try again to install rest_api and ui

* Change deps group

* Duplicate haystack install line

* See if the cache is the problem

* Disable also in mypy, who knows

* split the install step

* Split install step everywhere

* Revert "Separate out code&docs check from tests"

This reverts commit 1cd59b15ffc5b984e1d642dcbf4c8ccc2bb6c9bd.

* Add back the action

* Proactive support for audio (see text2speech branch)

* Fix label generator tests

* Remove install of libsndfile1 on win temporarily

* exclude audio tests on win

* install ffmpeg for integration tests

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-06-07 09:23:03 +02:00
+								    assert "sample_pdf_1.pdf is not one of ['de']." in caplog.text
-												Add PDF text extraction (#109)


											
										
										
											2020-06-08 11:07:19 +02:00
-												Revamp CI (#825)


											
										
										
											2021-02-12 13:38:54 +01:00
+								def test_docx_converter():
 								    converter = DocxToTextConverter()
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    document = converter.convert(file_path=SAMPLES_PATH / "docx" / "sample_docx.docx")[0]
-												Change return types of indexing pipeline nodes (#2342)

* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils to functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Document in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-29 13:53:35 +02:00
+								    assert document.content.startswith("Sample Docx File")
-												Add Markdown file convertor (#875)


											
										
										
											2021-03-23 16:31:26 +01:00
 								def test_markdown_converter():
 								    converter = MarkdownConverter()
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    document = converter.convert(file_path=SAMPLES_PATH / "markdown" / "sample.md")[0]
-												fix: removing code block in `MarkdownConverter` (#3960)

* first attempt to add frontmatter of markdown to the metadata

* remove bug fix

* running black and pre-commit

* moving the import line

* adding a test

* adding pydoc

* fix to removing code blocks in markdown converter

* adding a test

* fixing a test

* improving tests

* adding language to code block
											
										
										
											2023-01-27 15:25:54 +01:00
+								    assert document.content.startswith("\nWhat to build with Haystack")
 								    assert "# git clone https://github.com/deepset-ai/haystack.git" not in document.content
-												Add AzureConverter to support table parsing from documents (#1813)

* Add FormRecognizerConverter

* Change signature of convert method + change return type of all converters

* Adapt preprocessing util to new return type of converters

* Parametrize number of lines used for surrounding context of table

* Change name from FormRecognizerConverter to AzureConverter

* Set version of azure-ai-formrecognizer package

* Change tutorial 8 based on new return type of converters

* Add tests

* Add latest docstring and tutorial changes

* Fix typo

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
											
										
										
											2021-11-29 18:44:20 +01:00
-												feat: Extraction of headlines in markdown files (#3445)

* Extract headings from markdown files + adapt PreProcessor

* Add tests

* Fix mypy

* Generate JSON schema

* Apply suggestions from code review

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>

* Update haystack/nodes/file_converter/markdown.py

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>

* Apply black

* Add PR feedback

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>
											
										
										
											2022-10-26 11:57:55 +02:00
def test_markdown_converter_headline_extraction():
    """Convert sample.md with headline extraction enabled and verify the reported headlines.

    For every extracted headline the test checks its text, its level, and that
    ``start_idx`` points at the headline's position inside the converted content.
    """
    # (headline text, level) pairs, in the order they are expected in the metadata.
    expected_headlines = [
        ("What to build with Haystack", 1),
        ("Core Features", 1),
        ("Quick Demo", 1),
        ("2nd level headline for testing purposes", 2),
        ("3rd level headline for testing purposes", 3),
    ]

    converter = MarkdownConverter(extract_headlines=True, remove_code_snippets=False)
    converted = converter.convert(file_path=SAMPLES_PATH / "markdown" / "sample.md")
    document = converted[0]
    headlines = document.meta["headlines"]

    # The number of extracted headlines must be correct before comparing them
    # pairwise, because zip() would silently stop at the shorter sequence.
    assert len(headlines) == 5

    for found, expected in zip(headlines, expected_headlines):
        expected_text, expected_level = expected

        # Headline text and level must match the expectation.
        assert found["headline"] == expected_text
        assert found["level"] == expected_level

        # Slicing the content at start_idx must reproduce the headline itself.
        begin = found["start_idx"]
        end = begin + len(found["headline"])
        assert found["headline"] == document.content[begin:end]
-												feat: add frontmatter to meta in `MarkdownConverter` (#3953)

* first attempt to add frontmatter of markdown to the metadata

* remove bug fix

* running black and pre-commit

* moving the import line

* adding a test

* adding pydoc
											
										
										
											2023-01-26 17:15:02 +01:00
def test_markdown_converter_frontmatter_to_meta():
    """Convert sample.md with ``add_frontmatter_to_meta=True`` and verify the resulting meta.

    The first converted document is expected to carry ``type`` and ``date``
    entries in its ``meta`` dictionary.
    """
    converter = MarkdownConverter(add_frontmatter_to_meta=True)
    sample_file = SAMPLES_PATH / "markdown" / "sample.md"
    meta = converter.convert(file_path=sample_file)[0].meta

    assert meta["type"] == "intro"
    assert meta["date"] == "1.1.2023"
-												fix: removing code block in `MarkdownConverter` (#3960)

* first attempt to add frontmatter of markdown to the metadata

* remove bug fix

* running black and pre-commit

* moving the import line

* adding a test

* adding pydoc

* fix to removing code blocks in markdown converter

* adding a test

* fixing a test

* improving tests

* adding language to code block
											
										
										
											2023-01-27 15:25:54 +01:00
def test_markdown_converter_remove_code_snippets():
    """Convert sample.md with ``remove_code_snippets=False`` and verify how the content starts.

    With snippet removal switched off, the converted text is expected to begin
    with the ``pip install farm-haystack`` command.
    """
    converter = MarkdownConverter(remove_code_snippets=False)
    converted = converter.convert(file_path=SAMPLES_PATH / "markdown" / "sample.md")
    first_document = converted[0]

    assert first_document.content.startswith("pip install farm-haystack")
-												Add AzureConverter to support table parsing from documents (#1813)

* Add FormRecognizerConverter

* Change signature of convert method + change return type of all converters

* Adapt preprocessing util to new return type of converters

* Parametrize number of lines used for surrounding context of table

* Change name from FormRecognizerConverter to AzureConverter

* Set version of azure-ai-formrecognizer package

* Change tutorial 8 based on new return type of converters

* Add tests

* Add latest docstring and tutorial changes

* Fix typo

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
											
										
										
											2021-11-29 18:44:20 +01:00
+								def test_azure_converter():
 								    # Check if Form Recognizer endpoint and credential key in environment variables
 								    if "AZURE_FORMRECOGNIZER_ENDPOINT" in os.environ and "AZURE_FORMRECOGNIZER_KEY" in os.environ:
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								        converter = AzureConverter(
 								            endpoint=os.environ["AZURE_FORMRECOGNIZER_ENDPOINT"],
 								            credential_key=os.environ["AZURE_FORMRECOGNIZER_KEY"],
 								            save_json=True,
 								        )
-												Add AzureConverter to support table parsing from documents (#1813)

* Add FormRecognizerConverter

* Change signature of convert method + change return type of all converters

* Adapt preprocessing util to new return type of converters

* Parametrize number of lines used for surrounding context of table

* Change name from FormRecognizerConverter to AzureConverter

* Set version of azure-ai-formrecognizer package

* Change tutorial 8 based on new return type of converters

* Add tests

* Add latest docstring and tutorial changes

* Fix typo

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
											
										
										
											2021-11-29 18:44:20 +01:00
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								        docs = converter.convert(file_path=SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf")
-												Add AzureConverter to support table parsing from documents (#1813)

* Add FormRecognizerConverter

* Change signature of convert method + change return type of all converters

* Adapt preprocessing util to new return type of converters

* Parametrize number of lines used for surrounding context of table

* Change name from FormRecognizerConverter to AzureConverter

* Set version of azure-ai-formrecognizer package

* Change tutorial 8 based on new return type of converters

* Add tests

* Add latest docstring and tutorial changes

* Fix typo

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
											
										
										
											2021-11-29 18:44:20 +01:00
+								        assert len(docs) == 2
-												Change return types of indexing pipeline nodes (#2342)

* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils to functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Document in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-29 13:53:35 +02:00
+								        assert docs[0].content_type == "table"
 								        assert docs[0].content.shape[0] == 4  # number of rows
 								        assert docs[0].content.shape[1] == 5  # number of columns, Form Recognizer assumes there are 5 columns
 								        assert list(docs[0].content.columns) == ["", "Column 1", "", "Column 2", "Column 3"]
 								        assert list(docs[0].content.iloc[3]) == ["D", "$54.35", "", "$6345.", ""]
-												Fix surrounding context extraction in `ParsrConverter`  (#2162)

* Fix surrounding context extraction

* Update Documentation & Code Style

* Unify Parsr and Azure + add test

* Update Documentation & Code Style

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-24 14:58:36 +01:00
+								        assert (
-												Change return types of indexing pipeline nodes (#2342)

* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils to functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Document in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-29 13:53:35 +02:00
+								            docs[0].meta["preceding_context"] == "specification. These proprietary technologies are not "
-												Fix surrounding context extraction in `ParsrConverter`  (#2162)

* Fix surrounding context extraction

* Update Documentation & Code Style

* Unify Parsr and Azure + add test

* Update Documentation & Code Style

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-24 14:58:36 +01:00
+								            "standardized and their\nspecification is published only on "
 								            "Adobe's website. Many of them are also not\nsupported by "
 								            "popular third-party implementations of PDF."
 								        )
-												Change return types of indexing pipeline nodes (#2342)

* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils to functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Document in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-29 13:53:35 +02:00
+								        assert docs[0].meta["following_context"] == ""
-												feat: Add page number to Documents coming from PDFConverters and PreProcessor (#2932)

* Add page number to Documents coming from PDFConverters and PreProcessor

* Fix mypy

* Update API Docs

* Update API Docs

* Remove unused imports

* Generate JSON schema

* Generate JSON schema

* Make test variable shorter

* Make regex a separate function

* Move counting of page breaks to a function

* Generate JSON schema

* Apply suggestions from code review

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>

* Update API Documentation

* Don't create instance for testing staticmethod

* Update haystack/nodes/preprocessor/preprocessor.py

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>
											
										
										
											2022-08-09 15:55:27 +02:00
+								        assert docs[0].meta["page"] == 1
-												Add AzureConverter to support table parsing from documents (#1813)

* Add FormRecognizerConverter

* Change signature of convert method + change return type of all converters

* Adapt preprocessing util to new return type of converters

* Parametrize number of lines used for surrounding context of table

* Change name from FormRecognizerConverter to AzureConverter

* Set version of azure-ai-formrecognizer package

* Change tutorial 8 based on new return type of converters

* Add tests

* Add latest docstring and tutorial changes

* Fix typo

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
											
										
										
											2021-11-29 18:44:20 +01:00
-												Change return types of indexing pipeline nodes (#2342)

* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils to functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Document in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-29 13:53:35 +02:00
+								        assert docs[1].content_type == "text"
 								        assert docs[1].content.startswith("A sample PDF file")
-												Add ParsrConverter (#1931)

* Add ParsrConverter

* Fix typing error + add Parsr to Linux CI

* Fix valid_language for all converters + fix context generation for ParsrConverter

* Remove ParsrConverter test from WindowsCI

* Add latest docstring and tutorial changes

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2021-12-30 10:15:11 +01:00
-												[CI Refactoring] Workflow refactoring (#2576)

* Unify CI tests (from #2466)

* Update Documentation & Code Style

* Change folder names

* Fix markers list

* Remove marker 'slow', replaced with 'integration'

* Soften children check

* Start ES first so it has time to boot while Python is setup

* Run the full workflow

* Try to make pip upgrade on Windows

* Set KG tests as integration

* Update Documentation & Code Style

* typo

* faster pylint

* Make Pylint use the cache

* filter diff files for pylint

* debug pylint statement

* revert pylint changes

* Remove path from asserted log (fails on Windows)

* Skip preprocessor test on Windows

* Tackling Windows specific failures

* Fix pytest command for windows suites

* Remove \ from command

* Move poppler test into integration

* Skip opensearch test on windows

* Add tolerance in reader sas score for Windows

* Another pytorch approx

* Raise time limit for unit tests :(

* Skip poppler test on Windows CI

* Specify to pull with FF only in docs check

* temporarily run the docs check immediately

* Allow merge commit for now

* Try without fetch depth

* Accelerating test

* Accelerating test

* Add repository and ref alongside fetch-depth

* Separate out code&docs check from tests

* Use setup-python cache

* Delete custom action

* Remove the pull step in the docs check, will find a way to run on bot commits

* Add requirements.txt in .github for caching

* Actually install dependencies

* Change deps group for pylint

* Unclear why the requirements.txt is still required :/

* Fix the code check python setup

* Install all deps for pylint

* Make the autoformat check depend on tests and doc updates workflows

* Try installing dependencies in another order

* Try again to install the deps

* quoting the paths

* Add back the requirements

* Try again to install rest_api and ui

* Change deps group

* Duplicate haystack install line

* See if the cache is the problem

* Disable also in mypy, who knows

* split the install step

* Split install step everywhere

* Revert "Separate out code&docs check from tests"

This reverts commit 1cd59b15ffc5b984e1d642dcbf4c8ccc2bb6c9bd.

* Add back the action

* Proactive support for audio (see text2speech branch)

* Fix label generator tests

* Remove install of libsndfile1 on win temporarily

* exclude audio tests on win

* install ffmpeg for integration tests

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-06-07 09:23:03 +02:00
+								@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Parsr not running on Windows CI")
-												Add ParsrConverter (#1931)

* Add ParsrConverter

* Fix typing error + add Parsr to Linux CI

* Fix valid_language for all converters + fix context generation for ParsrConverter

* Remove ParsrConverter test from WindowsCI

* Add latest docstring and tutorial changes

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2021-12-30 10:15:11 +01:00
+								def test_parsr_converter():
 								    converter = ParsrConverter()
-												Apply black formatting (#2115)

* Testing black on ui/

* Applying black on docstores

* Add latest docstring and tutorial changes

* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too

* Remove comments

* Relax constraints on pydoc-markdown

* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade

* Fix a couple of bugs

* Add a type: ignore that was missing somehow

* Give path to black

* Apply Black

* Apply Black

* Relocate a couple of type: ignore

* Update documentation

* Make Linux CI run after applying Black

* Triggering Black

* Apply Black

* Remove dependency, does not work well

* Remove manually double trailing commas

* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-03 13:43:18 +01:00
+								    docs = converter.convert(file_path=str((SAMPLES_PATH / "pdf" / "sample_pdf_1.pdf").absolute()))
-												Add ParsrConverter (#1931)

* Add ParsrConverter

* Fix typing error + add Parsr to Linux CI

* Fix valid_language for all converters + fix context generation for ParsrConverter

* Remove ParsrConverter test from WindowsCI

* Add latest docstring and tutorial changes

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2021-12-30 10:15:11 +01:00
+								    assert len(docs) == 2
-												Change return types of indexing pipeline nodes (#2342)

* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils to functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Document in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-29 13:53:35 +02:00
+								    assert docs[0].content_type == "table"
 								    assert docs[0].content.shape[0] == 4  # number of rows
 								    assert docs[0].content.shape[1] == 4
 								    assert list(docs[0].content.columns) == ["", "Column 1", "Column 2", "Column 3"]
 								    assert list(docs[0].content.iloc[3]) == ["D", "$54.35", "$6345.", ""]
-												Fix surrounding context extraction in `ParsrConverter`  (#2162)

* Fix surrounding context extraction

* Update Documentation & Code Style

* Unify Parsr and Azure + add test

* Update Documentation & Code Style

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-24 14:58:36 +01:00
+								    assert (
-												Change return types of indexing pipeline nodes (#2342)

* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils to functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Document in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-29 13:53:35 +02:00
+								        docs[0].meta["preceding_context"] == "speciﬁcation. These proprietary technologies are not "
-												Fix surrounding context extraction in `ParsrConverter`  (#2162)

* Fix surrounding context extraction

* Update Documentation & Code Style

* Unify Parsr and Azure + add test

* Update Documentation & Code Style

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-02-24 14:58:36 +01:00
+								        "standardized and their\nspeciﬁcation is published only on "
 								        "Adobe's website. Many of them are also not\nsupported by popular "
 								        "third-party implementations of PDF."
 								    )
-												Change return types of indexing pipeline nodes (#2342)

* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils to functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Document in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-29 13:53:35 +02:00
+								    assert docs[0].meta["following_context"] == ""
-												feat: Add page number to Documents coming from PDFConverters and PreProcessor (#2932)

* Add page number to Documents coming from PDFConverters and PreProcessor

* Fix mypy

* Update API Docs

* Update API Docs

* Remove unused imports

* Generate JSON schema

* Generate JSON schema

* Make test variable shorter

* Make regex a separate function

* Move counting of page breaks to a function

* Generate JSON schema

* Apply suggestions from code review

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>

* Update API Documentation

* Don't create instance for testing staticmethod

* Update haystack/nodes/preprocessor/preprocessor.py

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>
											
										
										
											2022-08-09 15:55:27 +02:00
+								    assert docs[0].meta["page"] == 1
-												Add ParsrConverter (#1931)

* Add ParsrConverter

* Fix typing error + add Parsr to Linux CI

* Fix valid_language for all converters + fix context generation for ParsrConverter

* Remove ParsrConverter test from WindowsCI

* Add latest docstring and tutorial changes

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2021-12-30 10:15:11 +01:00
-												Change return types of indexing pipeline nodes (#2342)

* Change return types of file converters

* Change return types of preprocessor

* Change return types of crawler

* Adapt utils to functions to new return types

* Adapt __init__.py to new method names

* Prevent circular imports

* Update Documentation & Code Style

* Let DocStores' run method accept Documents

* Adapt tests to new return types

* Update Documentation & Code Style

* Put "# type: ignore" to right place

* Remove id_hash_keys property from Document primitive

* Update Documentation & Code Style

* Adapt tests to new return types and missing id_hash_keys property

* Fix mypy

* Fix mypy

* Adapt PDFToTextOCRConverter

* Remove id_hash_keys from RestAPI tests

* Update Documentation & Code Style

* Rename tests

* Remove redundant setting of content_type="text"

* Add DeprecationWarning

* Add id_hash_keys to elasticsearch_index_to_document_store

* Change document type from dict to Document in PreProcessor test

* Fix file path in Tutorial 5

* Remove added output in Tutorial 5

* Update Documentation & Code Style

* Fix file_paths in Tutorial 9 + fix gz files in fetch_archive_from_http

* Adapt tutorials to new return types

* Adapt tutorial 14 to new return types

* Update Documentation & Code Style

* Change assertions to HaystackErrors

* Import HaystackError correctly

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-03-29 13:53:35 +02:00
+								    assert docs[1].content_type == "text"
 								    assert docs[1].content.startswith("A sample PDF ﬁle")
 								    assert docs[1].content.endswith("Page 4 of Sample PDF\n… the page 3 is empty.")
-												Fix using id_hash_keys as pipeline params (#2717)

* Fix using id_hash_keys as pipeline params

* Update Documentation & Code Style

* add tests

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-06-24 09:55:09 +02:00
-												feat: Add headline extraction to `ParsrConverter` (#3488)

* Add headline extraction to ParsrConverter

* Add sample PDF file

* Add test

* Use extract_headlines if set in convert method

* Integrate PR feedback
											
										
										
											2022-10-31 19:00:02 +01:00
+								@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Parsr not running on Windows CI")
 								def test_parsr_converter_headline_extraction():
 								    """Convert ``sample_pdf_4.pdf`` with ``ParsrConverter`` and compare the extracted headlines.
 								    Every entry of a document's ``meta["headlines"]`` is compared pairwise with the expected
 								    ``(headline, level)`` tuple; for documents of content type ``"text"`` the ``start_idx`` must
 								    additionally point at the headline inside ``doc.content``.
 								    """
 								    # Expected (headline, level) pairs, one list per returned document, in output order.
 								    first_doc_headlines = [
 								        ("Lorem ipsum", 1),
 								        ("Cras fringilla ipsum magna, in fringilla dui commodo\na.", 2),
 								    ]
 								    second_doc_headlines = [
 								        ("Lorem ipsum", 1),
 								        ("Lorem ipsum dolor sit amet, consectetur adipiscing\nelit. Nunc ac faucibus odio.", 2),
 								        ("Cras fringilla ipsum magna, in fringilla dui commodo\na.", 2),
 								        ("Lorem ipsum dolor sit amet, consectetur adipiscing\nelit.", 2),
 								        ("Maecenas mauris lectus, lobortis et purus mattis, blandit\ndictum tellus.", 2),
 								        ("In eleifend velit vitae libero sollicitudin euismod.", 2),
 								    ]
 								    expected_headlines = [first_doc_headlines, second_doc_headlines]
 								    parsr = ParsrConverter()
 								    pdf_file = (SAMPLES_PATH / "pdf" / "sample_pdf_4.pdf").absolute()
 								    docs = parsr.convert(file_path=str(pdf_file))
 								    assert len(docs) == 2
 								    # NOTE(review): zip() stops at the shorter sequence, so missing or surplus headlines
 								    # are not detected by the loops below.
 								    for doc, expectation in zip(docs, expected_headlines):
 								        for actual, (headline_text, headline_level) in zip(doc.meta["headlines"], expectation):
 								            # The extracted headline and its level must match the expectation.
 								            assert actual["headline"] == headline_text
 								            assert actual["level"] == headline_level
 								            if doc.content_type != "text":
 								                continue
 								            # For text documents, start_idx must locate the headline in the content.
 								            begin = actual["start_idx"]
 								            end = begin + len(actual["headline"])
 								            assert actual["headline"] == doc.content[begin:end]
-												Fix using id_hash_keys as pipeline params (#2717)

* Fix using id_hash_keys as pipeline params

* Update Documentation & Code Style

* add tests

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
											
										
										
											2022-06-24 09:55:09 +02:00
+								def test_id_hash_keys_from_pipeline_params():
 								    doc_path = SAMPLES_PATH / "docs" / "doc_1.txt"
 								    meta_1 = {"key": "a"}
 								    meta_2 = {"key": "b"}
 								    meta = [meta_1, meta_2]
 								    converter = TextConverter()
 								    output, _ = converter.run(file_paths=[doc_path, doc_path], meta=meta, id_hash_keys=["content", "meta"])
 								    documents = output["documents"]
 								    unique_ids = set(d.id for d in documents)
 								    assert len(documents) == 2
 								    assert len(unique_ids) == 2
-												feat: Add `CsvTextConverter` (#3587)

* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline

Fixes #3550, allow user to build full FAQ using YAML pipeline description and with CSV import and indexing.

* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline

Fix linter issues mypy and pylint.

* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline

Fix linter issues mypy.

* implement proposal's feedback

* tidy up for merge

* use BaseConverter

* use BaseConverter

* pylint

* black

* Revert "black"

This reverts commit e1c45cb1848408bd52a630328750cb67c8eb7110.

* black

* add check for column names

* add check for column names

* add tests

* fix tests

* address lists of paths

* typo

* remove duplicate line

Co-authored-by: ZanSara <sarazanzo94@gmail.com>
											
										
										
											2023-01-23 15:56:36 +01:00
 								def write_as_csv(data: List[List[str]], file_path: Path):
 								    """Write ``data`` to ``file_path`` as a CSV file, one inner list per row.
 								    :param data: Rows to write; each inner list holds the cell values of one row.
 								    :param file_path: Destination file. It is created or overwritten.
 								    """
 								    # newline="" is required by the csv module: the writer emits its own "\r\n" row
 								    # terminator, and newline translation would turn it into "\r\r\n" on Windows.
 								    # An explicit encoding keeps the written bytes independent of the locale.
 								    with open(file_path, "w", newline="", encoding="utf-8") as f:
 								        csv.writer(f).writerows(data)
 								@pytest.mark.integration
 								def test_csv_to_document_with_qa_headers(tmp_path):
 								    """A CSV with 'question' and 'answer' headers yields one Document per data row.
 								    The question becomes the document content and the answer is stored in ``meta["answer"]``.
 								    """
 								    converter = CsvTextConverter()
 								    source_file = tmp_path / "csv_qa_with_headers.csv"
 								    write_as_csv(
 								        [
 								            ["question", "answer"],
 								            ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
 								        ],
 								        source_file,
 								    )
 								    result, edge_name = converter.run(file_paths=source_file)
 								    assert edge_name == "output_1"
 								    assert "documents" in result
 								    converted = result["documents"]
 								    assert len(converted) == 1
 								    only_doc = converted[0]
 								    assert isinstance(only_doc, Document)
 								    assert only_doc.content == "What is Haystack ?"
 								    assert only_doc.meta["answer"] == "Haystack is an NLP Framework to use transformers in your Applications."
 								@pytest.mark.integration
 								def test_csv_to_document_with_wrong_qa_headers(tmp_path):
 								    """A CSV whose header row is ``wrong,headers`` must be rejected with a ``ValueError``."""
 								    converter = CsvTextConverter()
 								    source_file = tmp_path / "csv_qa_with_wrong_headers.csv"
 								    header_row = ["wrong", "headers"]
 								    data_row = ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."]
 								    write_as_csv([header_row, data_row], source_file)
 								    expected_message = "The CSV must contain two columns named 'question' and 'answer'"
 								    with pytest.raises(ValueError, match=expected_message):
 								        converter.run(file_paths=source_file)
 								@pytest.mark.integration
 								def test_csv_to_document_with_one_wrong_qa_headers(tmp_path):
 								    """A CSV whose header row is ``wrong,answers`` must be rejected with a ``ValueError``."""
 								    converter = CsvTextConverter()
 								    source_file = tmp_path / "csv_qa_with_wrong_headers.csv"
 								    header_row = ["wrong", "answers"]
 								    data_row = ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."]
 								    write_as_csv([header_row, data_row], source_file)
 								    expected_message = "The CSV must contain two columns named 'question' and 'answer'"
 								    with pytest.raises(ValueError, match=expected_message):
 								        converter.run(file_paths=source_file)
 								@pytest.mark.integration
 								def test_csv_to_document_with_another_wrong_qa_headers(tmp_path):
 								    """A CSV whose header row is ``question,wrong`` must be rejected with a ``ValueError``."""
 								    converter = CsvTextConverter()
 								    source_file = tmp_path / "csv_qa_with_wrong_headers.csv"
 								    header_row = ["question", "wrong"]
 								    data_row = ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."]
 								    write_as_csv([header_row, data_row], source_file)
 								    expected_message = "The CSV must contain two columns named 'question' and 'answer'"
 								    with pytest.raises(ValueError, match=expected_message):
 								        converter.run(file_paths=source_file)
 								@pytest.mark.integration
 								def test_csv_to_document_with_one_column(tmp_path):
 								    """A CSV that only has a ``question`` column must be rejected with a ``ValueError``."""
 								    converter = CsvTextConverter()
 								    source_file = tmp_path / "csv_qa_with_wrong_headers.csv"
 								    single_column_rows = [
 								        ["question"],
 								        ["What is Haystack ?"],
 								    ]
 								    write_as_csv(single_column_rows, source_file)
 								    expected_message = "The CSV must contain two columns named 'question' and 'answer'"
 								    with pytest.raises(ValueError, match=expected_message):
 								        converter.run(file_paths=source_file)
 								@pytest.mark.integration
 								def test_csv_to_document_with_three_columns(tmp_path):
 								    """A CSV with an extra ``notes`` column next to question/answer must be rejected with a ``ValueError``."""
 								    converter = CsvTextConverter()
 								    source_file = tmp_path / "csv_qa_with_wrong_headers.csv"
 								    header_row = ["question", "answer", "notes"]
 								    data_row = [
 								        "What is Haystack ?",
 								        "Haystack is an NLP Framework to use transformers in your Applications.",
 								        "verified",
 								    ]
 								    write_as_csv([header_row, data_row], source_file)
 								    expected_message = "The CSV must contain two columns named 'question' and 'answer'"
 								    with pytest.raises(ValueError, match=expected_message):
 								        converter.run(file_paths=source_file)
 								@pytest.mark.integration
 								def test_csv_to_document_many_files(tmp_path):
 								    """Convert several CSV files in a single ``run`` call and check the resulting documents.
 								    Each file holds one question/answer row numbered with its index; the returned documents
 								    are expected in the same order as the paths that were passed in.
 								    """
 								    file_count = 5
 								    # A single converter handles the whole list of paths, so it is created once
 								    # up front rather than once per generated file.
 								    node = CsvTextConverter()
 								    csv_paths = []
 								    for i in range(file_count):
 								        csv_path = tmp_path / f"{i}_csv_qa_with_headers.csv"
 								        csv_paths.append(csv_path)
 								        rows = [
 								            ["question", "answer"],
 								            [
 								                f"{i}. What is Haystack ?",
 								                f"{i}. Haystack is an NLP Framework to use transformers in your Applications.",
 								            ],
 								        ]
 								        write_as_csv(rows, csv_path)
 								    output, edge = node.run(file_paths=csv_paths)
 								    assert edge == "output_1"
 								    assert "documents" in output
 								    assert len(output["documents"]) == file_count
 								    for i, doc in enumerate(output["documents"]):
 								        assert isinstance(doc, Document)
 								        assert doc.content == f"{i}. What is Haystack ?"
 								        assert doc.meta["answer"] == f"{i}. Haystack is an NLP Framework to use transformers in your Applications."