haystack/test/components/converters/test_json.py

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import json
from unittest.mock import patch
from pathlib import Path
import logging

import pytest

from haystack.components.converters import JSONConverter
from haystack.dataclasses import ByteStream


# Three sample prize records, each with a "year", a "category" and a list of
# "laureates". The tests below serialize individual entries with json.dumps
# to build the file and ByteStream inputs handed to JSONConverter.
test_data = [
    {
        "year": "1997",
        "category": "literature",
        "laureates": [
            {
                "id": "674",
                "firstname": "Dario",
                "surname": "Fo",
                "motivation": "who emulates the jesters of the Middle Ages in scourging authority and upholding the dignity of the downtrodden",
                "share": "1",
            }
        ],
    },
    {
        "year": "1986",
        "category": "medicine",
        "laureates": [
            {
                "id": "434",
                "firstname": "Stanley",
                "surname": "Cohen",
                "motivation": "for their discoveries of growth factors",
                "share": "2",
            },
            {
                "id": "435",
                "firstname": "Rita",
                "surname": "Levi-Montalcini",
                "motivation": "for their discoveries of growth factors",
                "share": "2",
            },
        ],
    },
    {
        "year": "1938",
        "category": "physics",
        "laureates": [
            {
                "id": "46",
                "firstname": "Enrico",
                "surname": "Fermi",
                "motivation": "for his demonstrations of the existence of new radioactive elements produced by neutron irradiation, and for his related discovery of nuclear reactions brought about by slow neutrons",
                "share": "1",
            }
        ],
    },
]


def test_init_without_jq_schema_and_content_key():
    """Constructing a JSONConverter without any arguments must raise a ValueError."""
    # `match` is treated as a regular expression (re.search), so the literal
    # periods are escaped to stop them from matching arbitrary characters.
    expected_message = r"No `jq_schema` nor `content_key` specified\. Set either or both to extract data\."
    with pytest.raises(ValueError, match=expected_message):
        JSONConverter()


@patch("haystack.components.converters.json.jq_import")
def test_init_without_jq_schema_and_missing_dependency(jq_import):
    """With only `content_key` given, the patched `jq_import.check` is never called."""
    json_converter = JSONConverter(content_key="foo")

    jq_import.check.assert_not_called()

    # Only the content key is populated; the other settings stay unset.
    assert json_converter._content_key == "foo"
    assert json_converter._jq_schema is None
    assert json_converter._meta_fields is None


@patch("haystack.components.converters.json.jq_import")
def test_init_with_jq_schema_and_missing_dependency(jq_import):
    """If `jq_import.check` raises ImportError, building a converter with a `jq_schema` raises it too."""
    schema = ".laureates[].motivation"
    jq_import.check.side_effect = ImportError

    with pytest.raises(ImportError):
        JSONConverter(jq_schema=schema)


def test_init_with_jq_schema():
    """Passing only `jq_schema` stores it and leaves the content key and meta fields unset."""
    identity_filter = "."
    json_converter = JSONConverter(jq_schema=identity_filter)

    assert json_converter._content_key is None
    assert json_converter._meta_fields is None
    assert json_converter._jq_schema == "."


def test_to_dict():
    """`to_dict` returns the component type together with the init parameters that were supplied."""
    serialized = JSONConverter(
        jq_schema=".laureates[]", content_key="motivation", extra_meta_fields={"firstname", "surname"}
    ).to_dict()

    expected_init_parameters = {
        "content_key": "motivation",
        "jq_schema": ".laureates[]",
        "extra_meta_fields": {"firstname", "surname"},
    }
    expected = {
        "type": "haystack.components.converters.json.JSONConverter",
        "init_parameters": expected_init_parameters,
    }
    assert serialized == expected


def test_from_dict():
    """`from_dict` rebuilds a converter whose private settings mirror the serialized init parameters."""
    init_parameters = {
        "content_key": "motivation",
        "jq_schema": ".laureates[]",
        "extra_meta_fields": ["firstname", "surname"],
    }
    serialized = {"type": "haystack.components.converters.json.JSONConverter", "init_parameters": init_parameters}

    restored = JSONConverter.from_dict(serialized)

    assert restored._jq_schema == ".laureates[]"
    assert restored._content_key == "motivation"
    assert restored._meta_fields == ["firstname", "surname"]


def test_run(tmpdir):
    """Run a jq filter over a str path, a Path and a ByteStream; expect one document per laureate."""
    path_a = Path(tmpdir) / "first_test_file.json"
    path_b = Path(tmpdir) / "second_test_file.json"
    path_a.write_text(json.dumps(test_data[0]), encoding="utf-8")
    path_b.write_text(json.dumps(test_data[1]), encoding="utf-8")
    stream = ByteStream.from_string(json.dumps(test_data[2]))

    jq_filter = '.laureates[] | .firstname + " " + .surname + " " + .motivation'
    output = JSONConverter(jq_schema=jq_filter).run(sources=[str(path_a), path_b, stream])

    assert len(output) == 1
    documents = output["documents"]
    assert len(documents) == 4

    # (content, meta) pairs in the order the sources were supplied; the
    # document built from the ByteStream is expected to carry no metadata.
    expected = [
        (
            "Dario Fo who emulates the jesters of the Middle Ages in scourging authority and "
            "upholding the dignity of the downtrodden",
            {"file_path": str(path_a)},
        ),
        ("Stanley Cohen for their discoveries of growth factors", {"file_path": str(path_b)}),
        ("Rita Levi-Montalcini for their discoveries of growth factors", {"file_path": str(path_b)}),
        (
            "Enrico Fermi for his demonstrations of the existence of new "
            "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
            "reactions brought about by slow neutrons",
            {},
        ),
    ]
    for document, (content, meta) in zip(documents, expected):
        assert document.content == content
        assert document.meta == meta


def test_run_with_non_json_file(tmpdir, caplog):
    """A file that is not JSON produces exactly one captured log record and an empty document list."""
    markdown_path = Path(tmpdir) / "test_file.md"
    markdown_path.write_text("This is not a JSON file.", encoding="utf-8")
    json_converter = JSONConverter(".laureates | .motivation")

    # Drop anything captured earlier so only records from this run are counted.
    caplog.clear()
    with caplog.at_level(logging.WARNING):
        output = json_converter.run(sources=[markdown_path])

    expected_message = f"Failed to extract text from {markdown_path}. Skipping it. Error: parse error: Invalid numeric literal at line 1, column 5"
    captured = caplog.records
    assert len(captured) == 1
    assert captured[0].msg == expected_message
    assert output == {"documents": []}


def test_run_with_bad_filter(tmpdir, caplog):
    """A jq filter that indexes an array with a string is logged once and yields no documents."""
    json_path = Path(tmpdir) / "test_file.json"
    json_path.write_text(json.dumps(test_data[0]), encoding="utf-8")
    # `.laureates` is a list in the sample data, so `.motivation` cannot be applied to it.
    json_converter = JSONConverter(".laureates | .motivation")

    caplog.clear()
    with caplog.at_level(logging.WARNING):
        output = json_converter.run(sources=[json_path])

    expected_message = (
        f'Failed to extract text from {json_path}. Skipping it. Error: Cannot index array with string "motivation"'
    )
    captured = caplog.records
    assert len(captured) == 1
    assert captured[0].msg == expected_message
    assert output == {"documents": []}


def test_run_with_single_meta(tmpdir):
    """A single meta dict passed to `run` is expected on every produced document."""
    path_a = Path(tmpdir) / "first_test_file.json"
    path_b = Path(tmpdir) / "second_test_file.json"
    path_a.write_text(json.dumps(test_data[0]), encoding="utf-8")
    path_b.write_text(json.dumps(test_data[1]), encoding="utf-8")
    stream = ByteStream.from_string(json.dumps(test_data[2]))

    jq_filter = '.laureates[] | .firstname + " " + .surname + " " + .motivation'
    json_converter = JSONConverter(jq_schema=jq_filter)
    output = json_converter.run(
        sources=[str(path_a), path_b, stream], meta={"creation_date": "1945-05-25T00:00:00"}
    )

    assert len(output) == 1
    documents = output["documents"]
    assert len(documents) == 4

    # (content, meta) pairs in source order; only file-based documents have a "file_path".
    expected = [
        (
            "Dario Fo who emulates the jesters of the Middle Ages in scourging authority and "
            "upholding the dignity of the downtrodden",
            {"file_path": str(path_a), "creation_date": "1945-05-25T00:00:00"},
        ),
        (
            "Stanley Cohen for their discoveries of growth factors",
            {"file_path": str(path_b), "creation_date": "1945-05-25T00:00:00"},
        ),
        (
            "Rita Levi-Montalcini for their discoveries of growth factors",
            {"file_path": str(path_b), "creation_date": "1945-05-25T00:00:00"},
        ),
        (
            "Enrico Fermi for his demonstrations of the existence of new "
            "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
            "reactions brought about by slow neutrons",
            {"creation_date": "1945-05-25T00:00:00"},
        ),
    ]
    for document, (content, meta) in zip(documents, expected):
        assert document.content == content
        assert document.meta == meta


def test_run_with_meta_list(tmpdir):
    """With one meta dict per source, each document is expected to carry the meta of its own source."""
    path_a = Path(tmpdir) / "first_test_file.json"
    path_b = Path(tmpdir) / "second_test_file.json"
    path_a.write_text(json.dumps(test_data[0]), encoding="utf-8")
    path_b.write_text(json.dumps(test_data[1]), encoding="utf-8")
    stream = ByteStream.from_string(json.dumps(test_data[2]))

    per_source_meta = [
        {"creation_date": "1945-05-25T00:00:00"},
        {"creation_date": "1943-09-03T00:00:00"},
        {"creation_date": "1989-11-09T00:00:00"},
    ]
    jq_filter = '.laureates[] | .firstname + " " + .surname + " " + .motivation'
    json_converter = JSONConverter(jq_schema=jq_filter)
    output = json_converter.run(sources=[str(path_a), path_b, stream], meta=per_source_meta)

    assert len(output) == 1
    documents = output["documents"]
    assert len(documents) == 4

    # (content, meta) pairs in source order; both laureates of the second
    # file are expected to share that file's creation date.
    expected = [
        (
            "Dario Fo who emulates the jesters of the Middle Ages in scourging authority and "
            "upholding the dignity of the downtrodden",
            {"file_path": str(path_a), "creation_date": "1945-05-25T00:00:00"},
        ),
        (
            "Stanley Cohen for their discoveries of growth factors",
            {"file_path": str(path_b), "creation_date": "1943-09-03T00:00:00"},
        ),
        (
            "Rita Levi-Montalcini for their discoveries of growth factors",
            {"file_path": str(path_b), "creation_date": "1943-09-03T00:00:00"},
        ),
        (
            "Enrico Fermi for his demonstrations of the existence of new "
            "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
            "reactions brought about by slow neutrons",
            {"creation_date": "1989-11-09T00:00:00"},
        ),
    ]
    for document, (content, meta) in zip(documents, expected):
        assert document.content == content
        assert document.meta == meta


def test_run_with_meta_list_of_differing_length(tmpdir):
    """Passing two meta dicts for a single source must raise a ValueError."""
    sources = ["random_file.json"]
    meta = [{}, {}]
    converter = JSONConverter(jq_schema=".")

    # `match` is treated as a regular expression (re.search); the trailing
    # period is escaped so it only matches a literal ".".
    expected_message = r"The length of the metadata list must match the number of sources\."
    with pytest.raises(ValueError, match=expected_message):
        converter.run(sources=sources, meta=meta)


def test_run_with_jq_schema_and_content_key(tmpdir):
    """With `jq_schema` and `content_key`, each laureate's "motivation" is expected as document content."""
    path_a = Path(tmpdir) / "first_test_file.json"
    path_b = Path(tmpdir) / "second_test_file.json"
    path_a.write_text(json.dumps(test_data[0]), encoding="utf-8")
    path_b.write_text(json.dumps(test_data[1]), encoding="utf-8")
    stream = ByteStream.from_string(json.dumps(test_data[2]))

    json_converter = JSONConverter(jq_schema=".laureates[]", content_key="motivation")
    output = json_converter.run(sources=[str(path_a), path_b, stream])

    assert len(output) == 1
    documents = output["documents"]
    assert len(documents) == 4

    # (content, meta) pairs in source order; the ByteStream document has empty meta.
    expected = [
        (
            "who emulates the jesters of the Middle Ages in scourging authority and "
            "upholding the dignity of the downtrodden",
            {"file_path": str(path_a)},
        ),
        ("for their discoveries of growth factors", {"file_path": str(path_b)}),
        ("for their discoveries of growth factors", {"file_path": str(path_b)}),
        (
            "for his demonstrations of the existence of new "
            "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
            "reactions brought about by slow neutrons",
            {},
        ),
    ]
    for document, (content, meta) in zip(documents, expected):
        assert document.content == content
        assert document.meta == meta


def test_run_with_jq_schema_content_key_and_extra_meta_fields(tmpdir):
    """The fields named in `extra_meta_fields` are expected in each document's meta next to `file_path`."""
    path_a = Path(tmpdir) / "first_test_file.json"
    path_b = Path(tmpdir) / "second_test_file.json"
    path_a.write_text(json.dumps(test_data[0]), encoding="utf-8")
    path_b.write_text(json.dumps(test_data[1]), encoding="utf-8")
    stream = ByteStream.from_string(json.dumps(test_data[2]))

    json_converter = JSONConverter(
        jq_schema=".laureates[]", content_key="motivation", extra_meta_fields={"firstname", "surname"}
    )
    output = json_converter.run(sources=[str(path_a), path_b, stream])

    assert len(output) == 1
    documents = output["documents"]
    assert len(documents) == 4

    # (content, meta) pairs in source order.
    expected = [
        (
            "who emulates the jesters of the Middle Ages in scourging authority and "
            "upholding the dignity of the downtrodden",
            {"file_path": str(path_a), "firstname": "Dario", "surname": "Fo"},
        ),
        (
            "for their discoveries of growth factors",
            {"file_path": str(path_b), "firstname": "Stanley", "surname": "Cohen"},
        ),
        (
            "for their discoveries of growth factors",
            {"file_path": str(path_b), "firstname": "Rita", "surname": "Levi-Montalcini"},
        ),
        (
            "for his demonstrations of the existence of new "
            "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
            "reactions brought about by slow neutrons",
            {"firstname": "Enrico", "surname": "Fermi"},
        ),
    ]
    for document, (content, meta) in zip(documents, expected):
        assert document.content == content
        assert document.meta == meta


def test_run_with_content_key(tmpdir):
    """With only `content_key`, one document per source is expected, holding that source's "category"."""
    path_a = Path(tmpdir) / "first_test_file.json"
    path_b = Path(tmpdir) / "second_test_file.json"
    path_a.write_text(json.dumps(test_data[0]), encoding="utf-8")
    path_b.write_text(json.dumps(test_data[1]), encoding="utf-8")
    stream = ByteStream.from_string(json.dumps(test_data[2]))

    json_converter = JSONConverter(content_key="category")
    output = json_converter.run(sources=[str(path_a), path_b, stream])

    assert len(output) == 1
    documents = output["documents"]
    assert len(documents) == 3

    # (content, meta) pairs in source order; the ByteStream document has empty meta.
    expected = [
        ("literature", {"file_path": str(path_a)}),
        ("medicine", {"file_path": str(path_b)}),
        ("physics", {}),
    ]
    for document, (content, meta) in zip(documents, expected):
        assert document.content == content
        assert document.meta == meta


def test_run_with_content_key_and_extra_meta_fields(tmpdir):
    """With `content_key` and `extra_meta_fields={"year"}`, each document's meta is expected to include the year."""
    path_a = Path(tmpdir) / "first_test_file.json"
    path_b = Path(tmpdir) / "second_test_file.json"
    path_a.write_text(json.dumps(test_data[0]), encoding="utf-8")
    path_b.write_text(json.dumps(test_data[1]), encoding="utf-8")
    stream = ByteStream.from_string(json.dumps(test_data[2]))

    json_converter = JSONConverter(content_key="category", extra_meta_fields={"year"})
    output = json_converter.run(sources=[str(path_a), path_b, stream])

    assert len(output) == 1
    documents = output["documents"]
    assert len(documents) == 3

    # (content, meta) pairs in source order.
    expected = [
        ("literature", {"file_path": str(path_a), "year": "1997"}),
        ("medicine", {"file_path": str(path_b), "year": "1986"}),
        ("physics", {"year": "1938"}),
    ]
    for document, (content, meta) in zip(documents, expected):
        assert document.content == content
        assert document.meta == meta


def test_run_with_jq_schema_content_key_and_extra_meta_fields_literal(tmpdir):
    """With `extra_meta_fields="*"`, every laureate field other than the content key is expected in meta."""
    path_a = Path(tmpdir) / "first_test_file.json"
    path_b = Path(tmpdir) / "second_test_file.json"
    path_a.write_text(json.dumps(test_data[0]), encoding="utf-8")
    path_b.write_text(json.dumps(test_data[1]), encoding="utf-8")
    stream = ByteStream.from_string(json.dumps(test_data[2]))

    json_converter = JSONConverter(jq_schema=".laureates[]", content_key="motivation", extra_meta_fields="*")
    output = json_converter.run(sources=[str(path_a), path_b, stream])

    assert len(output) == 1
    documents = output["documents"]
    assert len(documents) == 4

    # (content, meta) pairs in source order; "motivation" is the content and
    # therefore does not appear among the expected meta keys.
    expected = [
        (
            "who emulates the jesters of the Middle Ages in scourging authority and upholding the dignity of the downtrodden",
            {"file_path": str(path_a), "id": "674", "firstname": "Dario", "surname": "Fo", "share": "1"},
        ),
        (
            "for their discoveries of growth factors",
            {"file_path": str(path_b), "id": "434", "firstname": "Stanley", "surname": "Cohen", "share": "2"},
        ),
        (
            "for their discoveries of growth factors",
            {
                "file_path": str(path_b),
                "id": "435",
                "firstname": "Rita",
                "surname": "Levi-Montalcini",
                "share": "2",
            },
        ),
        (
            "for his demonstrations of the existence of new radioactive elements produced by neutron irradiation, "
            "and for his related discovery of nuclear reactions brought about by slow neutrons",
            {"id": "46", "firstname": "Enrico", "surname": "Fermi", "share": "1"},
        ),
    ]
    for document, (content, meta) in zip(documents, expected):
        assert document.content == content
        assert document.meta == meta
feat: Add `JSONConverter` Component (#8397) * Add JSONConverter Component * Handle some corner cases * Add JSONConverter to pydoc config * Add a way to extract all non content fields as metadata * Small fix in docstring * Fix tests * docstrings upd * Update json.py --------- Co-authored-by: Daria Fokina <daria.fokina@deepset.ai> 2024-09-25 12:34:51 +02:00			`# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>`
			`#`
			`# SPDX-License-Identifier: Apache-2.0`

			`import json`
			`from unittest.mock import patch`
			`from pathlib import Path`
			`import logging`

			`import pytest`

			`from haystack.components.converters import JSONConverter`
			`from haystack.dataclasses import ByteStream`


			`test_data = [`
			`{`
			`"year": "1997",`
			`"category": "literature",`
			`"laureates": [`
			`{`
			`"id": "674",`
			`"firstname": "Dario",`
			`"surname": "Fo",`
			`"motivation": "who emulates the jesters of the Middle Ages in scourging authority and upholding the dignity of the downtrodden",`
			`"share": "1",`
			`}`
			`],`
			`},`
			`{`
			`"year": "1986",`
			`"category": "medicine",`
			`"laureates": [`
			`{`
			`"id": "434",`
			`"firstname": "Stanley",`
			`"surname": "Cohen",`
			`"motivation": "for their discoveries of growth factors",`
			`"share": "2",`
			`},`
			`{`
			`"id": "435",`
			`"firstname": "Rita",`
			`"surname": "Levi-Montalcini",`
			`"motivation": "for their discoveries of growth factors",`
			`"share": "2",`
			`},`
			`],`
			`},`
			`{`
			`"year": "1938",`
			`"category": "physics",`
			`"laureates": [`
			`{`
			`"id": "46",`
			`"firstname": "Enrico",`
			`"surname": "Fermi",`
			`"motivation": "for his demonstrations of the existence of new radioactive elements produced by neutron irradiation, and for his related discovery of nuclear reactions brought about by slow neutrons",`
			`"share": "1",`
			`}`
			`],`
			`},`
			`]`


			`def test_init_without_jq_schema_and_content_key():`
			`with pytest.raises(`
			ValueError, match="No `jq_schema` nor `content_key` specified. Set either or both to extract data."
			`):`
			`JSONConverter()`


			`@patch("haystack.components.converters.json.jq_import")`
			`def test_init_without_jq_schema_and_missing_dependency(jq_import):`
			`converter = JSONConverter(content_key="foo")`
			`jq_import.check.assert_not_called()`
			`assert converter._jq_schema is None`
			`assert converter._content_key == "foo"`
			`assert converter._meta_fields is None`


			`@patch("haystack.components.converters.json.jq_import")`
			`def test_init_with_jq_schema_and_missing_dependency(jq_import):`
			`jq_import.check.side_effect = ImportError`
			`with pytest.raises(ImportError):`
			`JSONConverter(jq_schema=".laureates[].motivation")`


			`def test_init_with_jq_schema():`
			`converter = JSONConverter(jq_schema=".")`
			`assert converter._jq_schema == "."`
			`assert converter._content_key is None`
			`assert converter._meta_fields is None`


			`def test_to_dict():`
			`converter = JSONConverter(`
			`jq_schema=".laureates[]", content_key="motivation", extra_meta_fields={"firstname", "surname"}`
			`)`

			`assert converter.to_dict() == {`
			`"type": "haystack.components.converters.json.JSONConverter",`
			`"init_parameters": {`
			`"content_key": "motivation",`
			`"jq_schema": ".laureates[]",`
			`"extra_meta_fields": {"firstname", "surname"},`
			`},`
			`}`


			`def test_from_dict():`
			`data = {`
			`"type": "haystack.components.converters.json.JSONConverter",`
			`"init_parameters": {`
			`"content_key": "motivation",`
			`"jq_schema": ".laureates[]",`
			`"extra_meta_fields": ["firstname", "surname"],`
			`},`
			`}`
			`converter = JSONConverter.from_dict(data)`

			`assert converter._jq_schema == ".laureates[]"`
			`assert converter._content_key == "motivation"`
			`assert converter._meta_fields == ["firstname", "surname"]`


			`def test_run(tmpdir):`
			`first_test_file = Path(tmpdir / "first_test_file.json")`
			`second_test_file = Path(tmpdir / "second_test_file.json")`

			`first_test_file.write_text(json.dumps(test_data[0]), "utf-8")`
			`second_test_file.write_text(json.dumps(test_data[1]), "utf-8")`
			`byte_stream = ByteStream.from_string(json.dumps(test_data[2]))`

			`sources = [str(first_test_file), second_test_file, byte_stream]`

			`converter = JSONConverter(jq_schema='.laureates[] \| .firstname + " " + .surname + " " + .motivation')`
			`result = converter.run(sources=sources)`
			`assert len(result) == 1`
			`assert len(result["documents"]) == 4`
			`assert (`
			`result["documents"][0].content`
			`== "Dario Fo who emulates the jesters of the Middle Ages in scourging authority and "`
			`"upholding the dignity of the downtrodden"`
			`)`
			`assert result["documents"][0].meta == {"file_path": str(first_test_file)}`
			`assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors"`
			`assert result["documents"][1].meta == {"file_path": str(second_test_file)}`
			`assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors"`
			`assert result["documents"][2].meta == {"file_path": str(second_test_file)}`
			`assert (`
			`result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new "`
			`"radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "`
			`"reactions brought about by slow neutrons"`
			`)`
			`assert result["documents"][3].meta == {}`


			`def test_run_with_non_json_file(tmpdir, caplog):`
			`test_file = Path(tmpdir / "test_file.md")`
			`test_file.write_text("This is not a JSON file.", "utf-8")`

			`sources = [test_file]`
			`converter = JSONConverter(".laureates \| .motivation")`

			`caplog.clear()`
			`with caplog.at_level(logging.WARNING):`
			`result = converter.run(sources=sources)`

			`records = caplog.records`
			`assert len(records) == 1`
			`assert (`
			`records[0].msg`
			`== f"Failed to extract text from {test_file}. Skipping it. Error: parse error: Invalid numeric literal at line 1, column 5"`
			`)`
			`assert result == {"documents": []}`


			`def test_run_with_bad_filter(tmpdir, caplog):`
			`test_file = Path(tmpdir / "test_file.json")`
			`test_file.write_text(json.dumps(test_data[0]), "utf-8")`

			`sources = [test_file]`
			`converter = JSONConverter(".laureates \| .motivation")`

			`caplog.clear()`
			`with caplog.at_level(logging.WARNING):`
			`result = converter.run(sources=sources)`

			`records = caplog.records`
			`assert len(records) == 1`
			`assert (`
			`records[0].msg`
			`== f'Failed to extract text from {test_file}. Skipping it. Error: Cannot index array with string "motivation"'`
			`)`
			`assert result == {"documents": []}`


			`def test_run_with_single_meta(tmpdir):`
			`first_test_file = Path(tmpdir / "first_test_file.json")`
			`second_test_file = Path(tmpdir / "second_test_file.json")`

			`first_test_file.write_text(json.dumps(test_data[0]), "utf-8")`
			`second_test_file.write_text(json.dumps(test_data[1]), "utf-8")`
			`byte_stream = ByteStream.from_string(json.dumps(test_data[2]))`

			`sources = [str(first_test_file), second_test_file, byte_stream]`
			`meta = {"creation_date": "1945-05-25T00:00:00"}`
			`converter = JSONConverter(jq_schema='.laureates[] \| .firstname + " " + .surname + " " + .motivation')`
			`result = converter.run(sources=sources, meta=meta)`
			`assert len(result) == 1`
			`assert len(result["documents"]) == 4`
			`assert (`
			`result["documents"][0].content`
			`== "Dario Fo who emulates the jesters of the Middle Ages in scourging authority and "`
			`"upholding the dignity of the downtrodden"`
			`)`
			`assert result["documents"][0].meta == {"file_path": str(first_test_file), "creation_date": "1945-05-25T00:00:00"}`
			`assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors"`
			`assert result["documents"][1].meta == {"file_path": str(second_test_file), "creation_date": "1945-05-25T00:00:00"}`
			`assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors"`
			`assert result["documents"][2].meta == {"file_path": str(second_test_file), "creation_date": "1945-05-25T00:00:00"}`
			`assert (`
			`result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new "`
			`"radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "`
			`"reactions brought about by slow neutrons"`
			`)`
			`assert result["documents"][3].meta == {"creation_date": "1945-05-25T00:00:00"}`


			`def test_run_with_meta_list(tmpdir):`
			`first_test_file = Path(tmpdir / "first_test_file.json")`
			`second_test_file = Path(tmpdir / "second_test_file.json")`

			`first_test_file.write_text(json.dumps(test_data[0]), "utf-8")`
			`second_test_file.write_text(json.dumps(test_data[1]), "utf-8")`
			`byte_stream = ByteStream.from_string(json.dumps(test_data[2]))`

			`sources = [str(first_test_file), second_test_file, byte_stream]`
			`meta = [`
			`{"creation_date": "1945-05-25T00:00:00"},`
			`{"creation_date": "1943-09-03T00:00:00"},`
			`{"creation_date": "1989-11-09T00:00:00"},`
			`]`
			`converter = JSONConverter(jq_schema='.laureates[] \| .firstname + " " + .surname + " " + .motivation')`
			`result = converter.run(sources=sources, meta=meta)`
			`assert len(result) == 1`
			`assert len(result["documents"]) == 4`
			`assert (`
			`result["documents"][0].content`
			`== "Dario Fo who emulates the jesters of the Middle Ages in scourging authority and "`
			`"upholding the dignity of the downtrodden"`
			`)`
			`assert result["documents"][0].meta == {"file_path": str(first_test_file), "creation_date": "1945-05-25T00:00:00"}`
			`assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors"`
			`assert result["documents"][1].meta == {"file_path": str(second_test_file), "creation_date": "1943-09-03T00:00:00"}`
			`assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors"`
			`assert result["documents"][2].meta == {"file_path": str(second_test_file), "creation_date": "1943-09-03T00:00:00"}`
			`assert (`
			`result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new "`
			`"radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "`
			`"reactions brought about by slow neutrons"`
			`)`
			`assert result["documents"][3].meta == {"creation_date": "1989-11-09T00:00:00"}`


			`def test_run_with_meta_list_of_differing_length(tmpdir):`
			`sources = ["random_file.json"]`

			`meta = [{}, {}]`
			`converter = JSONConverter(jq_schema=".")`
			`with pytest.raises(ValueError, match="The length of the metadata list must match the number of sources."):`
			`converter.run(sources=sources, meta=meta)`


			`def test_run_with_jq_schema_and_content_key(tmpdir):`
			`first_test_file = Path(tmpdir / "first_test_file.json")`
			`second_test_file = Path(tmpdir / "second_test_file.json")`

			`first_test_file.write_text(json.dumps(test_data[0]), "utf-8")`
			`second_test_file.write_text(json.dumps(test_data[1]), "utf-8")`
			`byte_stream = ByteStream.from_string(json.dumps(test_data[2]))`

			`sources = [str(first_test_file), second_test_file, byte_stream]`
			`converter = JSONConverter(jq_schema=".laureates[]", content_key="motivation")`
			`result = converter.run(sources=sources)`
			`assert len(result) == 1`
			`assert len(result["documents"]) == 4`
			`assert (`
			`result["documents"][0].content == "who emulates the jesters of the Middle Ages in scourging authority and "`
			`"upholding the dignity of the downtrodden"`
			`)`
			`assert result["documents"][0].meta == {"file_path": str(first_test_file)}`
			`assert result["documents"][1].content == "for their discoveries of growth factors"`
			`assert result["documents"][1].meta == {"file_path": str(second_test_file)}`
			`assert result["documents"][2].content == "for their discoveries of growth factors"`
			`assert result["documents"][2].meta == {"file_path": str(second_test_file)}`
			`assert (`
			`result["documents"][3].content == "for his demonstrations of the existence of new "`
			`"radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "`
			`"reactions brought about by slow neutrons"`
			`)`
			`assert result["documents"][3].meta == {}`


			def test_run_with_jq_schema_content_key_and_extra_meta_fields(tmpdir):
			    """Check `jq_schema` + `content_key` conversion with two named extra meta fields.

			    Two JSON files (one passed as `str`, one as `Path`) and one `ByteStream` are converted.
			    Each laureate is expected to yield one document whose content is its "motivation" and
			    whose meta holds "firstname" and "surname"; only the file-based documents are expected
			    to carry a "file_path" entry.
			    """
			    path_one = Path(tmpdir / "first_test_file.json")
			    path_two = Path(tmpdir / "second_test_file.json")
			    path_one.write_text(json.dumps(test_data[0]), "utf-8")
			    path_two.write_text(json.dumps(test_data[1]), "utf-8")
			    stream = ByteStream.from_string(json.dumps(test_data[2]))

			    # Mix the three supported source kinds: str path, Path object and in-memory stream.
			    mixed_sources = [str(path_one), path_two, stream]
			    json_converter = JSONConverter(
			        jq_schema=".laureates[]", content_key="motivation", extra_meta_fields={"firstname", "surname"}
			    )
			    output = json_converter.run(sources=mixed_sources)

			    # One (content, meta) pair per expected document, in source order.
			    expected_documents = [
			        (
			            "who emulates the jesters of the Middle Ages in scourging authority and "
			            "upholding the dignity of the downtrodden",
			            {"file_path": str(path_one), "firstname": "Dario", "surname": "Fo"},
			        ),
			        (
			            "for their discoveries of growth factors",
			            {"file_path": str(path_two), "firstname": "Stanley", "surname": "Cohen"},
			        ),
			        (
			            "for their discoveries of growth factors",
			            {"file_path": str(path_two), "firstname": "Rita", "surname": "Levi-Montalcini"},
			        ),
			        (
			            "for his demonstrations of the existence of new "
			            "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
			            "reactions brought about by slow neutrons",
			            {"firstname": "Enrico", "surname": "Fermi"},
			        ),
			    ]

			    assert len(output) == 1
			    produced = output["documents"]
			    assert len(produced) == 4
			    for document, (wanted_content, wanted_meta) in zip(produced, expected_documents):
			        assert document.content == wanted_content
			        assert document.meta == wanted_meta


			def test_run_with_content_key(tmpdir):
			    """Check conversion driven by `content_key` alone.

			    With `content_key="category"` and no jq schema, each of the three sources is expected to
			    produce exactly one document whose content is the record's category. The two file-based
			    documents are expected to expose "file_path" in their meta; the `ByteStream` one is
			    expected to have empty meta.
			    """
			    json_path_one = Path(tmpdir / "first_test_file.json")
			    json_path_two = Path(tmpdir / "second_test_file.json")
			    json_path_one.write_text(json.dumps(test_data[0]), "utf-8")
			    json_path_two.write_text(json.dumps(test_data[1]), "utf-8")
			    in_memory = ByteStream.from_string(json.dumps(test_data[2]))

			    all_sources = [str(json_path_one), json_path_two, in_memory]
			    output = JSONConverter(content_key="category").run(sources=all_sources)

			    assert len(output) == 1
			    docs = output["documents"]
			    assert len(docs) == 3
			    # Compare all contents, then all metas, in source order.
			    assert [doc.content for doc in docs] == ["literature", "medicine", "physics"]
			    assert [doc.meta for doc in docs] == [
			        {"file_path": str(json_path_one)},
			        {"file_path": str(json_path_two)},
			        {},
			    ]


			def test_run_with_content_key_and_extra_meta_fields(tmpdir):
			    """Check `content_key` conversion when "year" is requested as an extra meta field.

			    Each source is expected to yield one document with the record's category as content and
			    its "year" copied into meta. "file_path" is expected only for the two file-based sources.
			    """
			    first_path = Path(tmpdir / "first_test_file.json")
			    second_path = Path(tmpdir / "second_test_file.json")

			    # Persist the first two records to disk; the third one stays in memory.
			    for target, record in ((first_path, test_data[0]), (second_path, test_data[1])):
			        target.write_text(json.dumps(record), "utf-8")
			    memory_source = ByteStream.from_string(json.dumps(test_data[2]))

			    inputs = [str(first_path), second_path, memory_source]
			    json_converter = JSONConverter(content_key="category", extra_meta_fields={"year"})
			    outcome = json_converter.run(sources=inputs)

			    assert len(outcome) == 1
			    produced = outcome["documents"]
			    assert len(produced) == 3

			    observed = [(doc.content, doc.meta) for doc in produced]
			    assert observed == [
			        ("literature", {"file_path": str(first_path), "year": "1997"}),
			        ("medicine", {"file_path": str(second_path), "year": "1986"}),
			        ("physics", {"year": "1938"}),
			    ]


			def test_run_with_jq_schema_content_key_and_extra_meta_fields_literal(tmpdir):
			    """Check `jq_schema` + `content_key` conversion with `extra_meta_fields="*"`.

			    With the literal "*", every laureate field other than the content key ("motivation") is
			    expected to show up in the document meta: "id", "firstname", "surname" and "share".
			    "file_path" is expected only for documents that come from files on disk.
			    """
			    file_one = Path(tmpdir / "first_test_file.json")
			    file_two = Path(tmpdir / "second_test_file.json")
			    file_one.write_text(json.dumps(test_data[0]), "utf-8")
			    file_two.write_text(json.dumps(test_data[1]), "utf-8")
			    stream = ByteStream.from_string(json.dumps(test_data[2]))

			    json_converter = JSONConverter(jq_schema=".laureates[]", content_key="motivation", extra_meta_fields="*")
			    output = json_converter.run(sources=[str(file_one), file_two, stream])

			    assert len(output) == 1
			    assert len(output["documents"]) == 4
			    # Name each document after the laureate it is expected to describe.
			    doc_fo, doc_cohen, doc_levi_montalcini, doc_fermi = output["documents"]

			    growth_factors = "for their discoveries of growth factors"

			    assert doc_fo.content == (
			        "who emulates the jesters of the Middle Ages in scourging authority and upholding the dignity of the downtrodden"
			    )
			    assert doc_fo.meta == {
			        "file_path": str(file_one),
			        "id": "674",
			        "firstname": "Dario",
			        "surname": "Fo",
			        "share": "1",
			    }

			    assert doc_cohen.content == growth_factors
			    assert doc_cohen.meta == {
			        "file_path": str(file_two),
			        "id": "434",
			        "firstname": "Stanley",
			        "surname": "Cohen",
			        "share": "2",
			    }

			    assert doc_levi_montalcini.content == growth_factors
			    assert doc_levi_montalcini.meta == {
			        "file_path": str(file_two),
			        "id": "435",
			        "firstname": "Rita",
			        "surname": "Levi-Montalcini",
			        "share": "2",
			    }

			    assert doc_fermi.content == (
			        "for his demonstrations of the existence of new radioactive elements produced by neutron irradiation, "
			        "and for his related discovery of nuclear reactions brought about by slow neutrons"
			    )
			    assert doc_fermi.meta == {"id": "46", "firstname": "Enrico", "surname": "Fermi", "share": "1"}