# SPDX-FileCopyrightText: 2022-present deepset GmbH
#
# SPDX-License-Identifier: Apache-2.0

import json
import logging
import os
from pathlib import Path
from unittest.mock import patch

import pytest

from haystack.components.converters import JSONConverter
from haystack.dataclasses import ByteStream

# Three JSON objects shared by every test: one is written to each of two temporary
# files and the third is wrapped in an in-memory ByteStream (see `_write_sources`).
test_data = [
    {
        "year": "1997",
        "category": "literature",
        "laureates": [
            {
                "id": "674",
                "firstname": "Dario",
                "surname": "Fokin",
                "motivation": "who emulates the jesters of the Middle Ages in scourging authority and upholding the "
                "dignity of the downtrodden",
                "share": "1",
            }
        ],
    },
    {
        "year": "1986",
        "category": "medicine",
        "laureates": [
            {
                "id": "434",
                "firstname": "Stanley",
                "surname": "Cohen",
                "motivation": "for their discoveries of growth factors",
                "share": "2",
            },
            {
                "id": "435",
                "firstname": "Rita",
                "surname": "Levi-Montalcini",
                "motivation": "for their discoveries of growth factors",
                "share": "2",
            },
        ],
    },
    {
        "year": "1938",
        "category": "physics",
        "laureates": [
            {
                "id": "46",
                "firstname": "Enrico",
                "surname": "Fermi",
                "motivation": "for his demonstrations of the existence of new radioactive elements produced by neutron "
                "irradiation, and for his related discovery of nuclear reactions brought about by slow "
                "neutrons",
                "share": "1",
            }
        ],
    },
]


def _write_sources(tmpdir):
    """
    Materialise `test_data` as the three heterogeneous sources used by the run tests.

    `test_data[0]` and `test_data[1]` are dumped to JSON files inside `tmpdir`;
    `test_data[2]` is wrapped in a `ByteStream` built from a string.

    :param tmpdir: The pytest temporary directory the two JSON files are written to.
    :returns: A tuple `(first_test_file, second_test_file, sources)` where `sources` is
        `[str(first_test_file), second_test_file, byte_stream]`, i.e. one source given as
        a `str` path, one as a `Path` and one as a `ByteStream`.
    """
    first_test_file = Path(tmpdir / "first_test_file.json")
    second_test_file = Path(tmpdir / "second_test_file.json")
    first_test_file.write_text(json.dumps(test_data[0]), "utf-8")
    second_test_file.write_text(json.dumps(test_data[1]), "utf-8")
    byte_stream = ByteStream.from_string(json.dumps(test_data[2]))
    return first_test_file, second_test_file, [str(first_test_file), second_test_file, byte_stream]


def test_init_without_jq_schema_and_content_key():
    """
    Test that constructing the converter with neither `jq_schema` nor `content_key` raises a ValueError
    """
    with pytest.raises(
        ValueError, match="No `jq_schema` nor `content_key` specified. Set either or both to extract data."
    ):
        JSONConverter()


@patch("haystack.components.converters.json.jq_import")
def test_init_without_jq_schema_and_missing_dependency(jq_import):
    """
    Test that the jq dependency check is not invoked when only `content_key` is given
    """
    converter = JSONConverter(content_key="foo")
    jq_import.check.assert_not_called()
    assert converter._jq_schema is None
    assert converter._content_key == "foo"
    assert converter._meta_fields is None


@patch("haystack.components.converters.json.jq_import")
def test_init_with_jq_schema_and_missing_dependency(jq_import):
    """
    Test that an ImportError raised by the jq dependency check propagates when `jq_schema` is given
    """
    jq_import.check.side_effect = ImportError
    with pytest.raises(ImportError):
        JSONConverter(jq_schema=".laureates[].motivation")


def test_init_with_jq_schema():
    """
    Test the attributes stored on the converter when only `jq_schema` is given
    """
    converter = JSONConverter(jq_schema=".")
    assert converter._jq_schema == "."
    assert converter._content_key is None
    assert converter._meta_fields is None


def test_to_dict():
    """
    Test that `to_dict` serialises all init parameters, including the `store_full_path` value of False
    """
    converter = JSONConverter(
        jq_schema=".laureates[]", content_key="motivation", extra_meta_fields={"firstname", "surname"}
    )

    assert converter.to_dict() == {
        "type": "haystack.components.converters.json.JSONConverter",
        "init_parameters": {
            "content_key": "motivation",
            "jq_schema": ".laureates[]",
            "extra_meta_fields": {"firstname", "surname"},
            "store_full_path": False,
        },
    }


def test_from_dict():
    """
    Test that `from_dict` restores the jq schema, content key and extra meta fields
    """
    data = {
        "type": "haystack.components.converters.json.JSONConverter",
        "init_parameters": {
            "content_key": "motivation",
            "jq_schema": ".laureates[]",
            "extra_meta_fields": ["firstname", "surname"],
            "store_full_path": True,
        },
    }
    converter = JSONConverter.from_dict(data)

    assert converter._jq_schema == ".laureates[]"
    assert converter._content_key == "motivation"
    assert converter._meta_fields == ["firstname", "surname"]


def test_run(tmpdir):
    """
    Test a jq filter that yields one string per laureate across file and ByteStream sources
    """
    first_test_file, second_test_file, sources = _write_sources(tmpdir)

    converter = JSONConverter(jq_schema='.laureates[] | .firstname + " " + .surname + " " + .motivation')
    result = converter.run(sources=sources)
    assert len(result) == 1
    assert len(result["documents"]) == 4
    assert (
        result["documents"][0].content
        == "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and "
        "upholding the dignity of the downtrodden"
    )
    assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file)}
    assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors"
    assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file)}
    assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors"
    assert result["documents"][2].meta == {"file_path": os.path.basename(second_test_file)}
    assert (
        result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new "
        "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
        "reactions brought about by slow neutrons"
    )
    # The ByteStream source was built from a string, so no file_path is expected.
    assert result["documents"][3].meta == {}


def test_run_with_store_full_path_false(tmpdir):
    """
    Test if the component runs correctly with store_full_path=False
    """
    _, _, sources = _write_sources(tmpdir)

    converter = JSONConverter(
        jq_schema='.laureates[] | .firstname + " " + .surname + " " + .motivation', store_full_path=False
    )
    result = converter.run(sources=sources)
    assert len(result) == 1
    assert len(result["documents"]) == 4
    assert (
        result["documents"][0].content
        == "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and "
        "upholding the dignity of the downtrodden"
    )
    assert result["documents"][0].meta == {"file_path": "first_test_file.json"}
    assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors"
    assert result["documents"][1].meta == {"file_path": "second_test_file.json"}
    assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors"
    assert result["documents"][2].meta == {"file_path": "second_test_file.json"}
    assert (
        result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new "
        "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
        "reactions brought about by slow neutrons"
    )
    assert result["documents"][3].meta == {}


def test_run_with_non_json_file(tmpdir, caplog):
    """
    Test that a source which is not valid JSON is skipped with exactly one warning
    """
    test_file = Path(tmpdir / "test_file.md")
    test_file.write_text("This is not a JSON file.", "utf-8")

    sources = [test_file]
    converter = JSONConverter(".laureates | .motivation")

    caplog.clear()
    with caplog.at_level(logging.WARNING):
        result = converter.run(sources=sources)

    records = caplog.records
    assert len(records) == 1
    assert (
        records[0].msg
        == f"Failed to extract text from {test_file}. Skipping it. Error: parse error: Invalid numeric literal at "
        "line 1, column 5"
    )
    assert result == {"documents": []}


def test_run_with_bad_filter(tmpdir, caplog):
    """
    Test that a jq filter which cannot be applied to the data is reported as a warning and yields no documents
    """
    test_file = Path(tmpdir / "test_file.json")
    test_file.write_text(json.dumps(test_data[0]), "utf-8")

    sources = [test_file]
    converter = JSONConverter(".laureates | .motivation")

    caplog.clear()
    with caplog.at_level(logging.WARNING):
        result = converter.run(sources=sources)

    records = caplog.records
    assert len(records) == 1
    assert (
        records[0].msg
        == f'Failed to extract text from {test_file}. Skipping it. Error: Cannot index array with string "motivation"'
    )
    assert result == {"documents": []}


def test_run_with_bad_encoding(tmpdir, caplog):
    """
    Test that a file written as UTF-16 is skipped with a warning about the failed UTF-8 decoding
    """
    test_file = Path(tmpdir / "test_file.json")
    test_file.write_text(json.dumps(test_data[0]), "utf-16")

    sources = [test_file]
    converter = JSONConverter(".laureates")

    caplog.clear()
    with caplog.at_level(logging.WARNING):
        result = converter.run(sources=sources)

    records = caplog.records
    assert len(records) == 1
    # Only the prefix is compared; the remainder of the message is not pinned by this test.
    assert records[0].msg.startswith(
        f"Failed to extract text from {test_file}. Skipping it. Error: 'utf-8' codec can't decode byte"
    )
    assert result == {"documents": []}


def test_run_with_single_meta(tmpdir):
    """
    Test that a single meta dictionary is merged into the meta of every produced document
    """
    first_test_file, second_test_file, sources = _write_sources(tmpdir)

    meta = {"creation_date": "1945-05-25T00:00:00"}
    converter = JSONConverter(jq_schema='.laureates[] | .firstname + " " + .surname + " " + .motivation')
    result = converter.run(sources=sources, meta=meta)
    assert len(result) == 1
    assert len(result["documents"]) == 4
    assert (
        result["documents"][0].content
        == "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and "
        "upholding the dignity of the downtrodden"
    )
    assert result["documents"][0].meta == {
        "file_path": os.path.basename(first_test_file),
        "creation_date": "1945-05-25T00:00:00",
    }
    assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors"
    assert result["documents"][1].meta == {
        "file_path": os.path.basename(second_test_file),
        "creation_date": "1945-05-25T00:00:00",
    }
    assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors"
    assert result["documents"][2].meta == {
        "file_path": os.path.basename(second_test_file),
        "creation_date": "1945-05-25T00:00:00",
    }
    assert (
        result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new "
        "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
        "reactions brought about by slow neutrons"
    )
    assert result["documents"][3].meta == {"creation_date": "1945-05-25T00:00:00"}


def test_run_with_meta_list(tmpdir):
    """
    Test that a list of meta dictionaries is applied per source, so both documents of the second file share one
    """
    first_test_file, second_test_file, sources = _write_sources(tmpdir)

    meta = [
        {"creation_date": "1945-05-25T00:00:00"},
        {"creation_date": "1943-09-03T00:00:00"},
        {"creation_date": "1989-11-09T00:00:00"},
    ]
    converter = JSONConverter(jq_schema='.laureates[] | .firstname + " " + .surname + " " + .motivation')
    result = converter.run(sources=sources, meta=meta)
    assert len(result) == 1
    assert len(result["documents"]) == 4
    assert (
        result["documents"][0].content
        == "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and "
        "upholding the dignity of the downtrodden"
    )
    assert result["documents"][0].meta == {
        "file_path": os.path.basename(first_test_file),
        "creation_date": "1945-05-25T00:00:00",
    }
    assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors"
    assert result["documents"][1].meta == {
        "file_path": os.path.basename(second_test_file),
        "creation_date": "1943-09-03T00:00:00",
    }
    assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors"
    assert result["documents"][2].meta == {
        "file_path": os.path.basename(second_test_file),
        "creation_date": "1943-09-03T00:00:00",
    }
    assert (
        result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new "
        "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
        "reactions brought about by slow neutrons"
    )
    assert result["documents"][3].meta == {"creation_date": "1989-11-09T00:00:00"}


def test_run_with_meta_list_of_differing_length(tmpdir):
    """
    Test that a meta list whose length differs from the number of sources raises a ValueError
    """
    sources = ["random_file.json"]

    meta = [{}, {}]
    converter = JSONConverter(jq_schema=".")
    with pytest.raises(ValueError, match="The length of the metadata list must match the number of sources."):
        converter.run(sources=sources, meta=meta)


def test_run_with_jq_schema_and_content_key(tmpdir):
    """
    Test that `content_key` selects the document content from each object produced by the jq filter
    """
    first_test_file, second_test_file, sources = _write_sources(tmpdir)

    converter = JSONConverter(jq_schema=".laureates[]", content_key="motivation")
    result = converter.run(sources=sources)
    assert len(result) == 1
    assert len(result["documents"]) == 4
    assert (
        result["documents"][0].content == "who emulates the jesters of the Middle Ages in scourging authority and "
        "upholding the dignity of the downtrodden"
    )
    assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file)}
    assert result["documents"][1].content == "for their discoveries of growth factors"
    assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file)}
    assert result["documents"][2].content == "for their discoveries of growth factors"
    assert result["documents"][2].meta == {"file_path": os.path.basename(second_test_file)}
    assert (
        result["documents"][3].content == "for his demonstrations of the existence of new "
        "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
        "reactions brought about by slow neutrons"
    )
    assert result["documents"][3].meta == {}


def test_run_with_jq_schema_content_key_and_extra_meta_fields(tmpdir):
    """
    Test that the named `extra_meta_fields` are copied from each filtered object into the document meta
    """
    first_test_file, second_test_file, sources = _write_sources(tmpdir)

    converter = JSONConverter(
        jq_schema=".laureates[]", content_key="motivation", extra_meta_fields={"firstname", "surname"}
    )
    result = converter.run(sources=sources)
    assert len(result) == 1
    assert len(result["documents"]) == 4
    assert (
        result["documents"][0].content == "who emulates the jesters of the Middle Ages in scourging authority and "
        "upholding the dignity of the downtrodden"
    )
    assert result["documents"][0].meta == {
        "file_path": os.path.basename(first_test_file),
        "firstname": "Dario",
        "surname": "Fokin",
    }
    assert result["documents"][1].content == "for their discoveries of growth factors"
    assert result["documents"][1].meta == {
        "file_path": os.path.basename(second_test_file),
        "firstname": "Stanley",
        "surname": "Cohen",
    }
    assert result["documents"][2].content == "for their discoveries of growth factors"
    assert result["documents"][2].meta == {
        "file_path": os.path.basename(second_test_file),
        "firstname": "Rita",
        "surname": "Levi-Montalcini",
    }
    assert (
        result["documents"][3].content == "for his demonstrations of the existence of new "
        "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
        "reactions brought about by slow neutrons"
    )
    assert result["documents"][3].meta == {"firstname": "Enrico", "surname": "Fermi"}


def test_run_with_content_key(tmpdir):
    """
    Test that `content_key` alone reads the content from the top-level object, giving one document per source
    """
    first_test_file, second_test_file, sources = _write_sources(tmpdir)

    converter = JSONConverter(content_key="category")
    result = converter.run(sources=sources)
    assert len(result) == 1
    assert len(result["documents"]) == 3
    assert result["documents"][0].content == "literature"
    assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file)}
    assert result["documents"][1].content == "medicine"
    assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file)}
    assert result["documents"][2].content == "physics"
    assert result["documents"][2].meta == {}


def test_run_with_content_key_and_extra_meta_fields(tmpdir):
    """
    Test that `extra_meta_fields` are read from the top-level object when no jq filter is given
    """
    first_test_file, second_test_file, sources = _write_sources(tmpdir)

    converter = JSONConverter(content_key="category", extra_meta_fields={"year"})
    result = converter.run(sources=sources)
    assert len(result) == 1
    assert len(result["documents"]) == 3
    assert result["documents"][0].content == "literature"
    assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file), "year": "1997"}
    assert result["documents"][1].content == "medicine"
    assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file), "year": "1986"}
    assert result["documents"][2].content == "physics"
    assert result["documents"][2].meta == {"year": "1938"}


def test_run_with_jq_schema_content_key_and_extra_meta_fields_literal(tmpdir):
    """
    Test that `extra_meta_fields="*"` copies every field except the content key into the document meta
    """
    first_test_file, second_test_file, sources = _write_sources(tmpdir)

    converter = JSONConverter(jq_schema=".laureates[]", content_key="motivation", extra_meta_fields="*")
    result = converter.run(sources=sources)
    assert len(result) == 1
    assert len(result["documents"]) == 4
    assert (
        result["documents"][0].content
        == "who emulates the jesters of the Middle Ages in scourging authority and upholding the dignity of the "
        "downtrodden"
    )
    assert result["documents"][0].meta == {
        "file_path": os.path.basename(first_test_file),
        "id": "674",
        "firstname": "Dario",
        "surname": "Fokin",
        "share": "1",
    }
    assert result["documents"][1].content == "for their discoveries of growth factors"
    assert result["documents"][1].meta == {
        "file_path": os.path.basename(second_test_file),
        "id": "434",
        "firstname": "Stanley",
        "surname": "Cohen",
        "share": "2",
    }
    assert result["documents"][2].content == "for their discoveries of growth factors"
    assert result["documents"][2].meta == {
        "file_path": os.path.basename(second_test_file),
        "id": "435",
        "firstname": "Rita",
        "surname": "Levi-Montalcini",
        "share": "2",
    }
    assert (
        result["documents"][3].content
        == "for his demonstrations of the existence of new radioactive elements produced by neutron irradiation, "
        "and for his related discovery of nuclear reactions brought about by slow neutrons"
    )
    assert result["documents"][3].meta == {"id": "46", "firstname": "Enrico", "surname": "Fermi", "share": "1"}