# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
import json
import logging
import os
from pathlib import Path
from unittest.mock import patch

import pytest

from haystack.components.converters import JSONConverter
from haystack.dataclasses import ByteStream
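

# Sample Nobel Prize laureate records used as shared test data by the tests below.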
test_data = [
{
"year": "1997",
"category": "literature",
"laureates": [
{
"id": "674",
"firstname": "Dario",
"surname": "Fokin",
"motivation": "who emulates the jesters of the Middle Ages in scourging authority and upholding the dignity of the downtrodden",
"share": "1",
}
],
},
{
"year": "1986",
"category": "medicine",
"laureates": [
{
"id": "434",
"firstname": "Stanley",
"surname": "Cohen",
"motivation": "for their discoveries of growth factors",
"share": "2",
},
{
"id": "435",
"firstname": "Rita",
"surname": "Levi-Montalcini",
"motivation": "for their discoveries of growth factors",
"share": "2",
},
],
},
{
"year": "1938",
"category": "physics",
"laureates": [
{
"id": "46",
"firstname": "Enrico",
"surname": "Fermi",
"motivation": "for his demonstrations of the existence of new radioactive elements produced by neutron irradiation, and for his related discovery of nuclear reactions brought about by slow neutrons",
"share": "1",
}
],
},
]


def test_init_without_jq_schema_and_content_key():
with pytest.raises(
ValueError, match="No `jq_schema` nor `content_key` specified. Set either or both to extract data."
):
JSONConverter()


@patch("haystack.components.converters.json.jq_import")
def test_init_without_jq_schema_and_missing_dependency(jq_import):
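    """
    Test that the jq import check is not triggered when only a content_key is given.
    """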
converter = JSONConverter(content_key="foo")
jq_import.check.assert_not_called()
assert converter._jq_schema is None
assert converter._content_key == "foo"
assert converter._meta_fields is None


@patch("haystack.components.converters.json.jq_import")
def test_init_with_jq_schema_and_missing_dependency(jq_import):
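    """
    Test that an ImportError is raised when a jq_schema is given but jq is not installed.
    """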
jq_import.check.side_effect = ImportError
with pytest.raises(ImportError):
JSONConverter(jq_schema=".laureates[].motivation")


def test_init_with_jq_schema():
converter = JSONConverter(jq_schema=".")
assert converter._jq_schema == "."
assert converter._content_key is None
assert converter._meta_fields is None


def test_to_dict():
converter = JSONConverter(
jq_schema=".laureates[]", content_key="motivation", extra_meta_fields={"firstname", "surname"}
)
assert converter.to_dict() == {
"type": "haystack.components.converters.json.JSONConverter",
"init_parameters": {
"content_key": "motivation",
"jq_schema": ".laureates[]",
"extra_meta_fields": {"firstname", "surname"},
"store_full_path": False,
},
}


def test_from_dict():
data = {
"type": "haystack.components.converters.json.JSONConverter",
"init_parameters": {
"content_key": "motivation",
"jq_schema": ".laureates[]",
"extra_meta_fields": ["firstname", "surname"],
"store_full_path": True,
},
}
converter = JSONConverter.from_dict(data)
assert converter._jq_schema == ".laureates[]"
assert converter._content_key == "motivation"
assert converter._meta_fields == ["firstname", "surname"]


def test_run(tmpdir):
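    """
    Test running the converter on a mix of string paths, Path objects and a ByteStream with a jq filter.
    """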
first_test_file = Path(tmpdir / "first_test_file.json")
second_test_file = Path(tmpdir / "second_test_file.json")
first_test_file.write_text(json.dumps(test_data[0]), "utf-8")
second_test_file.write_text(json.dumps(test_data[1]), "utf-8")
byte_stream = ByteStream.from_string(json.dumps(test_data[2]))
sources = [str(first_test_file), second_test_file, byte_stream]
converter = JSONConverter(jq_schema='.laureates[] | .firstname + " " + .surname + " " + .motivation')
result = converter.run(sources=sources)
assert len(result) == 1
assert len(result["documents"]) == 4
assert (
result["documents"][0].content
== "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and "
"upholding the dignity of the downtrodden"
)
assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file)}
assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors"
assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file)}
assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors"
assert result["documents"][2].meta == {"file_path": os.path.basename(second_test_file)}
assert (
result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new "
"radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
"reactions brought about by slow neutrons"
)
assert result["documents"][3].meta == {}


def test_run_with_store_full_path_false(tmpdir):
"""
Test if the component runs correctly with store_full_path=False
"""
first_test_file = Path(tmpdir / "first_test_file.json")
second_test_file = Path(tmpdir / "second_test_file.json")
first_test_file.write_text(json.dumps(test_data[0]), "utf-8")
second_test_file.write_text(json.dumps(test_data[1]), "utf-8")
byte_stream = ByteStream.from_string(json.dumps(test_data[2]))
sources = [str(first_test_file), second_test_file, byte_stream]
converter = JSONConverter(
jq_schema='.laureates[] | .firstname + " " + .surname + " " + .motivation', store_full_path=False
)
result = converter.run(sources=sources)
assert len(result) == 1
assert len(result["documents"]) == 4
assert (
result["documents"][0].content
== "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and "
"upholding the dignity of the downtrodden"
)
assert result["documents"][0].meta == {"file_path": "first_test_file.json"}
assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors"
assert result["documents"][1].meta == {"file_path": "second_test_file.json"}
assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors"
assert result["documents"][2].meta == {"file_path": "second_test_file.json"}
assert (
result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new "
"radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
"reactions brought about by slow neutrons"
)
assert result["documents"][3].meta == {}


def test_run_with_non_json_file(tmpdir, caplog):
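    """
    Test that a source that is not valid JSON is skipped with a warning instead of raising.
    """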
test_file = Path(tmpdir / "test_file.md")
test_file.write_text("This is not a JSON file.", "utf-8")
sources = [test_file]
converter = JSONConverter(".laureates | .motivation")
caplog.clear()
with caplog.at_level(logging.WARNING):
result = converter.run(sources=sources)
records = caplog.records
assert len(records) == 1
assert (
records[0].msg
== f"Failed to extract text from {test_file}. Skipping it. Error: parse error: Invalid numeric literal at line 1, column 5"
)
assert result == {"documents": []}


def test_run_with_bad_filter(tmpdir, caplog):
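    """
    Test that a jq filter that does not match the data structure logs a warning and skips the source.
    """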
test_file = Path(tmpdir / "test_file.json")
test_file.write_text(json.dumps(test_data[0]), "utf-8")
sources = [test_file]
converter = JSONConverter(".laureates | .motivation")
caplog.clear()
with caplog.at_level(logging.WARNING):
result = converter.run(sources=sources)
records = caplog.records
assert len(records) == 1
assert (
records[0].msg
== f'Failed to extract text from {test_file}. Skipping it. Error: Cannot index array with string "motivation"'
)
assert result == {"documents": []}


def test_run_with_bad_encoding(tmpdir, caplog):
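    """
    Test that a file that is not UTF-8 encoded (written here as UTF-16) is skipped with a warning instead of raising.
    """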
test_file = Path(tmpdir / "test_file.json")
test_file.write_text(json.dumps(test_data[0]), "utf-16")
sources = [test_file]
converter = JSONConverter(".laureates")
caplog.clear()
with caplog.at_level(logging.WARNING):
result = converter.run(sources=sources)
records = caplog.records
assert len(records) == 1
assert records[0].msg.startswith(
f"Failed to extract text from {test_file}. Skipping it. Error: 'utf-8' codec can't decode byte"
)
assert result == {"documents": []}


def test_run_with_single_meta(tmpdir):
first_test_file = Path(tmpdir / "first_test_file.json")
second_test_file = Path(tmpdir / "second_test_file.json")
first_test_file.write_text(json.dumps(test_data[0]), "utf-8")
second_test_file.write_text(json.dumps(test_data[1]), "utf-8")
byte_stream = ByteStream.from_string(json.dumps(test_data[2]))
sources = [str(first_test_file), second_test_file, byte_stream]
meta = {"creation_date": "1945-05-25T00:00:00"}
converter = JSONConverter(jq_schema='.laureates[] | .firstname + " " + .surname + " " + .motivation')
result = converter.run(sources=sources, meta=meta)
assert len(result) == 1
assert len(result["documents"]) == 4
assert (
result["documents"][0].content
== "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and "
"upholding the dignity of the downtrodden"
)
assert result["documents"][0].meta == {
"file_path": os.path.basename(first_test_file),
"creation_date": "1945-05-25T00:00:00",
}
assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors"
assert result["documents"][1].meta == {
"file_path": os.path.basename(second_test_file),
"creation_date": "1945-05-25T00:00:00",
}
assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors"
assert result["documents"][2].meta == {
"file_path": os.path.basename(second_test_file),
"creation_date": "1945-05-25T00:00:00",
}
assert (
result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new "
"radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
"reactions brought about by slow neutrons"
)
assert result["documents"][3].meta == {"creation_date": "1945-05-25T00:00:00"}


def test_run_with_meta_list(tmpdir):
first_test_file = Path(tmpdir / "first_test_file.json")
second_test_file = Path(tmpdir / "second_test_file.json")
first_test_file.write_text(json.dumps(test_data[0]), "utf-8")
second_test_file.write_text(json.dumps(test_data[1]), "utf-8")
byte_stream = ByteStream.from_string(json.dumps(test_data[2]))
sources = [str(first_test_file), second_test_file, byte_stream]
meta = [
{"creation_date": "1945-05-25T00:00:00"},
{"creation_date": "1943-09-03T00:00:00"},
{"creation_date": "1989-11-09T00:00:00"},
]
converter = JSONConverter(jq_schema='.laureates[] | .firstname + " " + .surname + " " + .motivation')
result = converter.run(sources=sources, meta=meta)
assert len(result) == 1
assert len(result["documents"]) == 4
assert (
result["documents"][0].content
== "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and "
"upholding the dignity of the downtrodden"
)
assert result["documents"][0].meta == {
"file_path": os.path.basename(first_test_file),
"creation_date": "1945-05-25T00:00:00",
}
assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors"
assert result["documents"][1].meta == {
"file_path": os.path.basename(second_test_file),
"creation_date": "1943-09-03T00:00:00",
}
assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors"
assert result["documents"][2].meta == {
"file_path": os.path.basename(second_test_file),
"creation_date": "1943-09-03T00:00:00",
}
assert (
result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new "
"radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
"reactions brought about by slow neutrons"
)
assert result["documents"][3].meta == {"creation_date": "1989-11-09T00:00:00"}


def test_run_with_meta_list_of_differing_length(tmpdir):
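    """
    Test that a meta list whose length differs from the number of sources raises a ValueError.
    """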
sources = ["random_file.json"]
meta = [{}, {}]
converter = JSONConverter(jq_schema=".")
with pytest.raises(ValueError, match="The length of the metadata list must match the number of sources."):
converter.run(sources=sources, meta=meta)


def test_run_with_jq_schema_and_content_key(tmpdir):
first_test_file = Path(tmpdir / "first_test_file.json")
second_test_file = Path(tmpdir / "second_test_file.json")
first_test_file.write_text(json.dumps(test_data[0]), "utf-8")
second_test_file.write_text(json.dumps(test_data[1]), "utf-8")
byte_stream = ByteStream.from_string(json.dumps(test_data[2]))
sources = [str(first_test_file), second_test_file, byte_stream]
converter = JSONConverter(jq_schema=".laureates[]", content_key="motivation")
result = converter.run(sources=sources)
assert len(result) == 1
assert len(result["documents"]) == 4
assert (
result["documents"][0].content == "who emulates the jesters of the Middle Ages in scourging authority and "
"upholding the dignity of the downtrodden"
)
assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file)}
assert result["documents"][1].content == "for their discoveries of growth factors"
assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file)}
assert result["documents"][2].content == "for their discoveries of growth factors"
assert result["documents"][2].meta == {"file_path": os.path.basename(second_test_file)}
assert (
result["documents"][3].content == "for his demonstrations of the existence of new "
"radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
"reactions brought about by slow neutrons"
)
assert result["documents"][3].meta == {}


def test_run_with_jq_schema_content_key_and_extra_meta_fields(tmpdir):
first_test_file = Path(tmpdir / "first_test_file.json")
second_test_file = Path(tmpdir / "second_test_file.json")
first_test_file.write_text(json.dumps(test_data[0]), "utf-8")
second_test_file.write_text(json.dumps(test_data[1]), "utf-8")
byte_stream = ByteStream.from_string(json.dumps(test_data[2]))
sources = [str(first_test_file), second_test_file, byte_stream]
converter = JSONConverter(
jq_schema=".laureates[]", content_key="motivation", extra_meta_fields={"firstname", "surname"}
)
result = converter.run(sources=sources)
assert len(result) == 1
assert len(result["documents"]) == 4
assert (
result["documents"][0].content == "who emulates the jesters of the Middle Ages in scourging authority and "
"upholding the dignity of the downtrodden"
)
assert result["documents"][0].meta == {
"file_path": os.path.basename(first_test_file),
"firstname": "Dario",
"surname": "Fokin",
}
assert result["documents"][1].content == "for their discoveries of growth factors"
assert result["documents"][1].meta == {
"file_path": os.path.basename(second_test_file),
"firstname": "Stanley",
"surname": "Cohen",
}
assert result["documents"][2].content == "for their discoveries of growth factors"
assert result["documents"][2].meta == {
"file_path": os.path.basename(second_test_file),
"firstname": "Rita",
"surname": "Levi-Montalcini",
}
assert (
result["documents"][3].content == "for his demonstrations of the existence of new "
"radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
"reactions brought about by slow neutrons"
)
assert result["documents"][3].meta == {"firstname": "Enrico", "surname": "Fermi"}


def test_run_with_content_key(tmpdir):
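    """
    Test that with only a content_key each source yields a single document whose content is that top-level field.
    """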
first_test_file = Path(tmpdir / "first_test_file.json")
second_test_file = Path(tmpdir / "second_test_file.json")
first_test_file.write_text(json.dumps(test_data[0]), "utf-8")
second_test_file.write_text(json.dumps(test_data[1]), "utf-8")
byte_stream = ByteStream.from_string(json.dumps(test_data[2]))
sources = [str(first_test_file), second_test_file, byte_stream]
converter = JSONConverter(content_key="category")
result = converter.run(sources=sources)
assert len(result) == 1
assert len(result["documents"]) == 3
assert result["documents"][0].content == "literature"
assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file)}
assert result["documents"][1].content == "medicine"
assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file)}
assert result["documents"][2].content == "physics"
assert result["documents"][2].meta == {}


def test_run_with_content_key_and_extra_meta_fields(tmpdir):
first_test_file = Path(tmpdir / "first_test_file.json")
second_test_file = Path(tmpdir / "second_test_file.json")
first_test_file.write_text(json.dumps(test_data[0]), "utf-8")
second_test_file.write_text(json.dumps(test_data[1]), "utf-8")
byte_stream = ByteStream.from_string(json.dumps(test_data[2]))
sources = [str(first_test_file), second_test_file, byte_stream]
converter = JSONConverter(content_key="category", extra_meta_fields={"year"})
result = converter.run(sources=sources)
assert len(result) == 1
assert len(result["documents"]) == 3
assert result["documents"][0].content == "literature"
assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file), "year": "1997"}
assert result["documents"][1].content == "medicine"
assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file), "year": "1986"}
assert result["documents"][2].content == "physics"
assert result["documents"][2].meta == {"year": "1938"}


def test_run_with_jq_schema_content_key_and_extra_meta_fields_literal(tmpdir):
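    """
    Test that extra_meta_fields="*" copies all fields other than the content key into the document metadata.
    """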
first_test_file = Path(tmpdir / "first_test_file.json")
second_test_file = Path(tmpdir / "second_test_file.json")
first_test_file.write_text(json.dumps(test_data[0]), "utf-8")
second_test_file.write_text(json.dumps(test_data[1]), "utf-8")
byte_stream = ByteStream.from_string(json.dumps(test_data[2]))
sources = [str(first_test_file), second_test_file, byte_stream]
converter = JSONConverter(jq_schema=".laureates[]", content_key="motivation", extra_meta_fields="*")
result = converter.run(sources=sources)
assert len(result) == 1
assert len(result["documents"]) == 4
assert (
result["documents"][0].content
== "who emulates the jesters of the Middle Ages in scourging authority and upholding the dignity of the downtrodden"
)
assert result["documents"][0].meta == {
"file_path": os.path.basename(first_test_file),
"id": "674",
"firstname": "Dario",
"surname": "Fokin",
"share": "1",
}
assert result["documents"][1].content == "for their discoveries of growth factors"
assert result["documents"][1].meta == {
"file_path": os.path.basename(second_test_file),
"id": "434",
"firstname": "Stanley",
"surname": "Cohen",
"share": "2",
}
assert result["documents"][2].content == "for their discoveries of growth factors"
assert result["documents"][2].meta == {
"file_path": os.path.basename(second_test_file),
"id": "435",
"firstname": "Rita",
"surname": "Levi-Montalcini",
"share": "2",
}
assert (
result["documents"][3].content
== "for his demonstrations of the existence of new radioactive elements produced by neutron irradiation, "
"and for his related discovery of nuclear reactions brought about by slow neutrons"
)
assert result["documents"][3].meta == {"id": "46", "firstname": "Enrico", "surname": "Fermi", "share": "1"}