# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
import json
import logging
from pathlib import Path
from unittest.mock import patch

import pytest

from haystack.components.converters import JSONConverter
from haystack.dataclasses import ByteStream
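
# Sample laureate records used as shared fixture data by the tests below.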
test_data = [
    {
        "year": "1997",
        "category": "literature",
        "laureates": [
            {
                "id": "674",
                "firstname": "Dario",
                "surname": "Fokin",
                "motivation": "who emulates the jesters of the Middle Ages in scourging authority and upholding the dignity of the downtrodden",
                "share": "1",
            }
        ],
    },
    {
        "year": "1986",
        "category": "medicine",
        "laureates": [
            {
                "id": "434",
                "firstname": "Stanley",
                "surname": "Cohen",
                "motivation": "for their discoveries of growth factors",
                "share": "2",
            },
            {
                "id": "435",
                "firstname": "Rita",
                "surname": "Levi-Montalcini",
                "motivation": "for their discoveries of growth factors",
                "share": "2",
            },
        ],
    },
    {
        "year": "1938",
        "category": "physics",
        "laureates": [
            {
                "id": "46",
                "firstname": "Enrico",
                "surname": "Fermi",
                "motivation": "for his demonstrations of the existence of new radioactive elements produced by neutron irradiation, and for his related discovery of nuclear reactions brought about by slow neutrons",
                "share": "1",
            }
        ],
    },
]


def test_init_without_jq_schema_and_content_key():
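    """Initializing without `jq_schema` and `content_key` raises a ValueError."""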
    with pytest.raises(
        ValueError, match="No `jq_schema` nor `content_key` specified. Set either or both to extract data."
    ):
        JSONConverter()


@patch("haystack.components.converters.json.jq_import")
def test_init_without_jq_schema_and_missing_dependency(jq_import):
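    """When only `content_key` is set, the optional `jq` dependency must not be checked."""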
    converter = JSONConverter(content_key="foo")
    jq_import.check.assert_not_called()
    assert converter._jq_schema is None
    assert converter._content_key == "foo"
    assert converter._meta_fields is None


@patch("haystack.components.converters.json.jq_import")
def test_init_with_jq_schema_and_missing_dependency(jq_import):
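    """If `jq` is not installed, setting `jq_schema` raises ImportError at init time."""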
    jq_import.check.side_effect = ImportError
    with pytest.raises(ImportError):
        JSONConverter(jq_schema=".laureates[].motivation")


def test_init_with_jq_schema():
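    """Initializing with only `jq_schema` stores it and leaves `content_key` and meta fields unset."""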
    converter = JSONConverter(jq_schema=".")
    assert converter._jq_schema == "."
    assert converter._content_key is None
    assert converter._meta_fields is None


def test_to_dict():
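    """to_dict() serializes all init parameters, including the default `store_full_path`."""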
    converter = JSONConverter(
        jq_schema=".laureates[]", content_key="motivation", extra_meta_fields={"firstname", "surname"}
    )
    assert converter.to_dict() == {
        "type": "haystack.components.converters.json.JSONConverter",
        "init_parameters": {
            "content_key": "motivation",
            "jq_schema": ".laureates[]",
            "extra_meta_fields": {"firstname", "surname"},
            "store_full_path": True,
        },
    }


def test_from_dict():
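    """from_dict() restores the converter with the serialized init parameters."""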
    data = {
        "type": "haystack.components.converters.json.JSONConverter",
        "init_parameters": {
            "content_key": "motivation",
            "jq_schema": ".laureates[]",
            "extra_meta_fields": ["firstname", "surname"],
            "store_full_path": True,
        },
    }
    converter = JSONConverter.from_dict(data)
    assert converter._jq_schema == ".laureates[]"
    assert converter._content_key == "motivation"
    assert converter._meta_fields == ["firstname", "surname"]


def test_run(tmpdir):
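    """run() creates one Document per laureate selected by the jq filter, from file paths and ByteStream sources."""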
    first_test_file = Path(tmpdir / "first_test_file.json")
    second_test_file = Path(tmpdir / "second_test_file.json")
    first_test_file.write_text(json.dumps(test_data[0]), "utf-8")
    second_test_file.write_text(json.dumps(test_data[1]), "utf-8")
    byte_stream = ByteStream.from_string(json.dumps(test_data[2]))
    sources = [str(first_test_file), second_test_file, byte_stream]
    converter = JSONConverter(jq_schema='.laureates[] | .firstname + " " + .surname + " " + .motivation')
    result = converter.run(sources=sources)
    assert len(result) == 1
    assert len(result["documents"]) == 4
    assert (
        result["documents"][0].content
        == "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and "
        "upholding the dignity of the downtrodden"
    )
    assert result["documents"][0].meta == {"file_path": str(first_test_file)}
    assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors"
    assert result["documents"][1].meta == {"file_path": str(second_test_file)}
    assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors"
    assert result["documents"][2].meta == {"file_path": str(second_test_file)}
    assert (
        result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new "
        "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
        "reactions brought about by slow neutrons"
    )
    assert result["documents"][3].meta == {}


def test_run_with_store_full_path_false(tmpdir):
    """
    Test that run() stores only the file name in the metadata when store_full_path=False.
    """
    first_test_file = Path(tmpdir / "first_test_file.json")
    second_test_file = Path(tmpdir / "second_test_file.json")
    first_test_file.write_text(json.dumps(test_data[0]), "utf-8")
    second_test_file.write_text(json.dumps(test_data[1]), "utf-8")
    byte_stream = ByteStream.from_string(json.dumps(test_data[2]))
    sources = [str(first_test_file), second_test_file, byte_stream]
    converter = JSONConverter(
        jq_schema='.laureates[] | .firstname + " " + .surname + " " + .motivation', store_full_path=False
    )
    result = converter.run(sources=sources)
    assert len(result) == 1
    assert len(result["documents"]) == 4
    assert (
        result["documents"][0].content
        == "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and "
        "upholding the dignity of the downtrodden"
    )
    assert result["documents"][0].meta == {"file_path": "first_test_file.json"}
    assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors"
    assert result["documents"][1].meta == {"file_path": "second_test_file.json"}
    assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors"
    assert result["documents"][2].meta == {"file_path": "second_test_file.json"}
    assert (
        result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new "
        "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
        "reactions brought about by slow neutrons"
    )
    assert result["documents"][3].meta == {}


def test_run_with_non_json_file(tmpdir, caplog):
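    """Sources that are not valid JSON are skipped with a warning and produce no documents."""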
    test_file = Path(tmpdir / "test_file.md")
    test_file.write_text("This is not a JSON file.", "utf-8")
    sources = [test_file]
    converter = JSONConverter(".laureates | .motivation")
    caplog.clear()
    with caplog.at_level(logging.WARNING):
        result = converter.run(sources=sources)
        records = caplog.records
        assert len(records) == 1
        assert (
            records[0].msg
            == f"Failed to extract text from {test_file}. Skipping it. Error: parse error: Invalid numeric literal at line 1, column 5"
        )
    assert result == {"documents": []}


def test_run_with_bad_filter(tmpdir, caplog):
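    """A jq filter that cannot be applied to the data logs a warning and the source is skipped."""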
    test_file = Path(tmpdir / "test_file.json")
    test_file.write_text(json.dumps(test_data[0]), "utf-8")
    sources = [test_file]
    converter = JSONConverter(".laureates | .motivation")
    caplog.clear()
    with caplog.at_level(logging.WARNING):
        result = converter.run(sources=sources)
        records = caplog.records
        assert len(records) == 1
        assert (
            records[0].msg
            == f'Failed to extract text from {test_file}. Skipping it. Error: Cannot index array with string "motivation"'
        )
    assert result == {"documents": []}


def test_run_with_single_meta(tmpdir):
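    """A single meta dict is merged into the metadata of every produced document."""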
    first_test_file = Path(tmpdir / "first_test_file.json")
    second_test_file = Path(tmpdir / "second_test_file.json")
    first_test_file.write_text(json.dumps(test_data[0]), "utf-8")
    second_test_file.write_text(json.dumps(test_data[1]), "utf-8")
    byte_stream = ByteStream.from_string(json.dumps(test_data[2]))
    sources = [str(first_test_file), second_test_file, byte_stream]
    meta = {"creation_date": "1945-05-25T00:00:00"}
    converter = JSONConverter(jq_schema='.laureates[] | .firstname + " " + .surname + " " + .motivation')
    result = converter.run(sources=sources, meta=meta)
    assert len(result) == 1
    assert len(result["documents"]) == 4
    assert (
        result["documents"][0].content
        == "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and "
        "upholding the dignity of the downtrodden"
    )
    assert result["documents"][0].meta == {"file_path": str(first_test_file), "creation_date": "1945-05-25T00:00:00"}
    assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors"
    assert result["documents"][1].meta == {"file_path": str(second_test_file), "creation_date": "1945-05-25T00:00:00"}
    assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors"
    assert result["documents"][2].meta == {"file_path": str(second_test_file), "creation_date": "1945-05-25T00:00:00"}
    assert (
        result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new "
        "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
        "reactions brought about by slow neutrons"
    )
    assert result["documents"][3].meta == {"creation_date": "1945-05-25T00:00:00"}


def test_run_with_meta_list(tmpdir):
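    """Each meta dict in the list is applied to the documents produced from the source at the same position."""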
    first_test_file = Path(tmpdir / "first_test_file.json")
    second_test_file = Path(tmpdir / "second_test_file.json")
    first_test_file.write_text(json.dumps(test_data[0]), "utf-8")
    second_test_file.write_text(json.dumps(test_data[1]), "utf-8")
    byte_stream = ByteStream.from_string(json.dumps(test_data[2]))
    sources = [str(first_test_file), second_test_file, byte_stream]
    meta = [
        {"creation_date": "1945-05-25T00:00:00"},
        {"creation_date": "1943-09-03T00:00:00"},
        {"creation_date": "1989-11-09T00:00:00"},
    ]
    converter = JSONConverter(jq_schema='.laureates[] | .firstname + " " + .surname + " " + .motivation')
    result = converter.run(sources=sources, meta=meta)
    assert len(result) == 1
    assert len(result["documents"]) == 4
    assert (
        result["documents"][0].content
        == "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and "
        "upholding the dignity of the downtrodden"
    )
    assert result["documents"][0].meta == {"file_path": str(first_test_file), "creation_date": "1945-05-25T00:00:00"}
    assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors"
    assert result["documents"][1].meta == {"file_path": str(second_test_file), "creation_date": "1943-09-03T00:00:00"}
    assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors"
    assert result["documents"][2].meta == {"file_path": str(second_test_file), "creation_date": "1943-09-03T00:00:00"}
    assert (
        result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new "
        "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
        "reactions brought about by slow neutrons"
    )
    assert result["documents"][3].meta == {"creation_date": "1989-11-09T00:00:00"}


def test_run_with_meta_list_of_differing_length(tmpdir):
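    """A meta list whose length differs from the number of sources raises a ValueError."""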
    sources = ["random_file.json"]
    meta = [{}, {}]
    converter = JSONConverter(jq_schema=".")
    with pytest.raises(ValueError, match="The length of the metadata list must match the number of sources."):
        converter.run(sources=sources, meta=meta)


def test_run_with_jq_schema_and_content_key(tmpdir):
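    """When both `jq_schema` and `content_key` are given, the content is read from that key of each filtered object."""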
    first_test_file = Path(tmpdir / "first_test_file.json")
    second_test_file = Path(tmpdir / "second_test_file.json")
    first_test_file.write_text(json.dumps(test_data[0]), "utf-8")
    second_test_file.write_text(json.dumps(test_data[1]), "utf-8")
    byte_stream = ByteStream.from_string(json.dumps(test_data[2]))
    sources = [str(first_test_file), second_test_file, byte_stream]
    converter = JSONConverter(jq_schema=".laureates[]", content_key="motivation")
    result = converter.run(sources=sources)
    assert len(result) == 1
    assert len(result["documents"]) == 4
    assert (
        result["documents"][0].content == "who emulates the jesters of the Middle Ages in scourging authority and "
        "upholding the dignity of the downtrodden"
    )
    assert result["documents"][0].meta == {"file_path": str(first_test_file)}
    assert result["documents"][1].content == "for their discoveries of growth factors"
    assert result["documents"][1].meta == {"file_path": str(second_test_file)}
    assert result["documents"][2].content == "for their discoveries of growth factors"
    assert result["documents"][2].meta == {"file_path": str(second_test_file)}
    assert (
        result["documents"][3].content == "for his demonstrations of the existence of new "
        "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
        "reactions brought about by slow neutrons"
    )
    assert result["documents"][3].meta == {}


def test_run_with_jq_schema_content_key_and_extra_meta_fields(tmpdir):
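    """Fields named in `extra_meta_fields` are copied from each filtered object into the document metadata."""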
    first_test_file = Path(tmpdir / "first_test_file.json")
    second_test_file = Path(tmpdir / "second_test_file.json")
    first_test_file.write_text(json.dumps(test_data[0]), "utf-8")
    second_test_file.write_text(json.dumps(test_data[1]), "utf-8")
    byte_stream = ByteStream.from_string(json.dumps(test_data[2]))
    sources = [str(first_test_file), second_test_file, byte_stream]
    converter = JSONConverter(
        jq_schema=".laureates[]", content_key="motivation", extra_meta_fields={"firstname", "surname"}
    )
    result = converter.run(sources=sources)
    assert len(result) == 1
    assert len(result["documents"]) == 4
    assert (
        result["documents"][0].content == "who emulates the jesters of the Middle Ages in scourging authority and "
        "upholding the dignity of the downtrodden"
    )
    assert result["documents"][0].meta == {"file_path": str(first_test_file), "firstname": "Dario", "surname": "Fokin"}
    assert result["documents"][1].content == "for their discoveries of growth factors"
    assert result["documents"][1].meta == {
        "file_path": str(second_test_file),
        "firstname": "Stanley",
        "surname": "Cohen",
    }
    assert result["documents"][2].content == "for their discoveries of growth factors"
    assert result["documents"][2].meta == {
        "file_path": str(second_test_file),
        "firstname": "Rita",
        "surname": "Levi-Montalcini",
    }
    assert (
        result["documents"][3].content == "for his demonstrations of the existence of new "
        "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear "
        "reactions brought about by slow neutrons"
    )
    assert result["documents"][3].meta == {"firstname": "Enrico", "surname": "Fermi"}


def test_run_with_content_key(tmpdir):
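    """With only `content_key`, each source yields one document with the value of that top-level key as content."""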
    first_test_file = Path(tmpdir / "first_test_file.json")
    second_test_file = Path(tmpdir / "second_test_file.json")
    first_test_file.write_text(json.dumps(test_data[0]), "utf-8")
    second_test_file.write_text(json.dumps(test_data[1]), "utf-8")
    byte_stream = ByteStream.from_string(json.dumps(test_data[2]))
    sources = [str(first_test_file), second_test_file, byte_stream]
    converter = JSONConverter(content_key="category")
    result = converter.run(sources=sources)
    assert len(result) == 1
    assert len(result["documents"]) == 3
    assert result["documents"][0].content == "literature"
    assert result["documents"][0].meta == {"file_path": str(first_test_file)}
    assert result["documents"][1].content == "medicine"
    assert result["documents"][1].meta == {"file_path": str(second_test_file)}
    assert result["documents"][2].content == "physics"
    assert result["documents"][2].meta == {}


def test_run_with_content_key_and_extra_meta_fields(tmpdir):
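    """`extra_meta_fields` also works without a jq filter, copying top-level fields into the metadata."""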
    first_test_file = Path(tmpdir / "first_test_file.json")
    second_test_file = Path(tmpdir / "second_test_file.json")
    first_test_file.write_text(json.dumps(test_data[0]), "utf-8")
    second_test_file.write_text(json.dumps(test_data[1]), "utf-8")
    byte_stream = ByteStream.from_string(json.dumps(test_data[2]))
    sources = [str(first_test_file), second_test_file, byte_stream]
    converter = JSONConverter(content_key="category", extra_meta_fields={"year"})
    result = converter.run(sources=sources)
    assert len(result) == 1
    assert len(result["documents"]) == 3
    assert result["documents"][0].content == "literature"
    assert result["documents"][0].meta == {"file_path": str(first_test_file), "year": "1997"}
    assert result["documents"][1].content == "medicine"
    assert result["documents"][1].meta == {"file_path": str(second_test_file), "year": "1986"}
    assert result["documents"][2].content == "physics"
    assert result["documents"][2].meta == {"year": "1938"}


def test_run_with_jq_schema_content_key_and_extra_meta_fields_literal(tmpdir):
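    """Passing the literal "*" as `extra_meta_fields` copies every other field of each filtered object into the metadata."""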
    first_test_file = Path(tmpdir / "first_test_file.json")
    second_test_file = Path(tmpdir / "second_test_file.json")
    first_test_file.write_text(json.dumps(test_data[0]), "utf-8")
    second_test_file.write_text(json.dumps(test_data[1]), "utf-8")
    byte_stream = ByteStream.from_string(json.dumps(test_data[2]))
    sources = [str(first_test_file), second_test_file, byte_stream]
    converter = JSONConverter(jq_schema=".laureates[]", content_key="motivation", extra_meta_fields="*")
    result = converter.run(sources=sources)
    assert len(result) == 1
    assert len(result["documents"]) == 4
    assert (
        result["documents"][0].content
        == "who emulates the jesters of the Middle Ages in scourging authority and upholding the dignity of the downtrodden"
    )
    assert result["documents"][0].meta == {
        "file_path": str(first_test_file),
        "id": "674",
        "firstname": "Dario",
        "surname": "Fokin",
        "share": "1",
    }
    assert result["documents"][1].content == "for their discoveries of growth factors"
    assert result["documents"][1].meta == {
        "file_path": str(second_test_file),
        "id": "434",
        "firstname": "Stanley",
        "surname": "Cohen",
        "share": "2",
    }
    assert result["documents"][2].content == "for their discoveries of growth factors"
    assert result["documents"][2].meta == {
        "file_path": str(second_test_file),
        "id": "435",
        "firstname": "Rita",
        "surname": "Levi-Montalcini",
        "share": "2",
    }
    assert (
        result["documents"][3].content
        == "for his demonstrations of the existence of new radioactive elements produced by neutron irradiation, "
        "and for his related discovery of nuclear reactions brought about by slow neutrons"
    )
    assert result["documents"][3].meta == {"id": "46", "firstname": "Enrico", "surname": "Fermi", "share": "1"}