fix: Fix JSONConverter to properly skip files that are not utf-8 encoded (#8775)

* Small fix
* Add reno
* Trying out license header fix here
2025-11-30 17:06:08 +00:00 · 2025-01-28 01:29:55 -08:00 · 2025-01-28 01:29:55 -08:00 · bba84e5517
commit bba84e5517
parent e3dc164625
3 changed files with 24 additions and 0 deletions
--- a/haystack/components/converters/json.py
+++ b/haystack/components/converters/json.py
@ -194,6 +194,7 @@ class JSONConverter:
                source=source.meta["file_path"],
                error=exc,
            )
+            return []

        meta_fields = self._meta_fields or set()

--- a/releasenotes/notes/fix-json-converter-non-utf8-3a755df732a8cbd5.yaml
+++ b/releasenotes/notes/fix-json-converter-non-utf8-3a755df732a8cbd5.yaml
@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    Fixed JSONConverter to properly skip converting JSON files that are not utf-8 encoded.
--- a/test/components/converters/test_json.py
+++ b/test/components/converters/test_json.py
@ -236,6 +236,25 @@ def test_run_with_bad_filter(tmpdir, caplog):
    assert result == {"documents": []}


+def test_run_with_bad_encoding(tmpdir, caplog):
+    test_file = Path(tmpdir / "test_file.json")
+    test_file.write_text(json.dumps(test_data[0]), "utf-16")
+
+    sources = [test_file]
+    converter = JSONConverter(".laureates")
+
+    caplog.clear()
+    with caplog.at_level(logging.WARNING):
+        result = converter.run(sources=sources)
+
+    records = caplog.records
+    assert len(records) == 1
+    assert records[0].msg.startswith(
+        f"Failed to extract text from {test_file}. Skipping it. Error: 'utf-8' codec can't decode byte"
+    )
+    assert result == {"documents": []}
+
+
 def test_run_with_single_meta(tmpdir):
    first_test_file = Path(tmpdir / "first_test_file.json")
    second_test_file = Path(tmpdir / "second_test_file.json")