fix: Fix JSONConverter to properly skip files that are not utf-8 encoded (#8775)

* Small fix

* Add reno

* Trying out license header fix here
This commit is contained in:
Sebastian Husch Lee 2025-01-28 01:29:55 -08:00 committed by GitHub
parent e3dc164625
commit bba84e5517
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 24 additions and 0 deletions

View File

@ -194,6 +194,7 @@ class JSONConverter:
source=source.meta["file_path"],
error=exc,
)
return []
meta_fields = self._meta_fields or set()

View File

@ -0,0 +1,4 @@
---
fixes:
- |
Fixed JSONConverter to properly skip JSON files that are not UTF-8 encoded.

View File

@ -236,6 +236,25 @@ def test_run_with_bad_filter(tmpdir, caplog):
assert result == {"documents": []}
def test_run_with_bad_encoding(tmpdir, caplog):
    """A source file that cannot be decoded as utf-8 is skipped with one warning."""
    # Persist valid JSON, but encoded as utf-16 so the utf-8 decode fails.
    test_file = Path(tmpdir / "test_file.json")
    test_file.write_text(json.dumps(test_data[0]), "utf-16")

    converter = JSONConverter(".laureates")

    # Drop anything logged during setup so only the run's records are counted.
    caplog.clear()
    with caplog.at_level(logging.WARNING):
        output = converter.run(sources=[test_file])

    captured = caplog.records
    assert len(captured) == 1

    # Only the prefix is compared; the tail of the codec error is left unchecked.
    expected_prefix = (
        f"Failed to extract text from {test_file}. Skipping it. Error: 'utf-8' codec can't decode byte"
    )
    assert captured[0].msg.startswith(expected_prefix)

    # The undecodable file must not contribute any documents.
    assert output == {"documents": []}
def test_run_with_single_meta(tmpdir):
first_test_file = Path(tmpdir / "first_test_file.json")
second_test_file = Path(tmpdir / "second_test_file.json")