mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-03 20:33:35 +00:00
feat: Implement utility functions for reading and writing .jsonl
files (#22)
* Implement save_as_jsonl and read_from_jsonl utility functions * Add unit tests for save_as_jsonl and read_from_jsonl utility functions * Add example of using save_as_jsonl with prodigy staging brick * Bump version and update changelog * remove accidentally added prodigy json file * added "the" in jsonl description Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
This commit is contained in:
parent
a950559b94
commit
28a4ae985d
@ -1,5 +1,6 @@
|
|||||||
## 0.2.1-dev3
|
## 0.2.1-dev4
|
||||||
|
|
||||||
|
* Added utility functions for JSONL reading and writing
|
||||||
* Added staging brick for CSV format for Prodigy
|
* Added staging brick for CSV format for Prodigy
|
||||||
* Added staging brick for Prodigy
|
* Added staging brick for Prodigy
|
||||||
* Added ability to upload LabelStudio annotations
|
* Added ability to upload LabelStudio annotations
|
||||||
|
@ -477,6 +477,24 @@ Examples:
|
|||||||
json.dump(prodigy_data, f, indent=4)
|
json.dump(prodigy_data, f, indent=4)
|
||||||
|
|
||||||
|
|
||||||
|
**Note**: Prodigy recommends ``.jsonl`` format for feeding data to API loaders. After running ``stage_for_prodigy``, you can
|
||||||
|
use the ``save_as_jsonl`` utility function to save the formatted data to a ``.jsonl`` file that is ready to be used with Prodigy.
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from unstructured.documents.elements import Title, NarrativeText
|
||||||
|
from unstructured.staging.prodigy import stage_for_prodigy
|
||||||
|
from unstructured.utils import save_as_jsonl
|
||||||
|
|
||||||
|
elements = [Title(text="Title"), NarrativeText(text="Narrative")]
|
||||||
|
metadata = [{"type": "title"}, {"type": "text"}]
|
||||||
|
prodigy_data = stage_for_prodigy(elements, metadata)
|
||||||
|
|
||||||
|
# The resulting jsonl file is ready to be used with Prodigy.
|
||||||
|
save_as_jsonl(prodigy_data, "prodigy.jsonl")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
``stage_csv_for_prodigy``
|
``stage_csv_for_prodigy``
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|
||||||
|
15
prodigy.json
15
prodigy.json
@ -1,15 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"text": "Title",
|
|
||||||
"meta": {
|
|
||||||
"type": "text",
|
|
||||||
"id": "7e8cd2056da73a7fefb6cd91f4e5d199"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"text": "Narrative",
|
|
||||||
"meta": {
|
|
||||||
"id": "ed2e59c337d01185f388a4e9334d6f2e"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
38
test_unstructured/test_utils.py
Normal file
38
test_unstructured/test_utils.py
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
import os
|
||||||
|
import json
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
import unstructured.utils as utils
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def input_data():
    """Provide the sample records shared by the JSONL round-trip tests."""
    # One flat record and one with a nested "meta" dict.
    plain_record = {"text": "This is a sentence."}
    record_with_meta = {"text": "This is another sentence.", "meta": {"score": 0.1}}
    return [plain_record, record_with_meta]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def output_jsonl_file(tmp_path):
    """Return the path of a not-yet-created ``output.jsonl`` inside ``tmp_path``."""
    target_path = os.path.join(tmp_path, "output.jsonl")
    return target_path
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def input_jsonl_file(tmp_path, input_data):
    """Write ``input_data`` to ``input.jsonl`` inside ``tmp_path`` and return its path.

    Each record is serialized with ``json.dumps`` and placed on its own line.
    """
    file_path = os.path.join(tmp_path, "input.jsonl")
    with open(file_path, "w+") as input_file:
        for record in input_data:
            input_file.write(json.dumps(record) + "\n")
    return file_path
|
||||||
|
|
||||||
|
|
||||||
|
def test_save_as_jsonl(input_data, output_jsonl_file):
    """Check that ``save_as_jsonl`` writes one parseable JSON record per line."""
    utils.save_as_jsonl(input_data, output_jsonl_file)

    # Parse the file back line by line and compare with what was saved.
    parsed_records = []
    with open(output_jsonl_file, "r") as output_file:
        for raw_line in output_file:
            parsed_records.append(json.loads(raw_line))

    assert parsed_records == input_data
|
||||||
|
|
||||||
|
|
||||||
|
def test_read_as_jsonl(input_jsonl_file, input_data):
    """Check that ``read_from_jsonl`` returns the records stored in the file."""
    assert utils.read_from_jsonl(input_jsonl_file) == input_data
|
@ -1 +1 @@
|
|||||||
__version__ = "0.2.1-dev3" # pragma: no cover
|
__version__ = "0.2.1-dev4" # pragma: no cover
|
||||||
|
13
unstructured/utils.py
Normal file
13
unstructured/utils.py
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
from typing import List, Dict
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
def save_as_jsonl(data: List[Dict], filename: str) -> None:
    """Write a list of dictionaries to ``filename`` in JSON Lines format.

    Each dictionary is serialized with ``json.dumps`` and written on its own
    line, terminated by a newline. An existing file is overwritten.

    Args:
        data: The JSON-serializable dictionaries to write, one per line.
        filename: Path of the ``.jsonl`` file to create or overwrite.

    Raises:
        TypeError: If an item in ``data`` is not JSON serializable.
        OSError: If ``filename`` cannot be opened for writing.
    """
    # Write-only mode is sufficient here, and the explicit encoding keeps the
    # output independent of the platform's default locale encoding.
    with open(filename, "w", encoding="utf-8") as output_file:
        output_file.writelines(json.dumps(datum) + "\n" for datum in data)
|
||||||
|
|
||||||
|
|
||||||
|
def read_from_jsonl(filename: str) -> List[Dict]:
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line of ``filename`` is parsed with ``json.loads``; the
    parsed values are returned in file order.

    Args:
        filename: Path of the ``.jsonl`` file to read.

    Returns:
        The parsed records, one per non-blank line. An empty file yields an
        empty list.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If ``filename`` cannot be opened for reading.
    """
    # JSON text is decoded as UTF-8 explicitly so files containing non-ASCII
    # characters load the same way regardless of the platform's locale.
    with open(filename, "r", encoding="utf-8") as input_file:
        # Blank lines (e.g. a trailing empty line) carry no record; skipping
        # them avoids a spurious decode error.
        return [json.loads(line) for line in input_file if line.strip()]
|
Loading…
x
Reference in New Issue
Block a user