mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-03 12:27:04 +00:00
feat: Implement utility functions for reading and writing .jsonl
files (#22)
* Implement save_as_jsonl and read_from_jsonl utility functions * Add unit tests for save_as_jsonl and read_from_jsonl utility functions * Add example of using save_as_jsonl with prodigy staging brick * Bump version and update changelog * remove accidentally added prodigy json file * added "the" in jsonl description Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
This commit is contained in:
parent
a950559b94
commit
28a4ae985d
@ -1,5 +1,6 @@
|
||||
## 0.2.1-dev3
|
||||
## 0.2.1-dev4
|
||||
|
||||
* Added utility function for JSONL reading and writing
|
||||
* Added staging brick for CSV format for Prodigy
|
||||
* Added staging brick for Prodigy
|
||||
* Added ability to upload LabelStudio annotations
|
||||
|
@ -477,6 +477,24 @@ Examples:
|
||||
json.dump(prodigy_data, f, indent=4)
|
||||
|
||||
|
||||
**Note**: Prodigy recommends the ``.jsonl`` format for feeding data to API loaders. After running ``stage_for_prodigy``, you can
|
||||
use the ``save_as_jsonl`` utility function to save the formatted data to a ``.jsonl`` file that is ready to be used with Prodigy.
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.documents.elements import Title, NarrativeText
|
||||
from unstructured.staging.prodigy import stage_for_prodigy
|
||||
from unstructured.utils import save_as_jsonl
|
||||
|
||||
elements = [Title(text="Title"), NarrativeText(text="Narrative")]
|
||||
metadata = [{"type": "title"}, {"type": "text"}]
|
||||
prodigy_data = stage_for_prodigy(elements, metadata)
|
||||
|
||||
# The resulting jsonl file is ready to be used with Prodigy.
|
||||
save_as_jsonl(prodigy_data, "prodigy.jsonl")
|
||||
|
||||
|
||||
|
||||
``stage_csv_for_prodigy``
|
||||
--------------------------
|
||||
|
||||
|
15
prodigy.json
15
prodigy.json
@ -1,15 +0,0 @@
|
||||
[
|
||||
{
|
||||
"text": "Title",
|
||||
"meta": {
|
||||
"type": "text",
|
||||
"id": "7e8cd2056da73a7fefb6cd91f4e5d199"
|
||||
}
|
||||
},
|
||||
{
|
||||
"text": "Narrative",
|
||||
"meta": {
|
||||
"id": "ed2e59c337d01185f388a4e9334d6f2e"
|
||||
}
|
||||
}
|
||||
]
|
38
test_unstructured/test_utils.py
Normal file
38
test_unstructured/test_utils.py
Normal file
@ -0,0 +1,38 @@
|
||||
import os
|
||||
import json
|
||||
import pytest
|
||||
|
||||
import unstructured.utils as utils
|
||||
|
||||
|
||||
@pytest.fixture
def input_data():
    """Provide two sample records, one with and one without a ``meta`` entry."""
    plain_record = {"text": "This is a sentence."}
    record_with_meta = {"text": "This is another sentence.", "meta": {"score": 0.1}}
    return [plain_record, record_with_meta]
|
||||
|
||||
|
||||
@pytest.fixture
def output_jsonl_file(tmp_path):
    """Return the location of ``output.jsonl`` inside the ``tmp_path`` directory."""
    target_name = "output.jsonl"
    return os.path.join(tmp_path, target_name)
|
||||
|
||||
|
||||
@pytest.fixture
def input_jsonl_file(tmp_path, input_data):
    """Write ``input_data`` to ``input.jsonl`` under ``tmp_path`` and return its path."""
    jsonl_path = os.path.join(tmp_path, "input.jsonl")
    # One JSON document per line, each terminated by a newline.
    serialized = "".join(json.dumps(record) + "\n" for record in input_data)
    with open(jsonl_path, "w+") as handle:
        handle.write(serialized)
    return jsonl_path
|
||||
|
||||
|
||||
def test_save_as_jsonl(input_data, output_jsonl_file):
    """Records written by ``save_as_jsonl`` parse back, line by line, to the input."""
    utils.save_as_jsonl(input_data, output_jsonl_file)
    with open(output_jsonl_file, "r") as saved:
        raw_lines = saved.readlines()
    parsed = [json.loads(raw) for raw in raw_lines]
    assert parsed == input_data
|
||||
|
||||
|
||||
def test_read_as_jsonl(input_jsonl_file, input_data):
    """``read_from_jsonl`` returns the same records that were written to the file."""
    loaded_records = utils.read_from_jsonl(input_jsonl_file)
    assert loaded_records == input_data
|
@ -1 +1 @@
|
||||
__version__ = "0.2.1-dev3" # pragma: no cover
|
||||
__version__ = "0.2.1-dev4" # pragma: no cover
|
||||
|
13
unstructured/utils.py
Normal file
13
unstructured/utils.py
Normal file
@ -0,0 +1,13 @@
|
||||
from typing import List, Dict
|
||||
|
||||
import json
|
||||
|
||||
|
||||
def save_as_jsonl(data: List[Dict], filename: str) -> None:
    """Write each dictionary in ``data`` to ``filename`` as one JSON object per line.

    Any existing content of ``filename`` is overwritten. The file is written
    as UTF-8 so the result does not depend on the platform's default encoding.

    Args:
        data: The records to serialize; each must be JSON-serializable.
        filename: Path of the ``.jsonl`` file to create or overwrite.
    """
    # Write-only mode is sufficient: the file is never read back here.
    with open(filename, "w", encoding="utf-8") as output_file:
        output_file.writelines(json.dumps(datum) + "\n" for datum in data)
|
||||
|
||||
|
||||
def read_from_jsonl(filename: str) -> List[Dict]:
    """Read a ``.jsonl`` file and return its records in file order.

    The file is decoded as UTF-8 regardless of the platform's default
    encoding. Blank or whitespace-only lines (for example a trailing empty
    line) are skipped rather than being passed to the JSON parser.

    Args:
        filename: Path of the ``.jsonl`` file to read.

    Returns:
        One parsed JSON value per non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filename, "r", encoding="utf-8") as input_file:
        return [json.loads(line) for line in input_file if line.strip()]
|
Loading…
x
Reference in New Issue
Block a user