feat: Implement utility functions for reading and writing .jsonl files (#22)

* Implement save_as_jsonl and read_from_jsonl utility functions

* Add unit tests for save_as_jsonl and read_from_jsonl utility functions

* Add example of using save_as_jsonl with prodigy staging brick

* Bump version and update changelog

* remove accidentally added prodigy json file

* added "the" in jsonl description

Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
This commit is contained in:
asymness 2022-10-04 18:51:11 +05:00 committed by GitHub
parent a950559b94
commit 28a4ae985d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 72 additions and 17 deletions

View File

@ -1,5 +1,6 @@
## 0.2.1-dev3
## 0.2.1-dev4
* Added utility functions for JSONL reading and writing
* Added staging brick for CSV format for Prodigy
* Added staging brick for Prodigy
* Added ability to upload LabelStudio annotations

View File

@ -477,6 +477,24 @@ Examples:
json.dump(prodigy_data, f, indent=4)
**Note**: Prodigy recommends the ``.jsonl`` format for feeding data to API loaders. After running ``stage_for_prodigy``, you can
use the ``save_as_jsonl`` utility function to save the formatted data to a ``.jsonl`` file that is ready to be used with Prodigy.
.. code:: python
from unstructured.documents.elements import Title, NarrativeText
from unstructured.staging.prodigy import stage_for_prodigy
from unstructured.utils import save_as_jsonl
elements = [Title(text="Title"), NarrativeText(text="Narrative")]
metadata = [{"type": "title"}, {"type": "text"}]
prodigy_data = stage_for_prodigy(elements, metadata)
# The resulting jsonl file is ready to be used with Prodigy.
save_as_jsonl(prodigy_data, "prodigy.jsonl")
``stage_csv_for_prodigy``
--------------------------

View File

@ -1,15 +0,0 @@
[
{
"text": "Title",
"meta": {
"type": "text",
"id": "7e8cd2056da73a7fefb6cd91f4e5d199"
}
},
{
"text": "Narrative",
"meta": {
"id": "ed2e59c337d01185f388a4e9334d6f2e"
}
}
]

View File

@ -0,0 +1,38 @@
import os
import json
import pytest
import unstructured.utils as utils
@pytest.fixture
def input_data():
    """Provide two sample records: one with only text, one with nested metadata."""
    plain_record = {"text": "This is a sentence."}
    record_with_meta = {"text": "This is another sentence.", "meta": {"score": 0.1}}
    return [plain_record, record_with_meta]
@pytest.fixture
def output_jsonl_file(tmp_path):
    """Provide the path of a not-yet-created ``output.jsonl`` inside ``tmp_path``."""
    target_path = os.path.join(tmp_path, "output.jsonl")
    return target_path
@pytest.fixture
def input_jsonl_file(tmp_path, input_data):
    """Write ``input_data`` to ``input.jsonl`` in ``tmp_path`` and return its path.

    Each record is serialized with ``json.dumps`` on its own newline-terminated
    line, independently of the utility functions under test.
    """
    file_path = os.path.join(tmp_path, "input.jsonl")
    with open(file_path, "w+") as handle:
        for record in input_data:
            handle.write(json.dumps(record) + "\n")
    return file_path
def test_save_as_jsonl(input_data, output_jsonl_file):
    """Records written by ``save_as_jsonl`` parse back, line by line, to the input."""
    utils.save_as_jsonl(input_data, output_jsonl_file)

    parsed_records = []
    with open(output_jsonl_file, "r") as written_file:
        for raw_line in written_file:
            parsed_records.append(json.loads(raw_line))

    assert parsed_records == input_data
def test_read_as_jsonl(input_jsonl_file, input_data):
    """``read_from_jsonl`` returns the records stored in the prepared file."""
    loaded_records = utils.read_from_jsonl(input_jsonl_file)
    assert loaded_records == input_data

View File

@ -1 +1 @@
__version__ = "0.2.1-dev3" # pragma: no cover
__version__ = "0.2.1-dev4" # pragma: no cover

13
unstructured/utils.py Normal file
View File

@ -0,0 +1,13 @@
from typing import List, Dict
import json
def save_as_jsonl(data: List[Dict], filename: str) -> None:
    """Write ``data`` to ``filename`` in JSON Lines format.

    Each dictionary is serialized with ``json.dumps`` and written on its own
    newline-terminated line. An existing file at ``filename`` is overwritten.

    Args:
        data: The records to serialize, one per output line.
        filename: Path of the file to create or overwrite.

    Raises:
        TypeError: If a record contains a value ``json.dumps`` cannot serialize.
        OSError: If the file cannot be opened for writing.
    """
    # Pin the encoding so the output does not depend on the platform's locale
    # default; JSON Lines files are UTF-8 by convention. Write-only mode is
    # enough because the file is never read back here.
    with open(filename, "w", encoding="utf-8") as output_file:
        output_file.writelines(json.dumps(datum) + "\n" for datum in data)
def read_from_jsonl(filename: str) -> List[Dict]:
    """Load the records stored in a JSON Lines file.

    Every non-blank line of ``filename`` is parsed with ``json.loads`` and the
    results are returned in file order.

    Args:
        filename: Path of the ``.jsonl`` file to read.

    Returns:
        The parsed value of each non-blank line; an empty list when the file
        has no such lines.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened for reading.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's locale default.
    with open(filename, "r", encoding="utf-8") as input_file:
        # Whitespace-only lines (e.g. a trailing blank line left by an editor)
        # carry no record; skipping them avoids a spurious JSONDecodeError.
        return [json.loads(line) for line in input_file if line.strip()]