diff --git a/CHANGELOG.md b/CHANGELOG.md index ea0ecc4c3..3dae75e83 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ -## 0.2.1-dev3 +## 0.2.1-dev4 +* Added utility function for JSONL reading and writing * Added staging brick for CSV format for Prodigy * Added staging brick for Prodigy * Added ability to upload LabelStudio annotations diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst index a4678d7ce..74e3e8279 100644 --- a/docs/source/bricks.rst +++ b/docs/source/bricks.rst @@ -477,6 +477,24 @@ Examples: json.dump(prodigy_data, f, indent=4) +**Note**: Prodigy recommends ``.jsonl`` format for feeding data to API loaders. After running ``stage_for_prodigy``, you can +use the ``save_as_jsonl`` utility function to save the formatted data to a ``.jsonl`` file that is ready to be used with Prodigy. + +.. code:: python + + from unstructured.documents.elements import Title, NarrativeText + from unstructured.staging.prodigy import stage_for_prodigy + from unstructured.utils import save_as_jsonl + + elements = [Title(text="Title"), NarrativeText(text="Narrative")] + metadata = [{"type": "title"}, {"type": "text"}] + prodigy_data = stage_for_prodigy(elements, metadata) + + # The resulting jsonl file is ready to be used with Prodigy. 
+ save_as_jsonl(prodigy_data, "prodigy.jsonl") + + + ``stage_csv_for_prodigy`` -------------------------- diff --git a/prodigy.json b/prodigy.json deleted file mode 100644 index e5edc5277..000000000 --- a/prodigy.json +++ /dev/null @@ -1,15 +0,0 @@ -[ - { - "text": "Title", - "meta": { - "type": "text", - "id": "7e8cd2056da73a7fefb6cd91f4e5d199" - } - }, - { - "text": "Narrative", - "meta": { - "id": "ed2e59c337d01185f388a4e9334d6f2e" - } - } -] \ No newline at end of file diff --git a/test_unstructured/test_utils.py b/test_unstructured/test_utils.py new file mode 100644 index 000000000..d9d6d836d --- /dev/null +++ b/test_unstructured/test_utils.py @@ -0,0 +1,38 @@ +import os +import json +import pytest + +import unstructured.utils as utils + + +@pytest.fixture +def input_data(): + return [ + {"text": "This is a sentence."}, + {"text": "This is another sentence.", "meta": {"score": 0.1}}, + ] + + +@pytest.fixture +def output_jsonl_file(tmp_path): + return os.path.join(tmp_path, "output.jsonl") + + +@pytest.fixture +def input_jsonl_file(tmp_path, input_data): + file_path = os.path.join(tmp_path, "input.jsonl") + with open(file_path, "w+") as input_file: + input_file.writelines([json.dumps(obj) + "\n" for obj in input_data]) + return file_path + + +def test_save_as_jsonl(input_data, output_jsonl_file): + utils.save_as_jsonl(input_data, output_jsonl_file) + with open(output_jsonl_file, "r") as output_file: + file_data = [json.loads(line) for line in output_file] + assert file_data == input_data + + +def test_read_as_jsonl(input_jsonl_file, input_data): + file_data = utils.read_from_jsonl(input_jsonl_file) + assert file_data == input_data diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 3cf7ddb67..be1d55cde 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.2.1-dev3" # pragma: no cover +__version__ = "0.2.1-dev4" # pragma: no cover diff --git a/unstructured/utils.py 
"""Helpers for writing and reading JSON Lines (``.jsonl``) files."""
import json
from typing import Dict, List


def save_as_jsonl(data: List[Dict], filename: str) -> None:
    """Write *data* to *filename* as JSON Lines, one JSON document per line.

    Each item is serialized with ``json.dumps`` and terminated with a
    newline. An existing file at *filename* is overwritten.

    Args:
        data: The items to serialize, in output order.
        filename: Path of the file to create or overwrite.

    Raises:
        TypeError: If an item is not JSON serializable.
        OSError: If the file cannot be opened for writing.
    """
    # Pin the encoding so the file does not depend on the platform locale.
    # Plain "w" is enough: the file is never read back through this handle.
    with open(filename, "w", encoding="utf-8") as output_file:
        output_file.writelines(json.dumps(datum) + "\n" for datum in data)


def read_from_jsonl(filename: str) -> List[Dict]:
    """Read a JSON Lines file and return its parsed documents in file order.

    Blank and whitespace-only lines (for example a trailing empty line) are
    skipped instead of being handed to the JSON parser.

    Args:
        filename: Path of the ``.jsonl`` file to read.

    Returns:
        One parsed object per non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened for reading.
    """
    # Decode explicitly as UTF-8 so non-ASCII text is read the same way on
    # every platform, whatever the locale default is.
    with open(filename, "r", encoding="utf-8") as input_file:
        return [json.loads(line) for line in input_file if line.strip()]