mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-24 21:55:33 +00:00
feat: stage_for_baseplate function (#546)
* added a staging brick for baseplate * added a test for baseplate * update documentation * version and changelog
This commit is contained in:
parent
aa01cdfc7a
commit
981805e435
@ -1,19 +1,18 @@
|
||||
## 0.6.3-dev1
|
||||
## 0.6.3-dev2
|
||||
|
||||
### Enhancements
|
||||
|
||||
|
||||
### Features
|
||||
|
||||
* Added `partition_multiple_via_api` for partitioning multiple documents in a single REST
|
||||
API call.
|
||||
* Added `stage_for_baseplate` function to prepare outputs for ingestion into Baseplate.
|
||||
|
||||
### Fixes
|
||||
|
||||
* Updates the grouping logic in the `partition_pdf` fast strategy to group together text
|
||||
in the same bounding box.
|
||||
|
||||
|
||||
## 0.6.2
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -1435,6 +1435,61 @@ See the `LabelStudio docs <https://labelstud.io/tags/labels.html>`_ for a full l
|
||||
for labels and annotations.
|
||||
|
||||
|
||||
``stage_for_baseplate``
|
||||
-----------------------
|
||||
|
||||
The ``stage_for_baseplate`` staging function prepares a list of ``Element`` objects for ingestion
|
||||
into `Baseplate <https://docs.baseplate.ai/introduction>`_, an LLM backend with a spreadsheet interface.
|
||||
After running the ``stage_for_baseplate`` function, you can use the
|
||||
`Baseplate API <https://docs.baseplate.ai/api-reference/documents/upsert-data-rows>`_ to upload the documents
|
||||
to Baseplate. The following example code shows how to use the ``stage_for_baseplate`` function.
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.documents.elements import ElementMetadata, NarrativeText, Title
|
||||
from unstructured.staging.baseplate import stage_for_baseplate
|
||||
|
||||
metadata = ElementMetadata(filename="fox.epub")
|
||||
|
||||
elements = [
|
||||
Title("A Wonderful Story About A Fox", metadata=metadata),
|
||||
NarrativeText(
|
||||
"A fox ran into the chicken coop and the chickens flew off!",
|
||||
metadata=metadata,
|
||||
),
|
||||
]
|
||||
|
||||
rows = stage_for_baseplate(elements)
|
||||
|
||||
The output will look like:
|
||||
|
||||
.. code:: python
|
||||
|
||||
{
|
||||
"rows": [
|
||||
{
|
||||
"data": {
|
||||
"element_id": "ad270eefd1cc68d15f4d3e51666d4dc8",
|
||||
"coordinates": None,
|
||||
"text": "A Wonderful Story About A Fox",
|
||||
"type": "Title",
|
||||
},
|
||||
"metadata": {"filename": "fox.epub"},
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"element_id": "8275769fdd1804f9f2b55ad3c9b0ef1b",
|
||||
"coordinates": None,
|
||||
"text": "A fox ran into the chicken coop and the chickens flew off!",
|
||||
"type": "NarrativeText",
|
||||
},
|
||||
"metadata": {"filename": "fox.epub"},
|
||||
},
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
|
||||
``stage_for_prodigy``
|
||||
--------------------------
|
||||
|
||||
|
||||
@ -11,6 +11,15 @@ which take a list of ``Element`` objects as input and return formatted dictionar
|
||||
You can convert a list of ``Text`` elements to an `Argilla <https://www.argilla.io/>`_ ``Dataset`` using the `stage_for_argilla <https://unstructured-io.github.io/unstructured/bricks.html#stage-for-argilla>`_ staging brick. Specify the type of dataset to be generated using the ``argilla_task`` parameter. Valid values are ``"text_classification"``, ``"token_classification"``, and ``"text2text"``. Follow the link for more details on usage.
|
||||
|
||||
|
||||
``Integration with Baseplate``
|
||||
-------------------------------
|
||||
`Baseplate <https://docs.baseplate.ai/introduction>`_ is a backend optimized for use with LLMs that has an easy-to-use spreadsheet
|
||||
interface. The ``unstructured`` library offers a staging brick to convert a list of ``Element`` objects into the
|
||||
`rows format <https://docs.baseplate.ai/api-reference/documents/overview>`_ required by the Baseplate API. See the
|
||||
`stage_for_baseplate <https://unstructured-io.github.io/unstructured/bricks.html#stage-for-baseplate>`_ documentation for
|
||||
information on how to stage elements for ingestion into Baseplate.
|
||||
|
||||
|
||||
``Integration with Datasaur``
|
||||
------------------------------
|
||||
You can format a list of ``Text`` elements as input to token based tasks in `Datasaur <https://datasaur.ai/>`_ using the `stage_for_datasaur <https://unstructured-io.github.io/unstructured/bricks.html#stage-for-datasaur>`_ staging brick. You will obtain a list of dictionaries indexed by the keys ``"text"`` with the content of the element, and ``"entities"`` with an empty list. Follow the link to learn how to customise your entities and for more details on usage.
|
||||
|
||||
37
test_unstructured/staging/test_baseplate.py
Normal file
37
test_unstructured/staging/test_baseplate.py
Normal file
@ -0,0 +1,37 @@
|
||||
from unstructured.documents.elements import ElementMetadata, NarrativeText, Title
|
||||
from unstructured.staging.baseplate import stage_for_baseplate
|
||||
|
||||
|
||||
def test_stage_for_baseplate():
    """Checks that elements are split into Baseplate rows with separate data and metadata."""
    # Both elements share one metadata object, so each row should carry the same filename.
    metadata = ElementMetadata(filename="fox.epub")
    elements = [
        Title("A Wonderful Story About A Fox", metadata=metadata),
        NarrativeText(
            "A fox ran into the chicken coop and the chickens flew off!",
            metadata=metadata,
        ),
    ]

    rows = stage_for_baseplate(elements)
    assert rows == {
        "rows": [
            {
                "data": {
                    "element_id": "ad270eefd1cc68d15f4d3e51666d4dc8",
                    "coordinates": None,
                    "text": "A Wonderful Story About A Fox",
                    "type": "Title",
                },
                "metadata": {"filename": "fox.epub"},
            },
            {
                "data": {
                    "element_id": "8275769fdd1804f9f2b55ad3c9b0ef1b",
                    "coordinates": None,
                    "text": "A fox ran into the chicken coop and the chickens flew off!",
                    "type": "NarrativeText",
                },
                "metadata": {"filename": "fox.epub"},
            },
        ],
    }
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.6.3-dev1" # pragma: no cover
|
||||
__version__ = "0.6.3-dev2" # pragma: no cover
|
||||
|
||||
45
unstructured/staging/baseplate.py
Normal file
45
unstructured/staging/baseplate.py
Normal file
@ -0,0 +1,45 @@
|
||||
from typing import Dict, List, TypedDict
|
||||
|
||||
from unstructured.documents.elements import Text
|
||||
|
||||
|
||||
class BaseplateRow(TypedDict):
    """Typed dictionary for an individual Baseplate row. Baseplate docs show what the JSON
    representation should look like:
    https://docs.baseplate.ai/api-reference/documents/overview
    """

    # The element's own fields; stage_for_baseplate fills this with what remains of the
    # element's dictionary once "metadata" has been removed.
    # NOTE(review): the expected output in the accompanying test has "coordinates": None under
    # "data", so values are not always str -- confirm whether Optional[str] is intended.
    data: Dict[str, str]
    # The element's "metadata" entry, kept apart from the row data.
    metadata: Dict[str, str]
|
||||
|
||||
|
||||
class BaseplateRows(TypedDict):
    """Typed dictionary for multiple Baseplate rows. Baseplate docs show what the JSON
    representation should look like. This is the JSON that is submitted to the Baseplate
    API to upload data.
    https://docs.baseplate.ai/api-reference/documents/overview
    """

    # One entry per staged element, in the order the elements were supplied.
    rows: List[BaseplateRow]
|
||||
|
||||
|
||||
def stage_for_baseplate(elements: List[Text]) -> BaseplateRows:
    """Converts a list of unstructured elements into a dictionary of rows that can be uploaded
    into Baseplate via the API.

    Parameters
    ----------
    elements
        The elements to stage. Each one is serialized by calling its ``to_dict`` method, and
        the resulting dictionary must contain a ``"metadata"`` key.

    Returns
    -------
    A dictionary with a single ``"rows"`` key holding one row per element, in input order.
    Each row has a ``"data"`` entry (the element's dictionary without its metadata) and a
    ``"metadata"`` entry.

    References
    ----------
    https://docs.baseplate.ai/api-reference/documents/overview
    https://docs.baseplate.ai/api-reference/documents/upsert-data-rows
    """
    rows: List[BaseplateRow] = []
    for element in elements:
        element_dict = element.to_dict()
        # Rows keep metadata apart from the data, so it is lifted out of the serialized
        # element; whatever remains in the dictionary becomes the row's "data".
        metadata = element_dict.pop("metadata")
        row: BaseplateRow = {
            "data": element_dict,
            "metadata": metadata,
        }
        rows.append(row)

    return {"rows": rows}
|
||||
Loading…
x
Reference in New Issue
Block a user