mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-24 21:55:33 +00:00
feat: Add staging brick for Datasaur token-based tasks (#50)
* feat: Add staging brick for Datasaur token-based tasks * Added doc string and formatting with flake8,mypy and black * docs: Added documentation for stage_for_datasaur * fix: version sync correction * fix: Corrections to docs for stage_for_datasaur * fix: changes in naming of example variables * Update docs/source/bricks.rst Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
This commit is contained in:
parent
d5bd44bee4
commit
df16b5806b
@ -1,3 +1,7 @@
|
||||
## 0.2.2-dev0
|
||||
|
||||
* Add staging brick for Datasaur
|
||||
|
||||
## 0.2.1
|
||||
|
||||
* Added brick to convert an ISD dictionary to a list of elements
|
||||
|
||||
@ -750,3 +750,18 @@ files to an S3 bucket.
|
||||
|
||||
upload_staged_files()
|
||||
|
||||
``stage_for_datasaur``
|
||||
--------------------------
|
||||
Formats a list of ``Text`` elements as input to token-based tasks in Datasaur.
|
||||
|
||||
Example:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.documents.elements import Text
from unstructured.staging.datasaur import stage_for_datasaur
|
||||
elements = [Text("Text1"),Text("Text2")]
|
||||
datasaur_data = stage_for_datasaur(elements)
|
||||
|
||||
The output is a list of dictionaries, each one with two keys:
|
||||
"text" with the content of the element and
|
||||
"entities" with an empty list.
|
||||
14
test_unstructured/staging/test_datasaur.py
Normal file
14
test_unstructured/staging/test_datasaur.py
Normal file
@ -0,0 +1,14 @@
|
||||
import unstructured.staging.datasaur as datasaur
|
||||
|
||||
from unstructured.documents.elements import Text
|
||||
|
||||
|
||||
def test_stage_for_datasaur():
    """Check that each staged record holds the element text and no entities."""
    expected_texts = ["Text 1", "Text 2", "Text 3"]
    elements = [Text(text) for text in expected_texts]

    result = datasaur.stage_for_datasaur(elements)

    # Index into the result explicitly so a too-short result fails the test
    # instead of being silently skipped.
    for idx, expected in enumerate(expected_texts):
        record = result[idx]
        assert record["text"] == expected
        assert record["entities"] == []
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.2.1" # pragma: no cover
|
||||
__version__ = "0.2.2-dev0" # pragma: no cover
|
||||
|
||||
12
unstructured/staging/datasaur.py
Normal file
12
unstructured/staging/datasaur.py
Normal file
@ -0,0 +1,12 @@
|
||||
from typing import Dict, List, Any
|
||||
from unstructured.documents.elements import Text
|
||||
|
||||
|
||||
def stage_for_datasaur(elements: List[Text]) -> List[Dict[str, Any]]:
    """Convert a list of elements into a list of dictionaries for use in Datasaur.

    Each element produces one dictionary with two keys:

    * ``"text"``: the value of the element's ``text`` attribute.
    * ``"entities"``: an empty list.

    Parameters
    ----------
    elements
        The elements to convert. Only the ``text`` attribute of each is read.

    Returns
    -------
    List[Dict[str, Any]]
        One dictionary per input element, in the same order. An empty input
        yields an empty list.
    """
    # The list literal is evaluated once per element, so every record gets its
    # own "entities" list rather than sharing a single mutable object.
    return [{"text": element.text, "entities": []} for element in elements]
|
||||
Loading…
x
Reference in New Issue
Block a user