mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-12 15:42:19 +00:00
feat: add convert_to_dataframe staging brick (#127)
* add pandas to deps; pip-compile * staging brick to convert elements to dataframe * bump version * add convert_to_dataframe docs * bump wheel version * typo fix * typo fix 2!
This commit is contained in:
parent
445533745c
commit
17045aed80
@ -1,4 +1,4 @@
|
||||
## 0.3.5-dev5
|
||||
## 0.3.5-dev6
|
||||
|
||||
* Add new pattern to recognize plain text dash bullets
|
||||
* Add test for bullet patterns
|
||||
@ -8,6 +8,7 @@
|
||||
* Helper functions for identifying and extracting phone numbers
|
||||
* Add new function `extract_attachment_info` that extracts and decodes the attachment
|
||||
of an email.
|
||||
* Staging brick to convert a list of `Element`s to a `pandas` dataframe.
|
||||
|
||||
## 0.3.4
|
||||
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with python 3.10
|
||||
# This file is autogenerated by pip-compile with python 3.8
|
||||
# To update, run:
|
||||
#
|
||||
# pip-compile requirements/build.in
|
||||
@ -22,6 +22,8 @@ idna==3.4
|
||||
# via requests
|
||||
imagesize==1.4.1
|
||||
# via sphinx
|
||||
importlib-metadata==6.0.0
|
||||
# via sphinx
|
||||
jinja2==3.1.2
|
||||
# via sphinx
|
||||
markupsafe==2.1.1
|
||||
@ -58,3 +60,5 @@ sphinxcontrib-serializinghtml==1.1.5
|
||||
# via sphinx
|
||||
urllib3==1.26.12
|
||||
# via requests
|
||||
zipp==3.11.0
|
||||
# via importlib-metadata
|
||||
|
||||
@ -78,7 +78,7 @@ Examples:
|
||||
elements = partition_email(text=text)
|
||||
|
||||
``extract_attachment_info``
|
||||
----------------------
|
||||
----------------------------
|
||||
|
||||
The ``extract_attachment_info`` function takes an ``email.message.Message`` object
|
||||
as input and returns a list of dictionaries containing the attachment information,
|
||||
@ -610,6 +610,24 @@ Examples:
|
||||
isd_csv = convert_to_isd_csv(elements)
|
||||
|
||||
|
||||
``convert_to_dataframe``
|
||||
------------------------
|
||||
|
||||
Converts a list of document ``Element`` objects to a ``pandas`` dataframe. The dataframe
|
||||
will have a ``text`` column with the text from the element and a ``type`` column
|
||||
indicating the element type, such as ``NarrativeText`` or ``Title``.
|
||||
|
||||
Examples:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.documents.elements import Title, NarrativeText
|
||||
from unstructured.staging.base import convert_to_dataframe
|
||||
|
||||
elements = [Title(text="Title"), NarrativeText(text="Narrative")]
|
||||
df = convert_to_dataframe(elements)
|
||||
|
||||
|
||||
``stage_for_transformers``
|
||||
--------------------------
|
||||
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with python 3.10
|
||||
# This file is autogenerated by pip-compile with python 3.8
|
||||
# To update, run:
|
||||
#
|
||||
# pip-compile --output-file=requirements/base.txt
|
||||
@ -45,13 +45,13 @@ openpyxl==3.0.10
|
||||
packaging==22.0
|
||||
# via argilla
|
||||
pandas==1.5.2
|
||||
# via argilla
|
||||
# via
|
||||
# argilla
|
||||
# unstructured (setup.py)
|
||||
pillow==9.4.0
|
||||
# via unstructured (setup.py)
|
||||
pydantic==1.10.2
|
||||
# via argilla
|
||||
pyparsing==3.0.9
|
||||
# via packaging
|
||||
python-dateutil==2.8.2
|
||||
# via pandas
|
||||
python-docx==0.8.11
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with python 3.10
|
||||
# This file is autogenerated by pip-compile with python 3.8
|
||||
# To update, run:
|
||||
#
|
||||
# pip-compile requirements/build.in
|
||||
@ -22,6 +22,8 @@ idna==3.4
|
||||
# via requests
|
||||
imagesize==1.4.1
|
||||
# via sphinx
|
||||
importlib-metadata==6.0.0
|
||||
# via sphinx
|
||||
jinja2==3.1.2
|
||||
# via sphinx
|
||||
markupsafe==2.1.1
|
||||
@ -58,3 +60,5 @@ sphinxcontrib-serializinghtml==1.1.5
|
||||
# via sphinx
|
||||
urllib3==1.26.12
|
||||
# via requests
|
||||
zipp==3.11.0
|
||||
# via importlib-metadata
|
||||
|
||||
@ -4,3 +4,4 @@ pip-tools
|
||||
|
||||
# NOTE(robinson) - Required pins for security scans
|
||||
jupyter-core>=4.11.2
|
||||
wheel>=0.38.1
|
||||
|
||||
@ -1,9 +1,13 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with python 3.10
|
||||
# This file is autogenerated by pip-compile with python 3.8
|
||||
# To update, run:
|
||||
#
|
||||
# pip-compile requirements/dev.in
|
||||
#
|
||||
appnope==0.1.3
|
||||
# via
|
||||
# ipykernel
|
||||
# ipython
|
||||
argon2-cffi==21.3.0
|
||||
# via notebook
|
||||
argon2-cffi-bindings==21.2.0
|
||||
@ -36,6 +40,10 @@ executing==1.0.0
|
||||
# via stack-data
|
||||
fastjsonschema==2.16.2
|
||||
# via nbformat
|
||||
importlib-metadata==6.0.0
|
||||
# via nbconvert
|
||||
importlib-resources==5.10.2
|
||||
# via jsonschema
|
||||
ipykernel==6.15.3
|
||||
# via
|
||||
# ipywidgets
|
||||
@ -45,7 +53,7 @@ ipykernel==6.15.3
|
||||
# qtconsole
|
||||
ipython==8.6.0
|
||||
# via
|
||||
# -r dev.in
|
||||
# -r requirements/dev.in
|
||||
# ipykernel
|
||||
# ipywidgets
|
||||
# jupyter-console
|
||||
@ -64,7 +72,7 @@ jinja2==3.1.2
|
||||
jsonschema==4.16.0
|
||||
# via nbformat
|
||||
jupyter==1.0.0
|
||||
# via -r dev.in
|
||||
# via -r requirements/dev.in
|
||||
jupyter-client==7.3.5
|
||||
# via
|
||||
# ipykernel
|
||||
@ -76,7 +84,7 @@ jupyter-console==6.4.4
|
||||
# via jupyter
|
||||
jupyter-core==5.1.0
|
||||
# via
|
||||
# -r dev.in
|
||||
# -r requirements/dev.in
|
||||
# jupyter-client
|
||||
# nbconvert
|
||||
# nbformat
|
||||
@ -134,7 +142,9 @@ pexpect==4.8.0
|
||||
pickleshare==0.7.5
|
||||
# via ipython
|
||||
pip-tools==6.12.1
|
||||
# via -r dev.in
|
||||
# via -r requirements/dev.in
|
||||
pkgutil-resolve-name==1.3.10
|
||||
# via jsonschema
|
||||
platformdirs==2.5.4
|
||||
# via jupyter-core
|
||||
prometheus-client==0.14.1
|
||||
@ -190,6 +200,10 @@ terminado==0.15.0
|
||||
# via notebook
|
||||
tinycss2==1.1.1
|
||||
# via nbconvert
|
||||
tomli==2.0.1
|
||||
# via
|
||||
# build
|
||||
# pep517
|
||||
tornado==6.2
|
||||
# via
|
||||
# ipykernel
|
||||
@ -215,10 +229,16 @@ webencodings==0.5.1
|
||||
# via
|
||||
# bleach
|
||||
# tinycss2
|
||||
wheel==0.37.1
|
||||
# via pip-tools
|
||||
wheel==0.38.4
|
||||
# via
|
||||
# -r requirements/dev.in
|
||||
# pip-tools
|
||||
widgetsnbextension==4.0.3
|
||||
# via ipywidgets
|
||||
zipp==3.11.0
|
||||
# via
|
||||
# importlib-metadata
|
||||
# importlib-resources
|
||||
|
||||
# The following packages are considered to be unsafe in a requirements file:
|
||||
# pip
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with python 3.10
|
||||
# This file is autogenerated by pip-compile with python 3.8
|
||||
# To update, run:
|
||||
#
|
||||
# pip-compile --extra=huggingface --output-file=requirements/huggingface.txt
|
||||
@ -66,13 +66,13 @@ packaging==22.0
|
||||
# huggingface-hub
|
||||
# transformers
|
||||
pandas==1.5.2
|
||||
# via argilla
|
||||
# via
|
||||
# argilla
|
||||
# unstructured (setup.py)
|
||||
pillow==9.4.0
|
||||
# via unstructured (setup.py)
|
||||
pydantic==1.10.2
|
||||
# via argilla
|
||||
pyparsing==3.0.9
|
||||
# via packaging
|
||||
python-dateutil==2.8.2
|
||||
# via pandas
|
||||
python-docx==0.8.11
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with python 3.10
|
||||
# This file is autogenerated by pip-compile with python 3.8
|
||||
# To update, run:
|
||||
#
|
||||
# pip-compile requirements/test.in
|
||||
@ -7,23 +7,23 @@
|
||||
attrs==22.1.0
|
||||
# via pytest
|
||||
black==22.10.0
|
||||
# via -r test.in
|
||||
# via -r requirements/test.in
|
||||
certifi==2022.12.7
|
||||
# via
|
||||
# -r test.in
|
||||
# -r requirements/test.in
|
||||
# requests
|
||||
charset-normalizer==2.1.1
|
||||
# via requests
|
||||
click==8.1.3
|
||||
# via
|
||||
# -r test.in
|
||||
# -r requirements/test.in
|
||||
# black
|
||||
coverage[toml]==6.4.4
|
||||
# via
|
||||
# -r test.in
|
||||
# -r requirements/test.in
|
||||
# pytest-cov
|
||||
flake8==5.0.4
|
||||
# via -r test.in
|
||||
# via -r requirements/test.in
|
||||
idna==3.4
|
||||
# via
|
||||
# requests
|
||||
@ -31,7 +31,7 @@ idna==3.4
|
||||
iniconfig==1.1.1
|
||||
# via pytest
|
||||
label-studio-sdk==0.0.15
|
||||
# via -r test.in
|
||||
# via -r requirements/test.in
|
||||
lxml==4.9.1
|
||||
# via label-studio-sdk
|
||||
mccabe==0.7.0
|
||||
@ -39,7 +39,7 @@ mccabe==0.7.0
|
||||
multidict==6.0.2
|
||||
# via yarl
|
||||
mypy==0.991
|
||||
# via -r test.in
|
||||
# via -r requirements/test.in
|
||||
mypy-extensions==0.4.3
|
||||
# via
|
||||
# black
|
||||
@ -65,7 +65,7 @@ pyparsing==3.0.9
|
||||
pytest==7.1.3
|
||||
# via pytest-cov
|
||||
pytest-cov==4.0.0
|
||||
# via -r test.in
|
||||
# via -r requirements/test.in
|
||||
pyyaml==6.0
|
||||
# via vcrpy
|
||||
requests==2.28.1
|
||||
@ -73,15 +73,20 @@ requests==2.28.1
|
||||
six==1.16.0
|
||||
# via vcrpy
|
||||
tomli==2.0.1
|
||||
# via pytest
|
||||
# via
|
||||
# black
|
||||
# coverage
|
||||
# mypy
|
||||
# pytest
|
||||
typing-extensions==4.3.0
|
||||
# via
|
||||
# black
|
||||
# mypy
|
||||
# pydantic
|
||||
urllib3==1.26.12
|
||||
# via requests
|
||||
vcrpy==4.2.1
|
||||
# via -r test.in
|
||||
# via -r requirements/test.in
|
||||
wrapt==1.14.1
|
||||
# via vcrpy
|
||||
yarl==1.8.1
|
||||
|
||||
1
setup.py
1
setup.py
@ -52,6 +52,7 @@ setup(
|
||||
"lxml",
|
||||
"nltk",
|
||||
"openpyxl",
|
||||
"pandas",
|
||||
"pillow",
|
||||
"python-docx",
|
||||
# NOTE(robinson) - The following dependencies are pinned
|
||||
|
||||
@ -1,6 +1,8 @@
|
||||
import csv
|
||||
import os
|
||||
import pytest
|
||||
import csv
|
||||
|
||||
import pandas as pd
|
||||
|
||||
import unstructured.staging.base as base
|
||||
|
||||
@ -51,3 +53,15 @@ def test_convert_to_isd_csv(output_csv_file):
|
||||
with open(output_csv_file, "r") as csv_file:
|
||||
csv_rows = csv.DictReader(csv_file)
|
||||
assert all(set(row.keys()) == set(fieldnames) for row in csv_rows)
|
||||
|
||||
|
||||
def test_convert_to_dataframe():
    """Check that two elements convert to a dataframe with matching type/text rows."""
    title = Title(text="Title 1")
    narrative = NarrativeText(text="Narrative 1")

    # Expected frame: one row per element, with the element type and its text.
    expected_columns = {
        "type": ["Title", "NarrativeText"],
        "text": ["Title 1", "Narrative 1"],
    }
    expected_df = pd.DataFrame(expected_columns)

    df = base.convert_to_dataframe([title, narrative])
    assert df.equals(expected_df) is True
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.3.5-dev5" # pragma: no cover
|
||||
__version__ = "0.3.5-dev6" # pragma: no cover
|
||||
|
||||
@ -2,6 +2,8 @@ import io
|
||||
import csv
|
||||
from typing import Dict, List
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from unstructured.documents.elements import Text, NarrativeText, Title, ListItem
|
||||
|
||||
|
||||
@ -43,3 +45,14 @@ def convert_to_isd_csv(elements: List[Text]) -> str:
|
||||
csv_writer.writeheader()
|
||||
csv_writer.writerows(rows)
|
||||
return buffer.getvalue()
|
||||
|
||||
|
||||
def convert_to_dataframe(elements: List[Text]) -> pd.DataFrame:
    """Converts document elements to a pandas DataFrame. The dataframe contains the
    following columns:
        text: the element text
        type: the text type (NarrativeText, Title, etc)

    The elements are first serialized with ``convert_to_isd_csv`` and the resulting
    CSV string is then parsed by pandas. All values are read back as strings so the
    element text is returned as written rather than being re-interpreted by pandas.
    """
    csv_string = convert_to_isd_csv(elements)
    csv_string_io = io.StringIO(csv_string)
    # Without dtype=str pandas infers column types, so text such as "123" would come
    # back as an integer. Without keep_default_na=False, text such as "NA", "null" or
    # an empty string would be replaced by NaN.
    return pd.read_csv(csv_string_io, sep=",", dtype=str, keep_default_na=False)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user