feat: add convert_to_dataframe staging brick (#127)

* add pandas to deps; pip-compile

* staging brick to convert elements to dataframe

* bump version

* add convert_to_dataframe docs

* bump wheel version

* typo fix

* typo fix 2!
This commit is contained in:
Matt Robinson 2023-01-04 12:04:59 -05:00 committed by GitHub
parent 445533745c
commit 17045aed80
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 113 additions and 32 deletions

View File

@ -1,4 +1,4 @@
## 0.3.5-dev5
## 0.3.5-dev6
* Add new pattern to recognize plain text dash bullets
* Add test for bullet patterns
@ -8,6 +8,7 @@
* Helper functions for identifying and extracting phone numbers
* Add new function `extract_attachment_info` that extracts and decodes the attachments
of an email.
* Staging brick to convert a list of `Element`s to a `pandas` dataframe.
## 0.3.4

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with python 3.10
# This file is autogenerated by pip-compile with python 3.8
# To update, run:
#
# pip-compile requirements/build.in
@ -22,6 +22,8 @@ idna==3.4
# via requests
imagesize==1.4.1
# via sphinx
importlib-metadata==6.0.0
# via sphinx
jinja2==3.1.2
# via sphinx
markupsafe==2.1.1
@ -58,3 +60,5 @@ sphinxcontrib-serializinghtml==1.1.5
# via sphinx
urllib3==1.26.12
# via requests
zipp==3.11.0
# via importlib-metadata

View File

@ -78,7 +78,7 @@ Examples:
elements = partition_email(text=text)
``extract_attachment_info``
----------------------
----------------------------
The ``extract_attachment_info`` function takes an ``email.message.Message`` object
as input and returns a list of dictionaries containing the attachment information,
@ -610,6 +610,24 @@ Examples:
isd_csv = convert_to_isd_csv(elements)
``convert_to_dataframe``
------------------------
Converts a list of document ``Element`` objects to a ``pandas`` dataframe. The dataframe
will have a ``text`` column with the text from the element and a ``type`` column
indicating the element type, such as ``NarrativeText`` or ``Title``.
Examples:
.. code:: python
from unstructured.documents.elements import Title, NarrativeText
from unstructured.staging.base import convert_to_dataframe
elements = [Title(text="Title"), NarrativeText(text="Narrative")]
df = convert_to_dataframe(elements)
``stage_for_transformers``
--------------------------

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with python 3.10
# This file is autogenerated by pip-compile with python 3.8
# To update, run:
#
# pip-compile --output-file=requirements/base.txt
@ -45,13 +45,13 @@ openpyxl==3.0.10
packaging==22.0
# via argilla
pandas==1.5.2
# via argilla
# via
# argilla
# unstructured (setup.py)
pillow==9.4.0
# via unstructured (setup.py)
pydantic==1.10.2
# via argilla
pyparsing==3.0.9
# via packaging
python-dateutil==2.8.2
# via pandas
python-docx==0.8.11

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with python 3.10
# This file is autogenerated by pip-compile with python 3.8
# To update, run:
#
# pip-compile requirements/build.in
@ -22,6 +22,8 @@ idna==3.4
# via requests
imagesize==1.4.1
# via sphinx
importlib-metadata==6.0.0
# via sphinx
jinja2==3.1.2
# via sphinx
markupsafe==2.1.1
@ -58,3 +60,5 @@ sphinxcontrib-serializinghtml==1.1.5
# via sphinx
urllib3==1.26.12
# via requests
zipp==3.11.0
# via importlib-metadata

View File

@ -4,3 +4,4 @@ pip-tools
# NOTE(robinson) - Required pins for security scans
jupyter-core>=4.11.2
wheel>=0.38.1

View File

@ -1,9 +1,13 @@
#
# This file is autogenerated by pip-compile with python 3.10
# This file is autogenerated by pip-compile with python 3.8
# To update, run:
#
# pip-compile requirements/dev.in
#
appnope==0.1.3
# via
# ipykernel
# ipython
argon2-cffi==21.3.0
# via notebook
argon2-cffi-bindings==21.2.0
@ -36,6 +40,10 @@ executing==1.0.0
# via stack-data
fastjsonschema==2.16.2
# via nbformat
importlib-metadata==6.0.0
# via nbconvert
importlib-resources==5.10.2
# via jsonschema
ipykernel==6.15.3
# via
# ipywidgets
@ -45,7 +53,7 @@ ipykernel==6.15.3
# qtconsole
ipython==8.6.0
# via
# -r dev.in
# -r requirements/dev.in
# ipykernel
# ipywidgets
# jupyter-console
@ -64,7 +72,7 @@ jinja2==3.1.2
jsonschema==4.16.0
# via nbformat
jupyter==1.0.0
# via -r dev.in
# via -r requirements/dev.in
jupyter-client==7.3.5
# via
# ipykernel
@ -76,7 +84,7 @@ jupyter-console==6.4.4
# via jupyter
jupyter-core==5.1.0
# via
# -r dev.in
# -r requirements/dev.in
# jupyter-client
# nbconvert
# nbformat
@ -134,7 +142,9 @@ pexpect==4.8.0
pickleshare==0.7.5
# via ipython
pip-tools==6.12.1
# via -r dev.in
# via -r requirements/dev.in
pkgutil-resolve-name==1.3.10
# via jsonschema
platformdirs==2.5.4
# via jupyter-core
prometheus-client==0.14.1
@ -190,6 +200,10 @@ terminado==0.15.0
# via notebook
tinycss2==1.1.1
# via nbconvert
tomli==2.0.1
# via
# build
# pep517
tornado==6.2
# via
# ipykernel
@ -215,10 +229,16 @@ webencodings==0.5.1
# via
# bleach
# tinycss2
wheel==0.37.1
# via pip-tools
wheel==0.38.4
# via
# -r requirements/dev.in
# pip-tools
widgetsnbextension==4.0.3
# via ipywidgets
zipp==3.11.0
# via
# importlib-metadata
# importlib-resources
# The following packages are considered to be unsafe in a requirements file:
# pip

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with python 3.10
# This file is autogenerated by pip-compile with python 3.8
# To update, run:
#
# pip-compile --extra=huggingface --output-file=requirements/huggingface.txt
@ -66,13 +66,13 @@ packaging==22.0
# huggingface-hub
# transformers
pandas==1.5.2
# via argilla
# via
# argilla
# unstructured (setup.py)
pillow==9.4.0
# via unstructured (setup.py)
pydantic==1.10.2
# via argilla
pyparsing==3.0.9
# via packaging
python-dateutil==2.8.2
# via pandas
python-docx==0.8.11

View File

@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with python 3.10
# This file is autogenerated by pip-compile with python 3.8
# To update, run:
#
# pip-compile requirements/test.in
@ -7,23 +7,23 @@
attrs==22.1.0
# via pytest
black==22.10.0
# via -r test.in
# via -r requirements/test.in
certifi==2022.12.7
# via
# -r test.in
# -r requirements/test.in
# requests
charset-normalizer==2.1.1
# via requests
click==8.1.3
# via
# -r test.in
# -r requirements/test.in
# black
coverage[toml]==6.4.4
# via
# -r test.in
# -r requirements/test.in
# pytest-cov
flake8==5.0.4
# via -r test.in
# via -r requirements/test.in
idna==3.4
# via
# requests
@ -31,7 +31,7 @@ idna==3.4
iniconfig==1.1.1
# via pytest
label-studio-sdk==0.0.15
# via -r test.in
# via -r requirements/test.in
lxml==4.9.1
# via label-studio-sdk
mccabe==0.7.0
@ -39,7 +39,7 @@ mccabe==0.7.0
multidict==6.0.2
# via yarl
mypy==0.991
# via -r test.in
# via -r requirements/test.in
mypy-extensions==0.4.3
# via
# black
@ -65,7 +65,7 @@ pyparsing==3.0.9
pytest==7.1.3
# via pytest-cov
pytest-cov==4.0.0
# via -r test.in
# via -r requirements/test.in
pyyaml==6.0
# via vcrpy
requests==2.28.1
@ -73,15 +73,20 @@ requests==2.28.1
six==1.16.0
# via vcrpy
tomli==2.0.1
# via pytest
# via
# black
# coverage
# mypy
# pytest
typing-extensions==4.3.0
# via
# black
# mypy
# pydantic
urllib3==1.26.12
# via requests
vcrpy==4.2.1
# via -r test.in
# via -r requirements/test.in
wrapt==1.14.1
# via vcrpy
yarl==1.8.1

View File

@ -52,6 +52,7 @@ setup(
"lxml",
"nltk",
"openpyxl",
"pandas",
"pillow",
"python-docx",
# NOTE(robinson) - The following dependencies are pinned

View File

@ -1,6 +1,8 @@
import csv
import os
import pytest
import csv
import pandas as pd
import unstructured.staging.base as base
@ -51,3 +53,15 @@ def test_convert_to_isd_csv(output_csv_file):
with open(output_csv_file, "r") as csv_file:
csv_rows = csv.DictReader(csv_file)
assert all(set(row.keys()) == set(fieldnames) for row in csv_rows)
def test_convert_to_dataframe():
    """Check that a Title and a NarrativeText element become a two-row dataframe
    with matching ``type`` and ``text`` columns."""
    elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
    df = base.convert_to_dataframe(elements)
    expected_df = pd.DataFrame(
        {
            "type": ["Title", "NarrativeText"],
            "text": ["Title 1", "Narrative 1"],
        }
    )
    # assert_frame_equal reports which column/value/dtype differs on failure,
    # whereas a bare ``df.equals(...)`` assertion only reports ``False``.
    pd.testing.assert_frame_equal(df, expected_df)

View File

@ -1 +1 @@
__version__ = "0.3.5-dev5" # pragma: no cover
__version__ = "0.3.5-dev6" # pragma: no cover

View File

@ -2,6 +2,8 @@ import io
import csv
from typing import Dict, List
import pandas as pd
from unstructured.documents.elements import Text, NarrativeText, Title, ListItem
@ -43,3 +45,14 @@ def convert_to_isd_csv(elements: List[Text]) -> str:
csv_writer.writeheader()
csv_writer.writerows(rows)
return buffer.getvalue()
def convert_to_dataframe(elements: List[Text]) -> pd.DataFrame:
    """Converts document elements to a pandas DataFrame. The dataframe contains the
    following columns:
        text: the element text
        type: the text type (NarrativeText, Title, etc)

    The elements are serialized with ``convert_to_isd_csv`` and the resulting CSV
    string is parsed by pandas. Every column is read as a string so that the
    element text comes back exactly as it was written.
    """
    csv_string = convert_to_isd_csv(elements)
    csv_string_io = io.StringIO(csv_string)
    # Disable pandas' type and NA inference: with the defaults, text such as
    # "NA", "null" or "" is turned into NaN and numeric-looking text such as
    # "007" is turned into the integer 7, silently altering the document content.
    return pd.read_csv(csv_string_io, sep=",", dtype=str, keep_default_na=False)