feat: add convert_to_dataframe staging brick (#127)

* add pandas to deps; pip-compile * staging brick to convert elements to dataframe * bump version * add convert_to_dataframe docs * bump wheel version * typo fix * typo fix 2!
2025-12-24 13:44:05 +00:00 · 2023-01-04 12:04:59 -05:00 · 2023-01-04 12:04:59 -05:00 · 17045aed80
commit 17045aed80
parent 445533745c
13 changed files with 113 additions and 32 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.3.5-dev5
+## 0.3.5-dev6

 * Add new pattern to recognize plain text dash bullets
 * Add test for bullet patterns
@ -8,6 +8,7 @@
 * Helper functions for identifying and extracting phone numbers
 * Add new function `extract_attachment_info` that extracts and decodes the attachment
 of an email.
+* Staging brick to convert a list of `Element`s to a `pandas` dataframe.

 ## 0.3.4

--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@ -1,5 +1,5 @@
 #
-# This file is autogenerated by pip-compile with python 3.10
+# This file is autogenerated by pip-compile with python 3.8
 # To update, run:
 #
 #    pip-compile requirements/build.in
@ -22,6 +22,8 @@ idna==3.4
    # via requests
 imagesize==1.4.1
    # via sphinx
+importlib-metadata==6.0.0
+    # via sphinx
 jinja2==3.1.2
    # via sphinx
 markupsafe==2.1.1
@ -58,3 +60,5 @@ sphinxcontrib-serializinghtml==1.1.5
    # via sphinx
 urllib3==1.26.12
    # via requests
+zipp==3.11.0
+    # via importlib-metadata
--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@ -78,7 +78,7 @@ Examples:
  elements = partition_email(text=text)

 ``extract_attachment_info``
----------------------
+----------------------------

 The ``extract_attachment_info`` function takes an ``email.message.Message`` object
 as input and returns a list of dictionaries containing the attachment information,
@ -610,6 +610,24 @@ Examples:
  isd_csv = convert_to_isd_csv(elements)


+``convert_to_dataframe``
+------------------------
+
+Converts a list of document ``Element`` objects to a ``pandas`` dataframe. The dataframe
+will have a ``text`` column with the text from the element and a ``type`` column
+indicating the element type, such as ``NarrativeText`` or ``Title``.
+
+Examples:
+
+.. code:: python
+
+  from unstructured.documents.elements import Title, NarrativeText
+  from unstructured.staging.base import convert_to_dataframe
+
+  elements = [Title(text="Title"), NarrativeText(text="Narrative")]
+  df = convert_to_dataframe(elements)
+
+
 ``stage_for_transformers``
 --------------------------

--- a/requirements/base.txt
+++ b/requirements/base.txt
@ -1,5 +1,5 @@
 #
-# This file is autogenerated by pip-compile with python 3.10
+# This file is autogenerated by pip-compile with python 3.8
 # To update, run:
 #
 #    pip-compile --output-file=requirements/base.txt
@ -45,13 +45,13 @@ openpyxl==3.0.10
 packaging==22.0
    # via argilla
 pandas==1.5.2
-    # via argilla
+    # via
+    #   argilla
+    #   unstructured (setup.py)
 pillow==9.4.0
    # via unstructured (setup.py)
 pydantic==1.10.2
    # via argilla
-pyparsing==3.0.9
-    # via packaging
 python-dateutil==2.8.2
    # via pandas
 python-docx==0.8.11
--- a/requirements/build.txt
+++ b/requirements/build.txt
@ -1,5 +1,5 @@
 #
-# This file is autogenerated by pip-compile with python 3.10
+# This file is autogenerated by pip-compile with python 3.8
 # To update, run:
 #
 #    pip-compile requirements/build.in
@ -22,6 +22,8 @@ idna==3.4
    # via requests
 imagesize==1.4.1
    # via sphinx
+importlib-metadata==6.0.0
+    # via sphinx
 jinja2==3.1.2
    # via sphinx
 markupsafe==2.1.1
@ -58,3 +60,5 @@ sphinxcontrib-serializinghtml==1.1.5
    # via sphinx
 urllib3==1.26.12
    # via requests
+zipp==3.11.0
+    # via importlib-metadata
--- a/requirements/dev.in
+++ b/requirements/dev.in
@ -4,3 +4,4 @@ pip-tools

 # NOTE(robinson) - Required pins for security scans
 jupyter-core>=4.11.2
+wheel>=0.38.1
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@ -1,9 +1,13 @@
 #
-# This file is autogenerated by pip-compile with python 3.10
+# This file is autogenerated by pip-compile with python 3.8
 # To update, run:
 #
 #    pip-compile requirements/dev.in
 #
+appnope==0.1.3
+    # via
+    #   ipykernel
+    #   ipython
 argon2-cffi==21.3.0
    # via notebook
 argon2-cffi-bindings==21.2.0
@ -36,6 +40,10 @@ executing==1.0.0
    # via stack-data
 fastjsonschema==2.16.2
    # via nbformat
+importlib-metadata==6.0.0
+    # via nbconvert
+importlib-resources==5.10.2
+    # via jsonschema
 ipykernel==6.15.3
    # via
    #   ipywidgets
@ -45,7 +53,7 @@ ipykernel==6.15.3
    #   qtconsole
 ipython==8.6.0
    # via
-    #   -r dev.in
+    #   -r requirements/dev.in
    #   ipykernel
    #   ipywidgets
    #   jupyter-console
@ -64,7 +72,7 @@ jinja2==3.1.2
 jsonschema==4.16.0
    # via nbformat
 jupyter==1.0.0
-    # via -r dev.in
+    # via -r requirements/dev.in
 jupyter-client==7.3.5
    # via
    #   ipykernel
@ -76,7 +84,7 @@ jupyter-console==6.4.4
    # via jupyter
 jupyter-core==5.1.0
    # via
-    #   -r dev.in
+    #   -r requirements/dev.in
    #   jupyter-client
    #   nbconvert
    #   nbformat
@ -134,7 +142,9 @@ pexpect==4.8.0
 pickleshare==0.7.5
    # via ipython
 pip-tools==6.12.1
-    # via -r dev.in
+    # via -r requirements/dev.in
+pkgutil-resolve-name==1.3.10
+    # via jsonschema
 platformdirs==2.5.4
    # via jupyter-core
 prometheus-client==0.14.1
@ -190,6 +200,10 @@ terminado==0.15.0
    # via notebook
 tinycss2==1.1.1
    # via nbconvert
+tomli==2.0.1
+    # via
+    #   build
+    #   pep517
 tornado==6.2
    # via
    #   ipykernel
@ -215,10 +229,16 @@ webencodings==0.5.1
    # via
    #   bleach
    #   tinycss2
-wheel==0.37.1
-    # via pip-tools
+wheel==0.38.4
+    # via
+    #   -r requirements/dev.in
+    #   pip-tools
 widgetsnbextension==4.0.3
    # via ipywidgets
+zipp==3.11.0
+    # via
+    #   importlib-metadata
+    #   importlib-resources

 # The following packages are considered to be unsafe in a requirements file:
 # pip
--- a/requirements/huggingface.txt
+++ b/requirements/huggingface.txt
@ -1,5 +1,5 @@
 #
-# This file is autogenerated by pip-compile with python 3.10
+# This file is autogenerated by pip-compile with python 3.8
 # To update, run:
 #
 #    pip-compile --extra=huggingface --output-file=requirements/huggingface.txt
@ -66,13 +66,13 @@ packaging==22.0
    #   huggingface-hub
    #   transformers
 pandas==1.5.2
-    # via argilla
+    # via
+    #   argilla
+    #   unstructured (setup.py)
 pillow==9.4.0
    # via unstructured (setup.py)
 pydantic==1.10.2
    # via argilla
-pyparsing==3.0.9
-    # via packaging
 python-dateutil==2.8.2
    # via pandas
 python-docx==0.8.11
--- a/requirements/test.txt
+++ b/requirements/test.txt
@ -1,5 +1,5 @@
 #
-# This file is autogenerated by pip-compile with python 3.10
+# This file is autogenerated by pip-compile with python 3.8
 # To update, run:
 #
 #    pip-compile requirements/test.in
@ -7,23 +7,23 @@
 attrs==22.1.0
    # via pytest
 black==22.10.0
-    # via -r test.in
+    # via -r requirements/test.in
 certifi==2022.12.7
    # via
-    #   -r test.in
+    #   -r requirements/test.in
    #   requests
 charset-normalizer==2.1.1
    # via requests
 click==8.1.3
    # via
-    #   -r test.in
+    #   -r requirements/test.in
    #   black
 coverage[toml]==6.4.4
    # via
-    #   -r test.in
+    #   -r requirements/test.in
    #   pytest-cov
 flake8==5.0.4
-    # via -r test.in
+    # via -r requirements/test.in
 idna==3.4
    # via
    #   requests
@ -31,7 +31,7 @@ idna==3.4
 iniconfig==1.1.1
    # via pytest
 label-studio-sdk==0.0.15
-    # via -r test.in
+    # via -r requirements/test.in
 lxml==4.9.1
    # via label-studio-sdk
 mccabe==0.7.0
@ -39,7 +39,7 @@ mccabe==0.7.0
 multidict==6.0.2
    # via yarl
 mypy==0.991
-    # via -r test.in
+    # via -r requirements/test.in
 mypy-extensions==0.4.3
    # via
    #   black
@ -65,7 +65,7 @@ pyparsing==3.0.9
 pytest==7.1.3
    # via pytest-cov
 pytest-cov==4.0.0
-    # via -r test.in
+    # via -r requirements/test.in
 pyyaml==6.0
    # via vcrpy
 requests==2.28.1
@ -73,15 +73,20 @@ requests==2.28.1
 six==1.16.0
    # via vcrpy
 tomli==2.0.1
-    # via pytest
+    # via
+    #   black
+    #   coverage
+    #   mypy
+    #   pytest
 typing-extensions==4.3.0
    # via
+    #   black
    #   mypy
    #   pydantic
 urllib3==1.26.12
    # via requests
 vcrpy==4.2.1
-    # via -r test.in
+    # via -r requirements/test.in
 wrapt==1.14.1
    # via vcrpy
 yarl==1.8.1
--- a/setup.py
+++ b/setup.py
@ -52,6 +52,7 @@ setup(
        "lxml",
        "nltk",
        "openpyxl",
+        "pandas",
        "pillow",
        "python-docx",
        # NOTE(robinson) - The following dependencies are pinned
--- a/test_unstructured/staging/test_base_staging.py
+++ b/test_unstructured/staging/test_base_staging.py
@ -1,6 +1,8 @@
+import csv
 import os
 import pytest
-import csv
+
+import pandas as pd

 import unstructured.staging.base as base

@ -51,3 +53,15 @@ def test_convert_to_isd_csv(output_csv_file):
    with open(output_csv_file, "r") as csv_file:
        csv_rows = csv.DictReader(csv_file)
        assert all(set(row.keys()) == set(fieldnames) for row in csv_rows)
+
+
+def test_convert_to_dataframe():
+    """Checks that a Title and a NarrativeText become rows with "type" and "text" columns."""
+    elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
+    df = base.convert_to_dataframe(elements)
+    expected_df = pd.DataFrame(
+        {
+            "type": ["Title", "NarrativeText"],
+            "text": ["Title 1", "Narrative 1"],
+        }
+    )
+    assert df.equals(expected_df) is True
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.3.5-dev5"  # pragma: no cover
+__version__ = "0.3.5-dev6"  # pragma: no cover
--- a/unstructured/staging/base.py
+++ b/unstructured/staging/base.py
@ -2,6 +2,8 @@ import io
 import csv
 from typing import Dict, List

+import pandas as pd
+
 from unstructured.documents.elements import Text, NarrativeText, Title, ListItem


@ -43,3 +45,14 @@ def convert_to_isd_csv(elements: List[Text]) -> str:
        csv_writer.writeheader()
        csv_writer.writerows(rows)
        return buffer.getvalue()
+
+
+def convert_to_dataframe(elements: List[Text]) -> pd.DataFrame:
+    """Converts document elements to a pandas DataFrame. The dataframe contains the
+    following columns:
+        text: the element text
+        type: the element type (NarrativeText, Title, etc)
+
+    The elements are first serialized with convert_to_isd_csv and the resulting CSV
+    string is then parsed back with pandas.
+    """
+    csv_string = convert_to_isd_csv(elements)
+    csv_string_io = io.StringIO(csv_string)
+    # NOTE(review): read_csv infers column dtypes and, by default, treats strings such
+    # as "NA" or an empty field as missing values, so element text may not round-trip
+    # as str -- confirm whether dtype=str / keep_default_na=False is wanted here.
+    return pd.read_csv(csv_string_io, sep=",")