fix: reduce false-positive Title elements on Chinese text (#3836)

**Summary**

Improve element-type mapping for Chinese text. Fixes a bug where Chinese text would produce large numbers of false-positive `Title` elements.

Fixes #3084

---------

Co-authored-by: scanny <scanny@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
2025-10-30 01:17:43 +00:00 · 2024-12-17 17:16:42 -08:00 · 2024-12-17 17:16:42 -08:00 · 9ece0b5ad2
commit 9ece0b5ad2
parent 9a9bf4c4f5
15 changed files with 847 additions and 852 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.16.12-dev3
+## 0.16.12-dev4
 ### Enhancements
@ -10,6 +10,7 @@
 - **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted.
 - **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file.
+- **Improve element-type mapping for Chinese text.** Fixes bug where Chinese text would produce large numbers of false-positive `Title` elements.
 ## 0.16.11
--- a/test_unstructured/metrics/test_element_type.py
+++ b/test_unstructured/metrics/test_element_type.py
@ -74,20 +74,18 @@ def test_get_element_type_frequency(filename: str, frequency: dict[tuple[str, in
            "handbook-1p.docx",
            {
                ("Header", None): 1,
-                ("Title", 0): 1,
+                ("UncategorizedText", 0): 6,
                ("Title", 1): 1,
                ("Title", 2): 1,
                ("ListItem", 3): 3,
-                ("NarrativeText", 4): 7,
+                ("NarrativeText", 0): 7,
                ("Footer", None): 1,
            },
-            (0.43, 0.07, 0.65),
+            (0.78, 0.72, 0.81),
        ),
        (
            "handbook-1p.docx",
            {
                ("Header", None): 1,
-                ("Title", 0): 6,
+                ("UncategorizedText", 0): 6,
                ("NarrativeText", 0): 7,
                ("PageBreak", None): 1,
                ("Footer", None): 1,
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -1286,7 +1286,7 @@ def expected_docx_elements():
        Title("These are a few of my favorite things:"),
        ListItem("Parrots"),
        ListItem("Hockey"),
-        Title("Analysis"),
+        Text("Analysis"),
        NarrativeText("This is my first thought. This is my second thought."),
        NarrativeText("This is my third thought."),
        Text("2023"),
--- a/test_unstructured/partition/test_doc.py
+++ b/test_unstructured/partition/test_doc.py
@ -275,7 +275,7 @@ def expected_elements() -> list[Element]:
        Title("These are a few of my favorite things:"),
        ListItem("Parrots"),
        ListItem("Hockey"),
-        Title("Analysis"),
+        Text("Analysis"),
        NarrativeText("This is my first thought. This is my second thought."),
        NarrativeText("This is my third thought."),
        Text("2023"),
--- a/test_unstructured/partition/test_docx.py
+++ b/test_unstructured/partition/test_docx.py
@ -627,7 +627,7 @@ def expected_elements() -> list[Text]:
        Title("These are a few of my favorite things:"),
        ListItem("Parrots"),
        ListItem("Hockey"),
-        Title("Analysis"),
+        Text("Analysis"),
        NarrativeText("This is my first thought. This is my second thought."),
        NarrativeText("This is my third thought."),
        Text("2023"),
@ -1210,7 +1210,7 @@ class Describe_DocxPartitioner:
        opts_args["file_path"] = example_doc_path("page-breaks.docx")
        opts = DocxPartitionerOptions(**opts_args)
        expected = [
-            # NOTE(scanny) - -- page 1 --
+            # -- page 1 --
            NarrativeText(
                "First page, tab here:\t"
                "followed by line-break here:\n"
@ -1220,28 +1220,28 @@ class Describe_DocxPartitioner:
                "and hard page-break here>>"
            ),
            PageBreak(""),
-            # NOTE(scanny) - -- page 2 --
+            # -- page 2 --
            NarrativeText(
                "<<Text on second page. The font is big so it breaks onto third page--"
                "------------------here-->> <<but break falls inside link so text stays"
                " together."
            ),
            PageBreak(""),
-            # NOTE(scanny) - -- page 3 --
+            # -- page 3 --
            NarrativeText("Continuous section break here>>"),
            NarrativeText("<<followed by text on same page"),
            NarrativeText("Odd-page section break here>>"),
            PageBreak(""),
-            # NOTE(scanny) - -- page 4 --
+            # -- page 4 --
            PageBreak(""),
-            # NOTE(scanny) - -- page 5 --
+            # -- page 5 --
            NarrativeText("<<producing two page-breaks to get from page-3 to page-5."),
            NarrativeText(
                'Then text gets big again so a "natural" rendered page break happens again here>> '
            ),
            PageBreak(""),
-            # NOTE(scanny) - -- page 6 --
+            # -- page 6 --
-            Title("<<and then more text proceeds."),
+            Text("<<and then more text proceeds."),
        ]
        elements = _DocxPartitioner.iter_document_elements(opts)
--- a/test_unstructured/partition/test_odt.py
+++ b/test_unstructured/partition/test_odt.py
@ -23,7 +23,6 @@ from unstructured.documents.elements import (
    Table,
    TableChunk,
    Text,
-    Title,
 )
 from unstructured.partition.docx import partition_docx
 from unstructured.partition.odt import partition_odt
@ -44,7 +43,7 @@ def test_partition_odt_from_filename():
    elements = partition_odt(example_doc_path("fake.odt"))
    assert elements == [
-        Title("Lorem ipsum dolor sit amet."),
+        Text("Lorem ipsum dolor sit amet."),
        Table(
            "Header row Mon Wed Fri"
            " Color Blue Red Green"
@ -63,7 +62,7 @@ def test_partition_odt_from_file():
        elements = partition_odt(file=f)
    assert elements == [
-        Title("Lorem ipsum dolor sit amet."),
+        Text("Lorem ipsum dolor sit amet."),
        Table(
            "Header row Mon Wed Fri"
            " Color Blue Red Green"
--- a/test_unstructured_ingest/expected-structured-output/box/handbook-1p.docx.json
+++ b/test_unstructured_ingest/expected-structured-output/box/handbook-1p.docx.json
@ -23,7 +23,7 @@
    }
  },
  {
-    "type": "Title",
+    "type": "UncategorizedText",
    "element_id": "5209312022a75a31d95385fdccff68fa",
    "text": "CHAPTER 1",
    "metadata": {
@ -51,7 +51,7 @@
    }
  },
  {
-    "type": "Title",
+    "type": "UncategorizedText",
    "element_id": "22a23e29022f32945965002cd734a8f0",
    "text": "INTRODUCTION",
    "metadata": {
@ -79,7 +79,7 @@
    }
  },
  {
-    "type": "Title",
+    "type": "UncategorizedText",
    "element_id": "4c175cf543957acc4420221de28d3fca",
    "text": "CHAPTER 1 \u2013 INTRODUCTION",
    "metadata": {
@ -101,7 +101,7 @@
    }
  },
  {
-    "type": "Title",
+    "type": "UncategorizedText",
    "element_id": "77022a5264f552b223538977cd40f640",
    "text": "A.\tPURPOSE",
    "metadata": {
@ -189,7 +189,7 @@
    }
  },
  {
-    "type": "Title",
+    "type": "UncategorizedText",
    "element_id": "e341ffc123dd2827638aba18149c4175",
    "text": "B.\tROLE OF THE UNITED STATES TRUSTEE",
    "metadata": {
@ -255,7 +255,7 @@
    }
  },
  {
-    "type": "Title",
+    "type": "UncategorizedText",
    "element_id": "1b11ebe52652656e0ed8c12e5969de9b",
    "text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t",
    "metadata": {
--- a/test_unstructured_ingest/expected-structured-output/dropbox/handbook-1p.docx.json
+++ b/test_unstructured_ingest/expected-structured-output/dropbox/handbook-1p.docx.json
@ -23,7 +23,7 @@
    }
  },
  {
-    "type": "Title",
+    "type": "UncategorizedText",
    "element_id": "5209312022a75a31d95385fdccff68fa",
    "text": "CHAPTER 1",
    "metadata": {
@ -51,7 +51,7 @@
    }
  },
  {
-    "type": "Title",
+    "type": "UncategorizedText",
    "element_id": "22a23e29022f32945965002cd734a8f0",
    "text": "INTRODUCTION",
    "metadata": {
@ -79,7 +79,7 @@
    }
  },
  {
-    "type": "Title",
+    "type": "UncategorizedText",
    "element_id": "4c175cf543957acc4420221de28d3fca",
    "text": "CHAPTER 1 \u2013 INTRODUCTION",
    "metadata": {
@ -101,7 +101,7 @@
    }
  },
  {
-    "type": "Title",
+    "type": "UncategorizedText",
    "element_id": "77022a5264f552b223538977cd40f640",
    "text": "A.\tPURPOSE",
    "metadata": {
@ -189,7 +189,7 @@
    }
  },
  {
-    "type": "Title",
+    "type": "UncategorizedText",
    "element_id": "e341ffc123dd2827638aba18149c4175",
    "text": "B.\tROLE OF THE UNITED STATES TRUSTEE",
    "metadata": {
@ -255,7 +255,7 @@
    }
  },
  {
-    "type": "Title",
+    "type": "UncategorizedText",
    "element_id": "1b11ebe52652656e0ed8c12e5969de9b",
    "text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t",
    "metadata": {
--- a/test_unstructured_ingest/expected-structured-output/google-drive/fake.docx.json
+++ b/test_unstructured_ingest/expected-structured-output/google-drive/fake.docx.json
@ -1,6 +1,6 @@
 [
  {
-    "type": "Title",
+    "type": "UncategorizedText",
    "element_id": "56d531394823d81787d77a04462ed096",
    "text": "Lorem ipsum dolor sit amet.",
    "metadata": {
@ -17,6 +17,13 @@
        "date_created": "1686809759.687",
        "date_modified": "1686809743.0",
        "permissions_data": [
          {
            "id": "anyoneWithLink",
            "type": "anyone",
            "kind": "drive#permission",
            "role": "reader",
            "allowFileDiscovery": false
          },
          {
            "id": "18298851591250030956",
            "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
@ -28,6 +35,17 @@
            "deleted": false,
            "pendingOwner": false
          },
          {
            "id": "04774006893477068632",
            "displayName": "ryan",
            "type": "user",
            "kind": "drive#permission",
            "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
            "emailAddress": "ryan@unstructured.io",
            "role": "owner",
            "deleted": false,
            "pendingOwner": false
          },
          {
            "id": "09147371668407854156",
            "displayName": "roman",
@ -38,24 +56,6 @@
            "role": "writer",
            "deleted": false,
            "pendingOwner": false
          },
          {
            "id": "anyoneWithLink",
            "type": "anyone",
            "kind": "drive#permission",
            "role": "reader",
            "allowFileDiscovery": false
          },
          {
            "id": "04774006893477068632",
            "displayName": "ryan",
            "type": "user",
            "kind": "drive#permission",
            "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
            "emailAddress": "ryan@unstructured.io",
            "role": "owner",
            "deleted": false,
            "pendingOwner": false
          }
        ]
      }
--- a/test_unstructured_ingest/expected-structured-output/google-drive/nested/fake.docx.json
+++ b/test_unstructured_ingest/expected-structured-output/google-drive/nested/fake.docx.json
@ -1,6 +1,6 @@
 [
  {
-    "type": "Title",
+    "type": "UncategorizedText",
    "element_id": "56d531394823d81787d77a04462ed096",
    "text": "Lorem ipsum dolor sit amet.",
    "metadata": {
@ -17,6 +17,13 @@
        "date_created": "1718722775.76",
        "date_modified": "1718722788.018",
        "permissions_data": [
          {
            "id": "anyoneWithLink",
            "type": "anyone",
            "kind": "drive#permission",
            "role": "reader",
            "allowFileDiscovery": false
          },
          {
            "id": "18298851591250030956",
            "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
@ -39,13 +46,6 @@
            "deleted": false,
            "pendingOwner": false
          },
          {
            "id": "anyoneWithLink",
            "type": "anyone",
            "kind": "drive#permission",
            "role": "reader",
            "allowFileDiscovery": false
          },
          {
            "id": "09147371668407854156",
            "displayName": "roman",
--- a/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json
--- a/test_unstructured_ingest/expected-structured-output/google-drive/test-drive-doc.docx.json
+++ b/test_unstructured_ingest/expected-structured-output/google-drive/test-drive-doc.docx.json
@ -1,6 +1,6 @@
 [
  {
-    "type": "Title",
+    "type": "UncategorizedText",
    "element_id": "cc23ac9998df1db62b795ec4e5133ab0",
    "text": "Title",
    "metadata": {
@ -22,6 +22,13 @@
        "date_created": "1686809758.931",
        "date_modified": "1686809744.0",
        "permissions_data": [
          {
            "id": "anyoneWithLink",
            "type": "anyone",
            "kind": "drive#permission",
            "role": "reader",
            "allowFileDiscovery": false
          },
          {
            "id": "18298851591250030956",
            "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
@ -33,24 +40,6 @@
            "deleted": false,
            "pendingOwner": false
          },
          {
            "id": "09147371668407854156",
            "displayName": "roman",
            "type": "user",
            "kind": "drive#permission",
            "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
            "emailAddress": "roman@unstructured.io",
            "role": "writer",
            "deleted": false,
            "pendingOwner": false
          },
          {
            "id": "anyoneWithLink",
            "type": "anyone",
            "kind": "drive#permission",
            "role": "reader",
            "allowFileDiscovery": false
          },
          {
            "id": "04774006893477068632",
            "displayName": "ryan",
@ -61,6 +50,17 @@
            "role": "owner",
            "deleted": false,
            "pendingOwner": false
          },
          {
            "id": "09147371668407854156",
            "displayName": "roman",
            "type": "user",
            "kind": "drive#permission",
            "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
            "emailAddress": "roman@unstructured.io",
            "role": "writer",
            "deleted": false,
            "pendingOwner": false
          }
        ]
      }
@ -89,6 +89,13 @@
        "date_created": "1686809758.931",
        "date_modified": "1686809744.0",
        "permissions_data": [
          {
            "id": "anyoneWithLink",
            "type": "anyone",
            "kind": "drive#permission",
            "role": "reader",
            "allowFileDiscovery": false
          },
          {
            "id": "18298851591250030956",
            "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
@ -100,6 +107,17 @@
            "deleted": false,
            "pendingOwner": false
          },
          {
            "id": "04774006893477068632",
            "displayName": "ryan",
            "type": "user",
            "kind": "drive#permission",
            "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
            "emailAddress": "ryan@unstructured.io",
            "role": "owner",
            "deleted": false,
            "pendingOwner": false
          },
          {
            "id": "09147371668407854156",
            "displayName": "roman",
@ -110,24 +128,6 @@
            "role": "writer",
            "deleted": false,
            "pendingOwner": false
          },
          {
            "id": "anyoneWithLink",
            "type": "anyone",
            "kind": "drive#permission",
            "role": "reader",
            "allowFileDiscovery": false
          },
          {
            "id": "04774006893477068632",
            "displayName": "ryan",
            "type": "user",
            "kind": "drive#permission",
            "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
            "emailAddress": "ryan@unstructured.io",
            "role": "owner",
            "deleted": false,
            "pendingOwner": false
          }
        ]
      }
--- a/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json
+++ b/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json
@ -27,7 +27,7 @@
        "eng"
      ],
      "page_number": 1,
-      "orig_elements": "eJztlU1v2jAYx79K5PMgifNm71Z1leilIAqnFkWO/RiiJnbkOBsd4rvPhtK1E4dt0qQx7RQ/74//P1l52CFooAVly1qgjwFKODBKuIRIZAXJc8oqwDxNMkllFgmGPgSoBcsEs8zl75A/lL0eDIeD3YFp676vterLl6SHHWq18OEkiQnZr1wPA1wbUTaaM6vNsZLZjV8h3OgWQjMoBSb8os1TOKjemoHbwYB4b8CWtV0DI6F5H26YEpXWT6O4GzvHFu33bpKsG7DPnR+PWNc1tZvotgs/KzHWHaht20htWmb7kZay5uBKB6/I2M0WndEc3HXUum3Gp4gXYQNMgCml1tZ9TgM6U7fMPPuEhqn1wNbQewUQqDVa+XUsbK3PXN4HCzP0FiCYvOztq06NJof2yFX8SEgKSgkTaSogo1lW0IRgUeTuUFQVJvzyCEHbbVhffwVRenVKrpV19z0Kdz25mi1u5kGMVmdSLVsf06pD+A+xPoPSeTvnKdXQVuDVid/C/b70G6SL2jZwjmhFRERFlnIBmJBIFEwkNM8hYhHnVVX9Y0Rv7xbz6afl9eJ2endJUN/t/VNckyzPCEtlgYXDWUkc5RyqnCcYA5X48rj+LShe31fwOOAoToJfZsNzwAWN3W8uTomgKc6KIk5S8GYS0/Q/m99lczV+tLPlfDa9vzlDYvUNQ4PhWg=="
+      "orig_elements": "eJztlV1r2zAUhv+K0fWS+NvW7kpXSG+akCZXbTD6OHJMLcnI8pYu5L9PSpqtHYGxjcEydmUdnffovDoPwg87BC1IULZqOHofoIQBwSUTEPKsKPMcEwoxS5NMYJGFnKB3AZJgCSeWOP0O+UXV68EwOMQdGNn0faNVX72IHnZIau7TSRKV5X7tzjDAtOFVqxmx2hwrid14C5ONljAxg1JgJp+0eZoMqrdmYHYwwN8GsCWya2HENesnG6I41fppFHVjt7FF+73rJJoW7HPn2yPSdW3jOjp3k4+Kj3UHaitboY0kth9pIRoGrnTwExm73rwzmoG7jqplOz5l/BA2QDiYSmht3efUoDONJObZC1qi6oHU0PsJIFA1Wns7FrbWK1f3wdIMvQUIpi++fdXpoOnheOQqvickOMYl4WnKIcNZVuCkjHmRu0VBaVyyyyMEstuQvvkMvPLTqZhW1t33OLjr6dV8ebMIIrQ+I7WkPsroIf2HWJ9B6XY7t1OpQVLw04lew/1m+hXSlXJuoNbG21965Rm6tOQh5lnKOMRlGfKC8ATnOYQkZIxS+o/Rvb1bLmYfVtfL29ndJQF+4/unGSdZnpUkFUXMHVoq4jBnQHOWxDFgEV8e478Fy9d3FzwOcRglwW9xYjnEBY7crzBKS47TOCuKKEnBh0mE0/+cfpXT1fjRzleL+ez+5gdU1l8AKnj1Ng=="
    }
  },
  {
@ -383,7 +383,7 @@
        "eng"
      ],
      "page_number": 2,
-      "orig_elements": "eJxVkMtOwzAQRX8l8pompGmlhB2IIJAQldp0VarIsSdpVNtj+QFBVf4dG+iC3bzPvXO4EBAgQbl25OQuIauCccp51RXLjlf5ii0L3lddWZXlmt2uC3KTEAmOcupomL+QGLQWvWHwk2swcrR2RGXbv6HDhUjksV0UeVnOx3DDAEPDW4GMOjS/m9SdooTshBIy45UCk32iOWdeWWc8c94A/5/ARKUWsODIbHaiineI50Wu01CYyDwHUj8KcF864gnVWoyBGNRlH4qnqEFNUvRoJHV2gX0/MgirPn4kDWyuDTIIdtQgRXrtxCcIqgZPB7DRIAE1kGhLh0qrvOwgelpGvoPJRfZD+u62m9c62TwlzXOd7N9emvox2TX3Tb1Lmu1+19R1vHwV24xOAJmP32EBjeE="
+      "orig_elements": "eJxVkMtOwzAQRX8l8pomtGmlhB2IIJAQldp0VarIsSdpVNtj+QGBKv+ODe2C3bzPvbM/ExAgQblm4OQuIcucccp52eaLlpfzJVvkvCvboiyKFbtd5eQmIRIc5dTRMH8mMWgsesPgN9dg5GDtgMo2l6H9mUjksZ3n86KYDuGGAYaGNwIZdWj+Nqk7RgnZESVkxisFJvtEc8q8ss545rwB/j+BkUotYMaR2exIFW8RT7O5TkNhJNMUSN0gwH3piCdUazEEYlCXfSieogY1StGhkdTZGXbdwCCs+viRNLC5Nsgg2FG9FOm1E58gqOo97cFGgwRUT6ItHSqN8rKF6GkR+Q5GF9kP6bvbrF+rZP2U1M9Vsnt7qavHZFvf19U2qTe7bV1V8fJV7E4FpdCjGb6B1/HKdPgBKuqS2A=="
    }
  },
  {
@ -572,7 +572,7 @@
        "eng"
      ],
      "page_number": 2,
-      "orig_elements": "eJxVkMtuwjAQRX8l8rokDZRHukOFVmxAgrCoAEVOPAkRtsfyo02F8u+1oSy689yZ8bl3DlcCHARIW7SMvEbkJcvSYZqNR1NaZiU8jwHGFDJIKWMTOp2Qp4gIsJRRS/38lYRHYdDpCm61Ai1aY1qUpvgbOlyJQBbao1E6m/Un/4eGCjUrOFbUor5vUnsOFpIzCki0kxJ08o36kjhprHaVdRrY/wI6KhSHAcPKJGcqWYl4GaQq9kJH+t6T6paD/VEBT6hSvPVE7y75kixGBbITvEYtqDUDrOu2Ar/qwkViz2ZKYwU+jmwEjx+dcAROZeNoAyYEJCAbEmIprxTSiRJCpmHgW+hsYL/FR7vL5/k+32w/o8U+Xy130eY9mkdeXS9W648o3+53+XJ5vAEenvPWciD96RdNTpBK"
+      "orig_elements": "eJxVkMtuwjAQRX8l8rokDZRHukOFVmxAgmRRAYqceBIibI/lR5sW8e+1aVl0N+9z7+wvBDgIkLbsGHmOyFOWpcM0G4+mtMoqeBwDjClkkFLGJnQ6IQ8REWApo5b6+QsJQWnQ6RpuuQItOmM6lKb8G9pfiEAW2qNROptdj/6Ghho1KznW1KL+3aT2FCQkJxSQaCcl6OQT9Tlx0ljtaus0sP8J9FQoDgOGtUlOVLIK8TxIVewLPblePanpONgvFfCEKsU7T/Tqkg/JYlQge8Eb1IJaM8Cm6Wrwqy58JPZspjTW4O3IVvD43glP4FS2jrZggkECsiXBlvKVUjpRQfA0DHwLvQ3sl/hgd/k8L/LN9j1aFPlquYs2r9E88tX1YrV+i/JtscuXy8MNcNdcSC8YWtTdN7A8HLsefwA0I5VB"
    }
  },
  {
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.16.12-dev3"  # pragma: no cover
+__version__ = "0.16.12-dev4"  # pragma: no cover
--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@ -48,7 +48,6 @@ from unstructured.partition.text_type import (
    is_bulleted_text,
    is_email_address,
    is_possible_narrative_text,
-    is_possible_title,
    is_us_city_state_zip,
 )
 from unstructured.partition.utils.constants import PartitionStrategy
@ -412,15 +411,15 @@ class _DocxPartitioner:
            )
        )
-        # NOTE(scanny) - blank paragraphs are commonly used for spacing between paragraphs and
+        # -- blank paragraphs are commonly used for spacing between paragraphs and do not
-        # do not contribute to the document-element stream.
+        # -- contribute to the document-element stream
        if not text.strip():
            return
        metadata = self._paragraph_metadata(paragraph)
-        # NOTE(scanny) - a list-item gets some special treatment, mutating the text to remove a
+        # -- a list-item gets some special treatment, mutating the text to remove a
-        # bullet-character if present.
+        # -- bullet-character if present
        if self._is_list_item(paragraph):
            clean_text = clean_bullets(text).strip()
            if clean_text:
@ -431,19 +430,19 @@ class _DocxPartitioner:
                )
            return
-        # NOTE(scanny) - determine element-type from an explicit Word paragraph-style if possible
+        # -- determine element-type from an explicit Word paragraph-style if possible --
        TextSubCls = self._style_based_element_type(paragraph)
        if TextSubCls:
            yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
            return
-        # NOTE(scanny) - try to recognize the element type by parsing its text
+        # -- try to recognize the element type by parsing its text --
        TextSubCls = self._parse_paragraph_text_for_element_type(paragraph)
        if TextSubCls:
            yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
            return
-        # NOTE(scanny) - if all that fails we give it the default `Text` element-type
+        # -- if all that fails we give it the default `Text` element-type --
        yield Text(text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
    def _convert_table_to_html(self, table: DocxTable) -> str:
@ -576,20 +575,20 @@ class _DocxPartitioner:
            page_break = paragraph.rendered_page_breaks[0]
-            # NOTE(scanny)- preceding-fragment is None when first paragraph content is a page-break
+            # -- preceding-fragment is None when first paragraph content is a page-break --
            preceding_paragraph_fragment = page_break.preceding_paragraph_fragment
            if preceding_paragraph_fragment:
                yield preceding_paragraph_fragment
            yield page_break
-            # NOTE(scanny) - following-fragment is None when page-break is last paragraph content.
+            # -- following-fragment is None when page-break is last paragraph content. This is
-            # This is probably quite rare (Word moves these to the start of the next paragraph) but
+            # -- probably quite rare (Word moves these to the start of the next paragraph) but
-            # easier to check for it than prove it can't happen.
+            # -- easier to check for it than prove it can't happen.
            following_paragraph_fragment = page_break.following_paragraph_fragment
-            # NOTE(scanny) - the paragraph fragment following a page-break can itself contain
+            # -- the paragraph fragment following a page-break can itself contain another
-            # another page-break. This would also be quite rare, but it can happen so we just
+            # -- page-break; this would also be quite rare, but it can happen so we just recurse
-            # recurse into the second fragment the same way we handled the original paragraph.
+            # -- into the second fragment the same way we handled the original paragraph
            if following_paragraph_fragment:
                yield from iter_paragraph_items(following_paragraph_fragment)
@ -901,8 +900,6 @@ class _DocxPartitioner:
            return EmailAddress
        if is_possible_narrative_text(text):
            return NarrativeText
-        if is_possible_title(text):
-            return Title
        return None
`@ -1 +1 @@`
	`__version__ = "0.16.12-dev3" # pragma: no cover`	`__version__ = "0.16.12-dev4" # pragma: no cover`