diff --git a/CHANGELOG.md b/CHANGELOG.md index 0dc30ae3e..8649d7cd0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,14 +1,9 @@ -## 0.6.7-dev5 - -### Enhancements - -* Add `page_name` to metadata. Currently used for the sheet name in XLSX documents. - -## 0.6.7-dev4 +## 0.6.7-dev6 ### Enhancements * Add `file_directory` to metadata +* Add `page_name` to metadata. Currently used for the sheet name in XLSX documents. * Added a `--partition-strategy` parameter to unstructured-ingest so that users can specify partition strategy in CLI. For example, `--partition-strategy fast`. * Added metadata for filetype. @@ -26,6 +21,7 @@ * Makes `pytesseract` a function level import in `partition_pdf` so you can use the `"fast"` or `"hi_res"` strategies if `pytesseract` is not installed. Also adds the `required_dependencies` decorator for the `"hi_res"` and `"ocr_only"` strategies. +* Fix to ensure `filename` is tracked in metadata for `docx` tables. ## 0.6.6 diff --git a/test_unstructured/partition/test_docx.py b/test_unstructured/partition/test_docx.py index e65272ce4..82505ef78 100644 --- a/test_unstructured/partition/test_docx.py +++ b/test_unstructured/partition/test_docx.py @@ -7,6 +7,7 @@ from unstructured.documents.elements import ( Address, ListItem, NarrativeText, + Table, Text, Title, ) @@ -97,3 +98,21 @@ def test_partition_docx_raises_with_both_specified(mock_document, tmpdir): def test_partition_docx_raises_with_neither(): with pytest.raises(ValueError): partition_docx() + + +def test_partition_docx_processes_table(filename="example-docs/fake_table.docx"): + elements = partition_docx(filename=filename) + + assert isinstance(elements[0], Table) + assert ( + elements[0].metadata.text_as_html + == """
| Header Col 1 | Header Col 2 |
|---|---|
| Lorem ipsum | A Link example |