From 17bc55e7becc46a8d65a78ed42315870e9da221a Mon Sep 17 00:00:00 2001 From: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Date: Tue, 11 Jun 2024 20:39:35 -0700 Subject: [PATCH] fix: relative path / permissions issues with v2 fsspec connectors (#3186) When the v2 fsspec connectors currently generate the relative path, they may introduce a path with a leading slash (this happens in the case of the Box connector, which is a subclass of fsspec). When this happens, the paths are unintentionally treated as absolute paths. As a result, the ingest pipeline attempts to write files to directories at root level, which in turn raises permission issues. Note: Box expected results needed to be updated now that the test is no longer failing. Aside: found that our tests were unintentionally skipping `box.sh` tests because we were intending to skip `dropbox.sh` and we use regex to match if a given test is in skip tests. This adds changes to force an exact match. ## Changes * Strip leading slashes during the creation of relative paths in fsspec connectors * Add expected results for Box connector * (bonus): `make tidy` altered an unrelated file by removing an unnecessary call of `pass` * (bonus): check exact match for skipped ingest tests which fixes Box tests getting skipped ## Testing [Tests](https://github.com/Unstructured-IO/unstructured/actions/runs/9461928289/job/26093475612#step:7:2085) for the Box connector were failing. They were accidentally getting skipped (see changes above). They are now no longer skipped and are passing. 
--- CHANGELOG.md | 5 +- .../box/handbook-1p.docx.json | 331 ++++++++++++++++++ .../box/nested-1/ideas-page.html.json | 24 ++ .../nested-1/nested-2/ideas-page.html.json | 24 ++ .../box/science-exploration-1p.pptx.json | 288 +++++++++++++++ test_unstructured_ingest/test-ingest-src.sh | 20 +- unstructured/__version__.py | 2 +- .../v2/processes/connectors/fsspec/fsspec.py | 4 +- unstructured/metrics/evaluate.py | 1 - 9 files changed, 689 insertions(+), 10 deletions(-) create mode 100644 test_unstructured_ingest/expected-structured-output/box/handbook-1p.docx.json create mode 100644 test_unstructured_ingest/expected-structured-output/box/nested-1/ideas-page.html.json create mode 100644 test_unstructured_ingest/expected-structured-output/box/nested-1/nested-2/ideas-page.html.json create mode 100644 test_unstructured_ingest/expected-structured-output/box/science-exploration-1p.pptx.json diff --git a/CHANGELOG.md b/CHANGELOG.md index f4f2ba6a3..d1785bf9d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.14.6-dev2 +## 0.14.6-dev3 ### Enhancements @@ -7,7 +7,8 @@ ### Fixes * **Fix passing parameters to python-client** - Remove parsing list arguments to strings in passing arguments to python-client in Ingest workflow and `partition_via_api` -**table metric bug fix** get_element_level_alignment()now will find all the matched indices in predicted table data instead of only returning the first match in the case of multiple matches for the same gt string. +* **table metric bug fix** get_element_level_alignment()now will find all the matched indices in predicted table data instead of only returning the first match in the case of multiple matches for the same gt string. +* **fsspec connector path/permissions bug** V2 fsspec connectors were failing when defined relative filepaths had leading slash. This strips that slash to guarantee the relative path never has it. 
## 0.14.5 diff --git a/test_unstructured_ingest/expected-structured-output/box/handbook-1p.docx.json b/test_unstructured_ingest/expected-structured-output/box/handbook-1p.docx.json new file mode 100644 index 000000000..289d45c63 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/box/handbook-1p.docx.json @@ -0,0 +1,331 @@ +[ + { + "type": "Header", + "element_id": "3cea98cfe0d578669abe2c435f9f50da", + "text": "US Trustee Handbook", + "metadata": { + "header_footer_type": "primary", + "languages": [ + "eng" + ], + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "data_source": { + "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", + "version": "83125548004193369404829885052395764226", + "record_locator": { + "protocol": "box", + "remote_file_path": "box://utic-test-ingest-fixtures" + }, + "date_created": "1688874451.0", + "date_modified": "1688874451.0" + } + } + }, + { + "type": "Title", + "element_id": "5209312022a75a31d95385fdccff68fa", + "text": "CHAPTER 1", + "metadata": { + "emphasized_text_contents": [ + "CHAPTER 1" + ], + "emphasized_text_tags": [ + "b" + ], + "languages": [ + "eng" + ], + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "data_source": { + "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", + "version": "83125548004193369404829885052395764226", + "record_locator": { + "protocol": "box", + "remote_file_path": "box://utic-test-ingest-fixtures" + }, + "date_created": "1688874451.0", + "date_modified": "1688874451.0" + } + } + }, + { + "type": "Title", + "element_id": "22a23e29022f32945965002cd734a8f0", + "text": "INTRODUCTION", + "metadata": { + "emphasized_text_contents": [ + "INTRODUCTION" + ], + "emphasized_text_tags": [ + "b" + ], + "languages": [ + "eng" + ], + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "data_source": { + "url": 
"box:///utic-test-ingest-fixtures/handbook-1p.docx", + "version": "83125548004193369404829885052395764226", + "record_locator": { + "protocol": "box", + "remote_file_path": "box://utic-test-ingest-fixtures" + }, + "date_created": "1688874451.0", + "date_modified": "1688874451.0" + } + } + }, + { + "type": "Title", + "element_id": "4c175cf543957acc4420221de28d3fca", + "text": "CHAPTER 1 \u2013 INTRODUCTION", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "data_source": { + "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", + "version": "83125548004193369404829885052395764226", + "record_locator": { + "protocol": "box", + "remote_file_path": "box://utic-test-ingest-fixtures" + }, + "date_created": "1688874451.0", + "date_modified": "1688874451.0" + } + } + }, + { + "type": "Title", + "element_id": "77022a5264f552b223538977cd40f640", + "text": "A.\tPURPOSE", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "data_source": { + "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", + "version": "83125548004193369404829885052395764226", + "record_locator": { + "protocol": "box", + "remote_file_path": "box://utic-test-ingest-fixtures" + }, + "date_created": "1688874451.0", + "date_modified": "1688874451.0" + } + } + }, + { + "type": "NarrativeText", + "element_id": "8e9d0514cc08b3b0898cd4f165d8d188", + "text": "The United States Trustee appoints and supervises standing trustees and monitors and supervises cases under chapter 13 of title 11 of the United States Code. 28 U.S.C. \u00a7 586(b). The Handbook, issued as part of our duties under 28 U.S.C. \u00a7 586, establishes or clarifies the position of the United States Trustee Program (Program) on the duties owed by a standing trustee to the debtors, creditors, other parties in interest, and the United States Trustee. 
The Handbook does not present a full and complete statement of the law; it should not be used as a substitute for legal research and analysis. The standing trustee must be familiar with relevant provisions of the Bankruptcy Code, Federal Rules of Bankruptcy Procedure (Rules), any local bankruptcy rules, and case law. 11 U.S.C. \u00a7 321, 28 U.S.C. \u00a7 586, 28 C.F.R. \u00a7 58.6(a)(3). Standing trustees are encouraged to follow Practice Tips identified in this Handbook but these are not considered mandatory.", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "data_source": { + "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", + "version": "83125548004193369404829885052395764226", + "record_locator": { + "protocol": "box", + "remote_file_path": "box://utic-test-ingest-fixtures" + }, + "date_created": "1688874451.0", + "date_modified": "1688874451.0" + } + } + }, + { + "type": "NarrativeText", + "element_id": "6647ac00520f9b8dcf37f1625d008a69", + "text": "Nothing in this Handbook should be construed to excuse the standing trustee from complying with all duties imposed by the Bankruptcy Code and Rules, local rules, and orders of the court. The standing trustee should notify the United States Trustee whenever the provision of the Handbook conflicts with the local rules or orders of the court. The standing trustee is accountable for all duties set forth in this Handbook, but need not personally perform any duty unless otherwise indicated. All statutory references in this Handbook refer to the Bankruptcy Code, 11 U.S.C. 
\u00a7 101 et seq., unless otherwise indicated.", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "data_source": { + "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", + "version": "83125548004193369404829885052395764226", + "record_locator": { + "protocol": "box", + "remote_file_path": "box://utic-test-ingest-fixtures" + }, + "date_created": "1688874451.0", + "date_modified": "1688874451.0" + } + } + }, + { + "type": "NarrativeText", + "element_id": "60220f2162f5d83e2af6fc8d144bd429", + "text": "This Handbook does not create additional rights against the standing trustee or United States Trustee in favor of other parties.", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "data_source": { + "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", + "version": "83125548004193369404829885052395764226", + "record_locator": { + "protocol": "box", + "remote_file_path": "box://utic-test-ingest-fixtures" + }, + "date_created": "1688874451.0", + "date_modified": "1688874451.0" + } + } + }, + { + "type": "Title", + "element_id": "e341ffc123dd2827638aba18149c4175", + "text": "B.\tROLE OF THE UNITED STATES TRUSTEE", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "data_source": { + "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", + "version": "83125548004193369404829885052395764226", + "record_locator": { + "protocol": "box", + "remote_file_path": "box://utic-test-ingest-fixtures" + }, + "date_created": "1688874451.0", + "date_modified": "1688874451.0" + } + } + }, + { + "type": "NarrativeText", + "element_id": "3a6e7cf9f42299fd056a5a7a1279753a", + "text": "The Bankruptcy Reform Act of 1978 removed the bankruptcy judge from the responsibilities for daytoday administration of cases. 
Debtors, creditors, and third parties with adverse interests to the trustee were concerned that the court, which previously appointed and supervised the trustee, would not impartially adjudicate their rights as adversaries of that trustee. To address these concerns, judicial and administrative functions within the bankruptcy system were bifurcated.", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "data_source": { + "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", + "version": "83125548004193369404829885052395764226", + "record_locator": { + "protocol": "box", + "remote_file_path": "box://utic-test-ingest-fixtures" + }, + "date_created": "1688874451.0", + "date_modified": "1688874451.0" + } + } + }, + { + "type": "NarrativeText", + "element_id": "4a3de42983fb56345c598326c3732769", + "text": "Many administrative functions formerly performed by the court were placed within the Department of Justice through the creation of the Program. Among the administrative functions assigned to the United States Trustee were the appointment and supervision of chapter 13 trustees./ This Handbook is issued under the authority of the Program\u2019s enabling statutes. 
", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "data_source": { + "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", + "version": "83125548004193369404829885052395764226", + "record_locator": { + "protocol": "box", + "remote_file_path": "box://utic-test-ingest-fixtures" + }, + "date_created": "1688874451.0", + "date_modified": "1688874451.0" + } + } + }, + { + "type": "Title", + "element_id": "1b11ebe52652656e0ed8c12e5969de9b", + "text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "data_source": { + "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", + "version": "83125548004193369404829885052395764226", + "record_locator": { + "protocol": "box", + "remote_file_path": "box://utic-test-ingest-fixtures" + }, + "date_created": "1688874451.0", + "date_modified": "1688874451.0" + } + } + }, + { + "type": "NarrativeText", + "element_id": "5820e4e6e72ffc7a9f962983c727f9a9", + "text": "The standing trustee has a fiduciary responsibility to the bankruptcy estate. The standing trustee is more than a mere disbursing agent. The standing trustee must be personally involved in the trustee operation. If the standing trustee is or becomes unable to perform the duties and responsibilities of a standing trustee, the standing trustee must immediately advise the United States Trustee. 28 U.S.C. \u00a7 586(b), 28 C.F.R. \u00a7 58.4(b) referencing 28 C.F.R. 
\u00a7 58.3(b).", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "data_source": { + "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", + "version": "83125548004193369404829885052395764226", + "record_locator": { + "protocol": "box", + "remote_file_path": "box://utic-test-ingest-fixtures" + }, + "date_created": "1688874451.0", + "date_modified": "1688874451.0" + } + } + }, + { + "type": "NarrativeText", + "element_id": "3bbf318afaf932ebb9f5e9cf1b74efa2", + "text": "Although this Handbook is not intended to be a complete statutory reference, the standing trustee\u2019s primary statutory duties are set forth in 11 U.S.C. \u00a7 1302, which incorporates by reference some of the duties of chapter 7 trustees found in 11 U.S.C. \u00a7 704. These duties include, but are not limited to, the following:", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "data_source": { + "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", + "version": "83125548004193369404829885052395764226", + "record_locator": { + "protocol": "box", + "remote_file_path": "box://utic-test-ingest-fixtures" + }, + "date_created": "1688874451.0", + "date_modified": "1688874451.0" + } + } + }, + { + "type": "Footer", + "element_id": "64a3d9e381082c0d1977ae11f4c40cf1", + "text": "Copyright", + "metadata": { + "header_footer_type": "primary", + "languages": [ + "eng" + ], + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "data_source": { + "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", + "version": "83125548004193369404829885052395764226", + "record_locator": { + "protocol": "box", + "remote_file_path": "box://utic-test-ingest-fixtures" + }, + "date_created": "1688874451.0", + "date_modified": "1688874451.0" + } + } + } +] \ No newline at end of file diff --git 
a/test_unstructured_ingest/expected-structured-output/box/nested-1/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/box/nested-1/ideas-page.html.json new file mode 100644 index 000000000..e6928373b --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/box/nested-1/ideas-page.html.json @@ -0,0 +1,24 @@ +[ + { + "type": "Table", + "element_id": "32bc8af17151389d3e80f65036f8e65b", + "text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.", + "metadata": { + "text_as_html": "
January 2023 ( Someone fed my essays into GPT to make something that could answer questions based on them, then asked it where good ideas come from. The answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange, or missing, or broken? You can see anomalies in everyday life (much of standup comedy is based on this), but the best place to look for them is at the frontiers of knowledge. Knowledge grows fractally. From a distance its edges look smooth, but when you learn enough to get close to one, you'll notice it's full of gaps. These gaps will seem obvious; it will seem inexplicable that no one has tried x or wondered about y. In the best case, exploring such gaps yields whole new fractal buds. |
January 2023 ( Someone fed my essays into GPT to make something that could answer questions based on them, then asked it where good ideas come from. The answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange, or missing, or broken? You can see anomalies in everyday life (much of standup comedy is based on this), but the best place to look for them is at the frontiers of knowledge. Knowledge grows fractally. From a distance its edges look smooth, but when you learn enough to get close to one, you'll notice it's full of gaps. These gaps will seem obvious; it will seem inexplicable that no one has tried x or wondered about y. In the best case, exploring such gaps yields whole new fractal buds. |