From 0c562d80503f6ef96504c6e38f27cfd9da8761df Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Tue, 9 Jul 2024 22:29:07 -0700 Subject: [PATCH] rfctr(auto): fix auto-partition test xfails and skips (#3367) **Summary** Improve expression in auto-partition tests and fix xfails and skips. Add issues for the two hard-fails where xfail needed to stay. --- CHANGELOG.md | 2 +- example-docs/simple.json | 127 +++ example-docs/spring-weather.html.json | 1124 ++++++++++++++++++++-- test_unstructured/partition/test_auto.py | 411 ++++---- unstructured/__version__.py | 2 +- 5 files changed, 1360 insertions(+), 306 deletions(-) create mode 100644 example-docs/simple.json diff --git a/CHANGELOG.md b/CHANGELOG.md index 31420ba82..851fc2c5e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.14.11-dev4 +## 0.14.11-dev5 ### Enhancements diff --git a/example-docs/simple.json b/example-docs/simple.json new file mode 100644 index 000000000..cd47b9e73 --- /dev/null +++ b/example-docs/simple.json @@ -0,0 +1,127 @@ +[ + { + "element_id": "a06d2d9e65212d4aa955c3ab32950ffa", + "metadata": { + "category_depth": 0, + "file_directory": "unstructured/example-docs", + "filename": "simple.docx", + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "last_modified": "2024-07-06T16:44:51" + }, + "text": "These are a few of my favorite things:", + "type": "Title" + }, + { + "element_id": "b334c93e9b1cbca3b6f6d78ce8bc2484", + "metadata": { + "category_depth": 0, + "file_directory": "unstructured/example-docs", + "filename": "simple.docx", + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "last_modified": "2024-07-06T16:44:51", + "parent_id": "a06d2d9e65212d4aa955c3ab32950ffa" + }, + "text": "Parrots", + "type": "ListItem" + }, + { + "element_id": "76469ecb9f1459943c8d8cca1a550b5a", + "metadata": { + "category_depth": 0, + "file_directory": "unstructured/example-docs", + "filename": "simple.docx", + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "last_modified": "2024-07-06T16:44:51", + "parent_id": "a06d2d9e65212d4aa955c3ab32950ffa" + }, + "text": "Hockey", + "type": "ListItem" + }, + { + "element_id": "261fac731945a138415adc2dd4434b17", + "metadata": { + "category_depth": 0, + "file_directory": "unstructured/example-docs", + "filename": "simple.docx", + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "last_modified": "2024-07-06T16:44:51" + }, + "text": "Analysis", + "type": "Title" + }, + { + "element_id": "95f392d32c5271bfdb30eaef45921e59", + "metadata": { + "category_depth": 0, + "file_directory": "unstructured/example-docs", + "filename": "simple.docx", + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "last_modified": "2024-07-06T16:44:51", + "parent_id": "261fac731945a138415adc2dd4434b17" + }, + "text": "This is my first thought. This is my second thought.", + "type": "NarrativeText" + }, + { + "element_id": "0de25bd6f0d74bc4f909f2678f385736", + "metadata": { + "category_depth": 0, + "file_directory": "unstructured/example-docs", + "filename": "simple.docx", + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "last_modified": "2024-07-06T16:44:51", + "parent_id": "261fac731945a138415adc2dd4434b17" + }, + "text": "This is my third thought.", + "type": "NarrativeText" + }, + { + "element_id": "f296a3bc8a901f19199fda1da92829b6", + "metadata": { + "category_depth": 0, + "file_directory": "unstructured/example-docs", + "filename": "simple.docx", + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "last_modified": "2024-07-06T16:44:51", + "parent_id": "261fac731945a138415adc2dd4434b17" + }, + "text": "2023", + "type": "UncategorizedText" + }, + { + "element_id": "78c62edbc674fdca0f6a0e3ffb459f86", + "metadata": { + "category_depth": 0, + "file_directory": "unstructured/example-docs", + "filename": "simple.docx", + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "last_modified": "2024-07-06T16:44:51" + }, + "text": "DOYLESTOWN, PA 18901", + "type": "Address" + } +] \ No newline at end of file diff --git a/example-docs/spring-weather.html.json b/example-docs/spring-weather.html.json index 1b3b4a980..591a796c9 100644 --- a/example-docs/spring-weather.html.json +++ b/example-docs/spring-weather.html.json @@ -1,226 +1,1178 @@ [ { - "element_id": "41f6e17bf5e9a407fcca74e902f802a0", + "type": "Title", + "element_id": "fb902c5b26b38e2d35a70a55d43a5de6", "text": "News Around NOAA", - "type": "Title", "metadata": { - "page_number": 1 + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "aa589c25dc22dcc8a75baba1244e6c8f", + "type": "Title", + "element_id": "100233c72890df3d216e2bc2c36f7153", "text": "National Program", - "type": "Title", "metadata": { - "page_number": 1 + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "62c26d2e16774d2334bd804c7bb6a711", + "type": "Title", + "element_id": "88f0bebe7a9cca77675bd8a5db823092", "text": "Are You Weather-Ready for the Spring?", - "type": "Title", "metadata": { - "page_number": 1 + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "32709cd3bec72640bbbe32f58e6e23f6", + "type": "Title", + "element_id": "568c824acda361cfc270a75e2eca7a23", "text": "Weather.gov >", - "type": "Title", "metadata": { - "page_number": 1 + "link_texts": [ + "Weather.gov" + ], + "link_urls": [ + "https://www.weather.gov" + ], + "link_start_indexes": [ + -1 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "2661da76db570876b075083aaeeaee55", + "type": "Title", + "element_id": "767e68cdb3d891322eb8b65489f53b4c", "text": "News Around NOAA > Are You Weather-Ready for the Spring?", - "type": "Title", "metadata": { - "page_number": 1 + "link_texts": [ + "News Around NOAA" + ], + "link_urls": [ + "https://www.weather.gov/news" + ], + "link_start_indexes": [ + -1 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "fab6c4df083f0fb6f324fff65b652c86", + "type": "ListItem", + "element_id": "79fb885317b2666481d0a1c31970400d", "text": "Weather Safety Air Quality Beach Hazards Cold Cold Water Drought Floods Fog Heat Hurricanes Lightning Safety Rip Currents Safe Boating Space Weather Sun (Ultraviolet Radiation) Thunderstorms & Tornadoes Tornado Tsunami Wildfire Wind Winter", - "type": "ListItem", "metadata": { - "page_number": 1 + "link_texts": [ + "Weather Safety", + "Air Quality", + "Beach Hazards", + "Cold", + "Cold Water", + "Drought", + "Floods", + "Fog", + "Heat", + " Hurricanes", + " Lightning Safety", + "Rip Currents", + "Safe Boating", + "Space Weather", + "Sun (Ultraviolet Radiation)", + " Thunderstorms & Tornadoes", + "Tornado", + "Tsunami", + "Wildfire", + "Wind", + "Winter" + ], + "link_urls": [ + "http://www.weather.gov/safetycampaign", + "https://www.weather.gov/safety/airquality", + "https://www.weather.gov/safety/beachhazards", + "https://www.weather.gov/safety/cold", + "https://www.weather.gov/safety/coldwater", + "https://www.weather.gov/safety/drought", + "https://www.weather.gov/safety/flood", + "https://www.weather.gov/safety/fog", + "https://www.weather.gov/safety/heat", + "https://www.weather.gov/safety/hurricane", + "https://www.weather.gov/safety/lightning", + "https://www.weather.gov/safety/ripcurrent", + "https://www.weather.gov/safety/safeboating", + "https://www.weather.gov/safety/space", + "https://www.weather.gov/safety/heat-uv", + "https://www.weather.gov/safety/thunderstorm", + "https://www.weather.gov/safety/tornado", + "https://www.weather.gov/safety/tsunami", + "https://www.weather.gov/safety/wildfire", + "https://www.weather.gov/safety/wind", + "https://www.weather.gov/safety/winter " + ], + "link_start_indexes": [ + 0, + 14, + 25, + 38, + 42, + 52, + 59, + 65, + 68, + 72, + 83, + 100, + 112, + 124, + 137, + 164, + 190, + 197, + 204, + 212, + 216 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "45c26cf3457e6d18985a435e2c0fcc65", + "type": "ListItem", + "element_id": "512e6a00cacb0ab139ede6b0145f441d", "text": "Safety Campaigns Seasonal Safety Campaigns #SafePlaceSelfie Deaf & Hard of Hearing Intellectual Disabilities Spanish-language Content The Great Outdoors", - "type": "ListItem", "metadata": { - "page_number": 1 + "link_texts": [ + "Safety Campaigns", + "Seasonal Safety Campaigns", + "#SafePlaceSelfie", + "Deaf & Hard of Hearing", + "Intellectual Disabilities", + "Spanish-language Content", + "The Great Outdoors" + ], + "link_urls": [ + "https://www.weather.gov/safetycampaign", + "https://www.weather.gov/safetycampaign", + "https://www.weather.gov/wrn/safeplaceselfie", + "https://www.weather.gov/wrn/dhh-safety", + "https://www.weather.gov/wrn/intellectualdisabilities", + "https://www.weather.gov/wrn/fall2020-espanol-sm", + "https://www.noaa.gov/explainers/great-outdoors-weather-safety" + ], + "link_start_indexes": [ + 0, + 16, + 41, + 57, + 79, + 104, + 128 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "77f5acc603de9a165ed87a5c3fbaf14a", - "text": "Ambassador About WRN Ambassadors Become an Ambassador Ambassadors of Excellence People of WRN FAQS Tell Your Success Story Success Stories Tri-fold Aviation Current Ambassadors Brochure En Español", "type": "ListItem", + "element_id": "d4145282089e41261300a9bcf440edb9", + "text": "Ambassador About WRN Ambassadors Become an Ambassador Ambassadors of Excellence People of WRN FAQS Tell Your Success Story Success Stories Tri-fold Aviation Current Ambassadors Brochure En Espa\u00f1ol", "metadata": { - "page_number": 1 + "link_texts": [ + "Ambassador", + "About WRN Ambassadors", + "Become an Ambassador", + "Ambassadors of Excellence", + "People of WRN", + " FAQS", + "Tell Your Success Story", + " Success Stories", + "Tri-fold", + "Aviation", + " Current Ambassadors", + "Brochure", + "En Espa\u00f1ol" + ], + "link_urls": [ + "https://www.weather.gov/wrn/ambassadors", + "https://www.weather.gov/wrn/ambassadors", + "https://www.weather.gov/wrn/amb-tou", + "https://www.weather.gov/wrn/ambassador_recognition", + "https://www.weather.gov/people/", + "https://www.weather.gov/wrn/amb-faqs", + "https://docs.google.com/forms/d/e/1FAIpQLScPHee5WAyC5K1LZ3pWLa2zjaM1HZSKN4_AxGUc6RaCy_gxLA/viewform", + " https://www.weather.gov/wrn/success-stories", + "http://www.weather.gov/media/wrn/WRN_Ambassador_Trifold.pdf", + "https://www.weather.gov/wrn/aviation", + " http://www.weather.gov/wrn/current-ambassadors", + "http://www.weather.gov/media/wrn/WRN_Ambassador_Flyer.pdf", + "https://www.weather.gov/wrn/en-espanol" + ], + "link_start_indexes": [ + 0, + 10, + 31, + 51, + 76, + 89, + 94, + 117, + 133, + 141, + 149, + 169, + 177 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "8f19bcaabbd1bafa5e9826ac69766c8b", + "type": "ListItem", + "element_id": "aeee9b1d3904eda123d21c851ce4747d", "text": "Education NWS Education Home Be A Force Of Nature WRN Kids Flyer Wireless Emergency Alerts NOAA Weather Radio Mobile Weather Brochures Hourly Weather Forecast Citizen Science Intellectual Disabilities", - "type": "ListItem", "metadata": { - "page_number": 1 + "link_texts": [ + "Education", + "NWS Education Home", + "Be A Force Of Nature", + "WRN Kids Flyer", + "Wireless Emergency Alerts", + "NOAA Weather Radio", + "Mobile Weather", + "Brochures", + "Hourly Weather Forecast", + "Citizen Science", + "Intellectual Disabilities" + ], + "link_urls": [ + "http://www.weather.gov/owlie/", + "http://www.weather.gov/owlie/", + "https://www.weather.gov/wrn/force", + " http://www.weather.gov/media/owlie/nws_kids_fact_sheet2.pdf", + "https://www.weather.gov/wrn/wea", + "http://www.nws.noaa.gov/nwr/", + "https://www.weather.gov/wrn/mobile-phone", + "http://www.weather.gov/owlie/publication_brochures", + "https://www.weather.gov/wrn/hourly-weather-graph", + "http://www.weather.gov/media/wrn/citizen_science_page.pdf", + "https://www.weather.gov/wrn/intellectualdisabilities" + ], + "link_start_indexes": [ + 0, + 9, + 27, + 47, + 61, + 86, + 104, + 118, + 127, + 150, + 165 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "1245f9cf9e019713391e4ee3bac54a63", - "text": "Collaboration Get Involved Social Media WRN Ambassadors ​ Enterprise Resources StormReady TsunamiReady NWSChat (core partners only) InteractiveNWS (iNWS) (core partners only)​ SKYWARN", "type": "ListItem", + "element_id": "752f5b846e4a24df6d62d9dc014e5aec", + "text": "Collaboration Get Involved Social Media WRN Ambassadors \u200b Enterprise Resources StormReady TsunamiReady NWSChat (core partners only) InteractiveNWS (iNWS) (core partners only)\u200b SKYWARN", "metadata": { - "page_number": 1 + "link_texts": [ + "Collaboration", + "Get Involved ", + "Social Media", + "WRN Ambassadors \u200b", + "Enterprise Resources", + "StormReady", + "TsunamiReady", + "NWSChat (core partners only)", + "InteractiveNWS (iNWS) (core partners only)\u200b", + "SKYWARN" + ], + "link_urls": [ + "https://www.weather.gov/wrn/collaborate", + "https://www.weather.gov/wrn/get-involved", + "http://www.weather.gov/socialmedia", + "https://www.weather.gov/wrn/ambassadors", + "https://www.weather.gov/enterprise/", + "http://www.weather.gov/stormready/", + "https://www.weather.gov/tsunamiready/", + "https://nwschat.weather.gov/", + "https://inws.ncep.noaa.gov/", + "https://www.weather.gov/SKYWARN" + ], + "link_start_indexes": [ + 0, + 13, + 26, + 38, + 55, + 75, + 85, + 97, + 125, + 168 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "23dfa7f98424dbf86e00b3d500096dfa", + "type": "ListItem", + "element_id": "8729b5380b0f442c0512948bd18de66b", "text": "News & Events Latest News Calendar Meetings & Workshops NWS Aware Newsletter", - "type": "ListItem", "metadata": { - "page_number": 1 + "link_texts": [ + " News & Events", + "Latest News", + "Calendar", + "Meetings & Workshops", + "NWS Aware Newsletter" + ], + "link_urls": [ + "http://www.weather.gov/news/", + " http://www.weather.gov/news/", + "https://www.weather.gov/wrn/calendar", + " https://www.weather.gov/wrn/workshops", + "https://www.weather.gov/publications/aware" + ], + "link_start_indexes": [ + 0, + 14, + 25, + 33, + 53 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "93202df2ec7081b28b47901b5c287a5a", + "type": "ListItem", + "element_id": "ec0f9efa0e7de0d7bbf11f3b8fb2a1ca", "text": "International", - "type": "ListItem", "metadata": { - "page_number": 1 + "link_texts": [ + "International" + ], + "link_urls": [ + "https://www.weather.gov/wrn/wrns" + ], + "link_start_indexes": [ + 0 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "e53d6a9c615bdf1a8d7b98a67cade488", + "type": "ListItem", + "element_id": "f17da617a620de011003a204ecf48752", "text": "About Contact Us What is WRN? WRN FAQ WRN Brochure Hazard Simplification IDSS Brochure Roadmap Strategic Plan WRN International Social Science", - "type": "ListItem", "metadata": { - "page_number": 1 + "link_texts": [ + "About", + "Contact Us", + " What is WRN?", + " WRN FAQ", + "WRN Brochure", + "Hazard Simplification", + "IDSS Brochure", + "Roadmap", + "Strategic Plan", + "WRN International", + "Social Science" + ], + "link_urls": [ + "https://www.weather.gov/wrn/about", + " https://www.weather.gov/wrn/contact", + "https://www.weather.gov/wrn/about", + "https://www.weather.gov/wrn/faqs", + "http://www.weather.gov/media/wrn/WRN_Ambassador_Flyer.pdf", + "https://www.weather.gov/hazardsimplification/", + "https://www.weather.gov/media/wrn/2018-IDSS2-Pager.pdf", + "http://www.weather.gov/media/wrn/nws_wrn_roadmap_final_april17.pdf", + "https://www.weather.gov/media/wrn/NWS_Weather-Ready-Nation_Strategic_Plan_2019-2022.pdf", + " https://www.weather.gov/wrn/international", + "https://vlab.noaa.gov/web/nws-social-science" + ], + "link_start_indexes": [ + 0, + 5, + 15, + 28, + 36, + 48, + 69, + 82, + 89, + 103, + 120 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "6cbcf8c11f8c0781bd9ecc7f67169ff0", - "text": "The spring season is all about change – a rebirth both literally and figuratively. Even though the spring season doesn’t officially (astronomically, that is) begin until March 20 this year, climatologically, it starts March 1.", "type": "NarrativeText", + "element_id": "623c25f2247b125d6df5138a7c5ee153", + "text": "The spring season is all about change \u2013 a rebirth both literally and figuratively. Even though the spring season doesn\u2019t officially (astronomically, that is) begin until March 20 this year, climatologically, it starts March 1.", "metadata": { - "page_number": 1 + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "7184168da442c6ef28553b274bf2be8f", + "type": "NarrativeText", + "element_id": "c8c953bd87e4571df8e6486e9c467861", "text": "As cold winter nights are replaced by the warmth of longer daylight hours, the National Weather Service invites you to do two important things that may save your life or the life of a loved one.", - "type": "NarrativeText", "metadata": { - "page_number": 1 + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "f3be9748ecd68b20d706548129baa22d", - "text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.\nThis could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”", "type": "NarrativeText", + "element_id": "b6553aef4dc61e5d31e2e28426e56f0b", + "text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.", "metadata": { - "page_number": 1 + "emphasized_text_contents": [ + "First, take steps to better prepare for the seasonal hazards weather can throw at you." + ], + "emphasized_text_tags": [ + "strong" + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "126c3cd201fb259cfeabc6bffc0b5473", - "text": "Second, encourage others to become Weather-Ready as well. Share the message by taking advantage of our vast array of weather safety content – everything posted on our Spring Safety website is freely available, and we encourage sharing on social media networks. Also remember those who are most vulnerable, like an elderly family member or neighbor who might have limited mobility or is isolated. Reach out to those who are at higher risk of being impacted by extreme weather, and help them get prepared. This simple act of caring could become heroic.", "type": "NarrativeText", + "element_id": "ac246c4693669d08d274f628c3293a78", + "text": "This could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become \u201cweather-ready.\u201d", "metadata": { - "page_number": 1 + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "c1944fb037f3e1cb14969bc59a7dd9c2", - "text": "This spring, the campaign is focused on heat dangers. Heat illness and death can occur even in spring’s moderately warm weather. The majority of all heat-related deaths occur outside of heat waves and roughly a third of child hot car deaths occur outside of the summer months. Learn more by viewing the infographics that are now available.", "type": "NarrativeText", + "element_id": "d1fa2a66a4df9759bdf01f6f1ec51d8e", + "text": "Second, encourage others to become Weather-Ready as well. Share the message by taking advantage of our vast array of weather safety content \u2013 everything posted on our Spring Safety website is freely available, and we encourage sharing on social media networks. Also remember those who are most vulnerable, like an elderly family member or neighbor who might have limited mobility or is isolated. Reach out to those who are at higher risk of being impacted by extreme weather, and help them get prepared. This simple act of caring could become heroic.", "metadata": { - "page_number": 1 + "emphasized_text_contents": [ + "Second, encourage others to become Weather-Ready as well." + ], + "emphasized_text_tags": [ + "strong" + ], + "link_texts": [ + "Spring Safety website" + ], + "link_urls": [ + "https://www.weather.gov/wrn/spring-safety" + ], + "link_start_indexes": [ + 167 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "fa1b939ef6159d95260bc095f58ebbc2", + "type": "NarrativeText", + "element_id": "996f1b86d1cb5a02028bd3816f5790f1", + "text": "This spring, the campaign is focused on heat dangers. Heat illness and death can occur even in spring\u2019s moderately warm weather. The majority of all heat-related deaths occur outside of heat waves and roughly a third of child hot car deaths occur outside of the summer months. Learn more by viewing the infographics that are now available.", + "metadata": { + "link_texts": [ + "infographics" + ], + "link_urls": [ + "https://www.weather.gov/wrn/spring-infographics" + ], + "link_start_indexes": [ + 303 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } + } + }, + { + "type": "NarrativeText", + "element_id": "90b31790a9b5fd903e6dbaea50e05f45", "text": "Stay safe this spring, and every season, by being informed, prepared, and Weather-Ready.", - "type": "NarrativeText", "metadata": { - "page_number": 1 + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "47d5d0d27a35a36d7467dfc8b6e089b3", - "text": "US Dept of Commerce\n National Oceanic and Atmospheric Administration\n National Weather Service\n News Around NOAA1325 East West HighwaySilver Spring, MD 20910Comments? Questions? Please Contact Us.", - "type": "NarrativeText", + "type": "Title", + "element_id": "9dcf311a7e6225af9333100c709b7f23", + "text": "US Dept of Commerce", "metadata": { - "page_number": 1 + "link_texts": [ + "US Dept of Commerce" + ], + "link_urls": [ + "http://www.commerce.gov" + ], + "link_start_indexes": [ + -1 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "129c678fce59acee7ac6a6fdb67b6310", + "type": "Title", + "element_id": "60711b68cb732ecb10f4c05f0f784647", + "text": "National Oceanic and Atmospheric Administration", + "metadata": { + "link_texts": [ + "National Oceanic and Atmospheric Administration" + ], + "link_urls": [ + "http://www.noaa.gov" + ], + "link_start_indexes": [ + -1 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } + } + }, + { + "type": "Title", + "element_id": "55ca4bf03b04ffacb8ea8cb528c22a6f", + "text": "National Weather Service", + "metadata": { + "link_texts": [ + "National Weather Service" + ], + "link_urls": [ + "https://www.weather.gov" + ], + "link_start_indexes": [ + -1 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } + } + }, + { + "type": "Title", + "element_id": "3ebaebb5791662dfa6d2e2b8af436f9d", + "text": "News Around NOAA", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } + } + }, + { + "type": "Title", + "element_id": "ccf5cdb2984d2ac2d934010960d32aca", + "text": "1325 East West Highway", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } + } + }, + { + "type": "Address", + "element_id": "64a081cb854ff90dbc668c2b334d0ae8", + "text": "Silver Spring, MD 20910", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } + } + }, + { + "type": "Title", + "element_id": "6af532045e3aa6fe3764590594dc0dd7", + "text": "Comments? Questions? Please Contact Us.", + "metadata": { + "link_texts": [ + "Comments? Questions? Please Contact Us." + ], + "link_urls": [ + "https://www.weather.gov/news/contact" + ], + "link_start_indexes": [ + -1 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } + } + }, + { + "type": "Title", + "element_id": "a63c69dcc655b1b32bc6157427e9ca8e", "text": "Disclaimer", - "type": "Title", "metadata": { - "page_number": 1 + "link_texts": [ + "Disclaimer" + ], + "link_urls": [ + "https://www.weather.gov/disclaimer" + ], + "link_start_indexes": [ + -1 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "3c96caaebd949e39d25b3ccf4133c5d8", + "type": "Title", + "element_id": "95054785187bcc0cf98cdb17c135ca1d", "text": "Information Quality", - "type": "Title", "metadata": { - "page_number": 1 + "link_texts": [ + "Information Quality" + ], + "link_urls": [ + "http://www.cio.noaa.gov/services_programs/info_quality.html" + ], + "link_start_indexes": [ + -1 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "b79cac926e0b2e347e72cc91d5174037", + "type": "Title", + "element_id": "800d660faa52732cd4d361b187bbd6e2", "text": "Help", - "type": "Title", "metadata": { - "page_number": 1 + "link_texts": [ + "Help" + ], + "link_urls": [ + "https://www.weather.gov/help" + ], + "link_start_indexes": [ + -1 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "4c4e436f9a453c776dbf011f98d932d6", + "type": "Title", + "element_id": "718284e0cdf275514b6aa8fb8976a7cc", "text": "Glossary", - "type": "Title", "metadata": { - "page_number": 1 + "link_texts": [ + "Glossary" + ], + "link_urls": [ + "http://www.weather.gov/glossary" + ], + "link_start_indexes": [ + -1 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "506ff394621596dd88138642eddfc1e4", + "type": "Title", + "element_id": "678ef3e5cd635ba851d2dfd7f6f20d0f", "text": "Privacy Policy", - "type": "Title", "metadata": { - "page_number": 1 + "link_texts": [ + "Privacy Policy" + ], + "link_urls": [ + "https://www.weather.gov/privacy" + ], + "link_start_indexes": [ + -1 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "c70ae8c30a61c450d2c5148d1b6a0447", + "type": "Title", + "element_id": "f66ad83bfffccef0afe60d0aaba55b54", "text": "Freedom of Information Act (FOIA)", - "type": "Title", "metadata": { - "page_number": 1 + "link_texts": [ + "Freedom of Information Act (FOIA)" + ], + "link_urls": [ + "https://www.noaa.gov/foia-freedom-of-information-act" + ], + "link_start_indexes": [ + -1 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "5d8c71abc527284cd463aa58f3f48098", + "type": "Title", + "element_id": "f50c4a988c7336b9d1100227fa7f03a3", "text": "About Us", - "type": "Title", "metadata": { - "page_number": 1 + "link_texts": [ + "About Us" + ], + "link_urls": [ + "https://www.weather.gov/about" + ], + "link_start_indexes": [ + -1 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } }, { - "element_id": "a8a00c355d2fa1461d532a1088274f32", - "text": "Career Opportunities", "type": "Title", + "element_id": "a9a5f8ac29adb68999173b4e65a189bd", + "text": "Career Opportunities", "metadata": { - "page_number": 1 + "link_texts": [ + "Career Opportunities" + ], + "link_urls": [ + "https://www.weather.gov/careers" + ], + "link_start_indexes": [ + -1 + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "abfs://container1/spring-weather.html", + "version": "162215905222974206637545574128436022861", + "record_locator": { + "protocol": "abfs", + "remote_file_path": "abfs://container1/" + }, + "date_created": "1678441216.0", + "date_modified": "1678441216.0" + } } } -] +] \ No newline at end of file diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 64de49f36..6752e222f 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -5,6 +5,7 @@ from __future__ import annotations import json import os import pathlib +import sys import tempfile import warnings from importlib import import_module @@ -51,19 +52,7 @@ from unstructured.partition import auto from unstructured.partition.auto import _get_partition_with_extras, partition from unstructured.partition.common import convert_office_doc from unstructured.partition.utils.constants import PartitionStrategy -from unstructured.staging.base import elements_to_json - -DIRECTORY = pathlib.Path(__file__).parent.resolve() -EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs") - -EXPECTED_EMAIL_OUTPUT = [ - NarrativeText(text="This is a test email to use for unit tests."), - Title(text="Important points:"), - ListItem(text="Roses are red"), - ListItem(text="Violets are blue"), -] - -EML_TEST_FILE = "eml/fake-email.eml" +from unstructured.staging.base import elements_from_json, elements_to_dicts, elements_to_json is_in_docker = os.path.exists("/.dockerenv") @@ -98,7 +87,6 @@ def test_auto_partition_csv_from_file(): # ================================================================================================ -@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container") @pytest.mark.parametrize( ("pass_metadata_filename", "content_type"), [(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)], @@ -126,20 +114,30 @@ def test_auto_partition_doc_with_filename( assert elements[0].metadata.file_directory == str(tmp_path) -# NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to -# determine that the file is an .doc document -@pytest.mark.xfail() -def test_auto_partition_doc_with_file( - mock_docx_document: Document, expected_docx_elements: list[Element], tmp_path: pathlib.Path -): - docx_filename = str(tmp_path / "mock_document.docx") - doc_filename = str(tmp_path / "mock_document.doc") - mock_docx_document.save(docx_filename) - convert_office_doc(docx_filename, str(tmp_path), "doc") +@pytest.mark.skipif(is_in_docker, reason="Passes in CI but not Docker. Remove skip on #3364 fix.") +@pytest.mark.xfail(sys.platform == "darwin", reason="#3364", raises=KeyError, strict=True) +def test_auto_partition_doc_with_file(): + # -- NOTE(scanny): https://github.com/Unstructured-IO/unstructured/issues/3364 + # -- detect_filetype() identifies .doc as `application/x-ole-storage` which is true but not + # -- specific enough. The `FileType.MSG` file-type is assigned (which is also an OLE file) + # -- and `partition()` routes the document to `partition_msg` which is where the `KeyError` + # -- comes from. + # -- For some reason, this xfail problem only occurs locally, not in CI, possibly because we + # -- use two different `libmagic` sourcs (`libmagic` on CI and `libmagic1` on Mac). Doesn't + # -- matter much though because when we add disambiguation they'll both get it right. + with open(example_doc_path("simple.doc"), "rb") as f: + elements = partition(file=f) - with open(doc_filename, "rb") as f: - elements = partition(file=f, strategy=PartitionStrategy.HI_RES) - assert elements == expected_docx_elements + assert elements == [ + Title("These are a few of my favorite things:"), + ListItem("Parrots"), + ListItem("Hockey"), + Title("Analysis"), + NarrativeText("This is my first thought. This is my second thought."), + NarrativeText("This is my third thought."), + Text("2023"), + Address("DOYLESTOWN, PA 18901"), + ] # ================================================================================================ @@ -184,21 +182,21 @@ def expected_docx_elements(): def test_auto_partition_docx_with_filename( mock_docx_document: Document, expected_docx_elements: list[Element], tmp_path: pathlib.Path ): - filename = str(tmp_path / "mock_document.docx") - mock_docx_document.save(filename) + file_path = str(tmp_path / "mock_document.docx") + mock_docx_document.save(file_path) - elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES) + elements = partition(filename=file_path, strategy=PartitionStrategy.HI_RES) assert elements == expected_docx_elements - assert elements[0].metadata.filename == os.path.basename(filename) + assert elements[0].metadata.filename == os.path.basename(file_path) def test_auto_partition_docx_with_file( mock_docx_document: Document, expected_docx_elements: list[Element], tmp_path: pathlib.Path ): - filename = str(tmp_path / "mock_document.docx") - mock_docx_document.save(filename) + file_path = str(tmp_path / "mock_document.docx") + mock_docx_document.save(file_path) - with open(filename, "rb") as f: + with open(file_path, "rb") as f: elements = partition(file=f, strategy=PartitionStrategy.HI_RES) assert elements == expected_docx_elements @@ -246,34 +244,32 @@ def test_partition_forwards_strategy_arg_to_partition_docx_and_its_brokers( # EML # ================================================================================================ +EXPECTED_EMAIL_OUTPUT = [ + NarrativeText(text="This is a test email to use for unit tests."), + Title(text="Important points:"), + ListItem(text="Roses are red"), + ListItem(text="Violets are blue"), +] + def test_auto_partition_email_from_filename(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE) - elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES) + file_path = example_doc_path("eml/fake-email.eml") + elements = partition(file_path, strategy=PartitionStrategy.HI_RES) assert len(elements) > 0 assert elements == EXPECTED_EMAIL_OUTPUT - assert elements[0].metadata.filename == os.path.basename(filename) - assert elements[0].metadata.file_directory == os.path.split(filename)[0] + assert elements[0].metadata.filename == os.path.basename(file_path) + assert elements[0].metadata.file_directory == os.path.split(file_path)[0] def test_auto_partition_email_from_file(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE) - with open(filename, "rb") as f: - elements = partition(file=f, strategy=PartitionStrategy.HI_RES) - assert len(elements) > 0 - assert elements == EXPECTED_EMAIL_OUTPUT - - -def test_auto_partition_email_from_file_rb(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE) - with open(filename, "rb") as f: + with open(example_doc_path("eml/fake-email.eml"), "rb") as f: elements = partition(file=f, strategy=PartitionStrategy.HI_RES) assert len(elements) > 0 assert elements == EXPECTED_EMAIL_OUTPUT def test_auto_partition_eml_add_signature_to_metadata(): - elements = partition(filename="example-docs/eml/signed-doc.p7s") + elements = partition(example_doc_path("eml/signed-doc.p7s")) assert len(elements) == 1 assert elements[0].text == "This is a test" assert elements[0].metadata.signature == "\n" @@ -285,15 +281,13 @@ def test_auto_partition_eml_add_signature_to_metadata(): def test_auto_partition_epub_from_filename(): - filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub") - elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES) + elements = partition(example_doc_path("winter-sports.epub"), strategy=PartitionStrategy.HI_RES) assert len(elements) > 0 assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports") def test_auto_partition_epub_from_file(): - filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub") - with open(filename, "rb") as f: + with open(example_doc_path("winter-sports.epub"), "rb") as f: elements = partition(file=f, strategy=PartitionStrategy.HI_RES) assert len(elements) > 0 assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports") @@ -309,17 +303,17 @@ def test_auto_partition_epub_from_file(): [(False, None), (False, "text/html"), (True, "text/html"), (True, None)], ) def test_auto_partition_html_from_filename(pass_metadata_filename: bool, content_type: str | None): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html") - metadata_filename = filename if pass_metadata_filename else None + file_path = example_doc_path("example-10k.html") + metadata_filename = file_path if pass_metadata_filename else None elements = partition( - filename=filename, + filename=file_path, metadata_filename=metadata_filename, content_type=content_type, strategy=PartitionStrategy.HI_RES, ) assert len(elements) > 0 - assert elements[0].metadata.filename == os.path.basename(filename) - assert elements[0].metadata.file_directory == os.path.split(filename)[0] + assert elements[0].metadata.filename == os.path.basename(file_path) + assert elements[0].metadata.file_directory == os.path.split(file_path)[0] @pytest.mark.parametrize( @@ -327,9 +321,9 @@ def test_auto_partition_html_from_filename(pass_metadata_filename: bool, content [(False, None), (False, "text/html"), (True, "text/html"), (True, None)], ) def test_auto_partition_html_from_file(pass_metadata_filename: bool, content_type: str | None): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html") - metadata_filename = filename if pass_metadata_filename else None - with open(filename, "rb") as f: + file_path = example_doc_path("fake-html.html") + metadata_filename = file_path if pass_metadata_filename else None + with open(file_path, "rb") as f: elements = partition( file=f, metadata_filename=metadata_filename, @@ -340,8 +334,7 @@ def test_auto_partition_html_from_file(pass_metadata_filename: bool, content_typ def test_auto_partition_html_from_file_rb(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html") - with open(filename, "rb") as f: + with open(example_doc_path("fake-html.html"), "rb") as f: elements = partition(file=f, strategy=PartitionStrategy.HI_RES) assert len(elements) > 0 @@ -367,10 +360,10 @@ def test_auto_partition_html_pre_from_file(): [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)], ) def test_auto_partition_image(pass_metadata_filename: bool, content_type: str | None): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg") - metadata_filename = filename if pass_metadata_filename else None + file_path = example_doc_path("layout-parser-paper-fast.jpg") + metadata_filename = file_path if pass_metadata_filename else None elements = partition( - filename=filename, + filename=file_path, metadata_filename=metadata_filename, content_type=content_type, strategy=PartitionStrategy.AUTO, @@ -405,10 +398,10 @@ def test_auto_partition_image_element_extraction(extract_image_block_to_payload: [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)], ) def test_auto_partition_jpg(pass_metadata_filename: bool, content_type: str | None): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg") - metadata_filename = filename if pass_metadata_filename else None + file_path = example_doc_path("layout-parser-paper-fast.jpg") + metadata_filename = file_path if pass_metadata_filename else None elements = partition( - filename=filename, + filename=file_path, metadata_filename=metadata_filename, content_type=content_type, strategy=PartitionStrategy.AUTO, @@ -421,9 +414,9 @@ def test_auto_partition_jpg(pass_metadata_filename: bool, content_type: str | No [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)], ) def test_auto_partition_jpg_from_file(pass_metadata_filename: bool, content_type: str | None): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg") - metadata_filename = filename if pass_metadata_filename else None - with open(filename, "rb") as f: + file_path = example_doc_path("layout-parser-paper-fast.jpg") + metadata_filename = file_path if pass_metadata_filename else None + with open(file_path, "rb") as f: elements = partition( file=f, metadata_filename=metadata_filename, @@ -454,19 +447,10 @@ def test_partition_image_with_bmp_with_auto(tmp_path: pathlib.Path): # ================================================================================================ -# NOTE(robinson) - skipping this test with docker image to avoid putting the -# test fixtures into the image -@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container") def test_auto_partitioned_json_output_maintains_consistency_with_fixture_elements(): """Test auto-processing an unstructured json output file by filename.""" + json_file_path = example_doc_path("spring-weather.html.json") original_file_name = "spring-weather.html" - json_file_path = ( - pathlib.Path(DIRECTORY).parents[1] - / "test_unstructured_ingest" - / "expected-structured-output" - / "azure" - / f"{original_file_name}.json" - ) with open(json_file_path) as json_f: expected_result = json.load(json_f) @@ -495,52 +479,41 @@ def test_auto_partition_json_raises_with_unprocessable_json(tmp_path: pathlib.Pa # per the Unstructured ISD format text = '{"hi": "there"}' - filename = str(tmp_path / "unprocessable.json") - with open(filename, "w") as f: + file_path = str(tmp_path / "unprocessable.json") + with open(file_path, "w") as f: f.write(text) with pytest.raises(ValueError): - partition(filename=filename) + partition(filename=file_path) @pytest.mark.xfail( - reason="parsed as text not json, https://github.com/Unstructured-IO/unstructured/issues/492", + reason=( + "https://github.com/Unstructured-IO/unstructured/issues/3365" + " partition_json() does not preserve original element-id or metadata" + ), + raises=AssertionError, + strict=True, ) -def test_auto_partition_json_from_file(): - """Test auto-processing an unstructured json output file by file handle.""" - filename = os.path.join( - EXAMPLE_DOCS_DIRECTORY, - "..", - "test_unstructured_ingest", - "expected-structured-output", - "azure-blob-storage", - "spring-weather.html.json", - ) - with open(filename) as json_f: - json_data = json.load(json_f) - with open(filename, "rb") as partition_f: - json_elems = json.loads( - cast( - str, - elements_to_json(partition(file=partition_f, strategy=PartitionStrategy.HI_RES)), - ) - ) - for elem in json_elems: - # coordinates are always in the element data structures, even if None - elem.pop("coordinates") - elem.pop("coordinate_system") - assert json_data == json_elems +def test_auto_partition_json_from_file_preserves_original_elements(): + file_path = example_doc_path("simple.json") + original_elements = elements_from_json(file_path) + + with open(file_path, "rb") as f: + partitioned_elements = partition(file=f) + + assert elements_to_dicts(partitioned_elements) == elements_to_dicts(original_elements) def test_auto_partition_works_with_unstructured_jsons(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json") - elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES) + elements = partition( + example_doc_path("spring-weather.html.json"), strategy=PartitionStrategy.HI_RES + ) assert elements[0].text == "News Around NOAA" def test_auto_partition_works_with_unstructured_jsons_from_file(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json") - with open(filename, "rb") as f: + with open(example_doc_path("spring-weather.html.json"), "rb") as f: elements = partition(file=f, strategy=PartitionStrategy.HI_RES) assert elements[0].text == "News Around NOAA" @@ -570,8 +543,7 @@ EXPECTED_MSG_OUTPUT = [ def test_auto_partition_msg_from_filename(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") - elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES) + elements = partition(example_doc_path("fake-email.msg"), strategy=PartitionStrategy.HI_RES) assert elements == EXPECTED_MSG_OUTPUT @@ -581,14 +553,12 @@ def test_auto_partition_msg_from_filename(): def test_auto_partition_odt_from_filename(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt") - elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES) + elements = partition(example_doc_path("fake.odt"), strategy=PartitionStrategy.HI_RES) assert elements[0] == Title("Lorem ipsum dolor sit amet.") def test_auto_partition_odt_from_file(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt") - with open(filename, "rb") as f: + with open(example_doc_path("fake.odt"), "rb") as f: elements = partition(file=f, strategy=PartitionStrategy.HI_RES) assert elements[0] == Title("Lorem ipsum dolor sit amet.") @@ -623,54 +593,56 @@ def test_auto_partition_org_from_file(): ("pass_metadata_filename", "content_type"), [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)], ) -def test_auto_partition_pdf_from_filename( - request: FixtureRequest, pass_metadata_filename: bool, content_type: str | None -): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf") - metadata_filename = filename if pass_metadata_filename else None +def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_type: str | None): + file_path = example_doc_path("layout-parser-paper-fast.pdf") + metadata_filename = file_path if pass_metadata_filename else None elements = partition( - filename=filename, + filename=file_path, metadata_filename=metadata_filename, content_type=content_type, strategy=PartitionStrategy.HI_RES, ) - # NOTE(alan): Xfail since new model skips the word Zejiang - request.applymarker(pytest.mark.xfail) + # NOTE(scanny): gave up trying to figure out why, but this file partitions differently locally + # (on Mac) than it does in CI. Basically the first element when partitioning locally is split + # in two when partitioning on CI. Other than that split the text is exactly the same. + idx = 2 if sys.platform == "darwin" else 3 - idx = 3 - assert isinstance(elements[idx], Title) - assert elements[idx].text.startswith("LayoutParser") + e = elements[idx] + assert isinstance(e, Title) + assert e.text.startswith("LayoutParser") + assert e.metadata.filename == os.path.basename(file_path) + assert e.metadata.file_directory == os.path.split(file_path)[0] - assert elements[idx].metadata.filename == os.path.basename(filename) - assert elements[idx].metadata.file_directory == os.path.split(filename)[0] - - idx += 1 - assert isinstance(elements[idx], NarrativeText) - assert elements[idx].text.startswith("Zejiang Shen") + e = elements[idx + 1] + assert isinstance(e, NarrativeText) + assert e.text.startswith("Zejiang Shen") def test_auto_partition_pdf_uses_table_extraction(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf") with patch( "unstructured.partition.pdf_image.ocr.process_file_with_ocr", ) as mock_process_file_with_model: - partition(filename, pdf_infer_table_structure=True, strategy=PartitionStrategy.HI_RES) + partition( + example_doc_path("layout-parser-paper-fast.pdf"), + pdf_infer_table_structure=True, + strategy=PartitionStrategy.HI_RES, + ) assert mock_process_file_with_model.call_args[1]["infer_table_structure"] def test_auto_partition_pdf_with_fast_strategy(monkeypatch: MonkeyPatch): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf") + file_path = example_doc_path("layout-parser-paper-fast.pdf") mock_return = [NarrativeText("Hello there!")] with patch.object(auto, "partition_pdf", return_value=mock_return) as mock_partition: mock_partition_with_extras_map = {"pdf": mock_partition} monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map) - partition(filename=filename, strategy=PartitionStrategy.FAST) + partition(filename=file_path, strategy=PartitionStrategy.FAST) mock_partition.assert_called_once_with( - filename=filename, + filename=file_path, file=None, url=None, strategy=PartitionStrategy.FAST, @@ -692,13 +664,11 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch: MonkeyPatch): ("pass_metadata_filename", "content_type"), [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)], ) -def test_auto_partition_pdf_from_file( - request: FixtureRequest, pass_metadata_filename: bool, content_type: str | None -): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf") - metadata_filename = filename if pass_metadata_filename else None +def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type: str | None): + file_path = example_doc_path("layout-parser-paper-fast.pdf") + metadata_filename = file_path if pass_metadata_filename else None - with open(filename, "rb") as f: + with open(file_path, "rb") as f: elements = partition( file=f, metadata_filename=metadata_filename, @@ -706,27 +676,28 @@ def test_auto_partition_pdf_from_file( strategy=PartitionStrategy.HI_RES, ) - # NOTE(alan): Xfail since new model skips the word Zejiang - request.applymarker(pytest.mark.xfail) + # NOTE(scanny): see "with_filename" version of this test above for more on this oddness + idx = 2 if sys.platform == "darwin" else 3 - idx = 3 - assert isinstance(elements[idx], Title) - assert elements[idx].text.startswith("LayoutParser") + e = elements[idx] + assert isinstance(e, Title) + assert e.text.startswith("LayoutParser") - idx += 1 - assert isinstance(elements[idx], NarrativeText) - assert elements[idx].text.startswith("Zejiang Shen") + e = elements[idx + 1] + assert isinstance(e, NarrativeText) + assert e.text.startswith("Zejiang Shen") def test_partition_pdf_does_not_raise_warning(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf") # NOTE(robinson): This is the recommended way to check that no warning is emitted, # per the pytest docs. # ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html # #additional-use-cases-of-warnings-in-tests with warnings.catch_warnings(): warnings.simplefilter("error") - partition(filename=filename, strategy=PartitionStrategy.HI_RES) + partition( + example_doc_path("layout-parser-paper-fast.pdf"), strategy=PartitionStrategy.HI_RES + ) @pytest.mark.parametrize("extract_image_block_to_payload", [False, True]) @@ -753,11 +724,11 @@ def test_auto_partition_pdf_element_extraction(extract_image_block_to_payload: b @pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container") def test_auto_partition_ppt_from_filename(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt") - elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES) + file_path = example_doc_path("fake-power-point.ppt") + elements = partition(file_path, strategy=PartitionStrategy.HI_RES) assert elements == EXPECTED_PPTX_OUTPUT - assert elements[0].metadata.filename == os.path.basename(filename) - assert elements[0].metadata.file_directory == os.path.split(filename)[0] + assert elements[0].metadata.filename == os.path.basename(file_path) + assert elements[0].metadata.file_directory == os.path.split(file_path)[0] # ================================================================================================ @@ -776,11 +747,11 @@ EXPECTED_PPTX_OUTPUT = [ def test_auto_partition_pptx_from_filename(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx") - elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES) + file_path = example_doc_path("fake-power-point.pptx") + elements = partition(file_path, strategy=PartitionStrategy.HI_RES) assert elements == EXPECTED_PPTX_OUTPUT - assert elements[0].metadata.filename == os.path.basename(filename) - assert elements[0].metadata.file_directory == os.path.split(filename)[0] + assert elements[0].metadata.filename == os.path.basename(file_path) + assert elements[0].metadata.file_directory == os.path.split(file_path)[0] @pytest.mark.parametrize("file_name", ["simple.pptx", "fake-power-point.ppt"]) @@ -848,8 +819,7 @@ def test_auto_partition_rst_from_file(): def test_auto_partition_rtf_from_filename(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-doc.rtf") - elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES) + elements = partition(example_doc_path("fake-doc.rtf"), strategy=PartitionStrategy.HI_RES) assert elements[0] == Title("My First Heading") @@ -883,17 +853,16 @@ EXPECTED_TEXT_OUTPUT = [ def test_auto_partition_text_from_filename(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt") - elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES) + file_path = example_doc_path("fake-text.txt") + elements = partition(filename=file_path, strategy=PartitionStrategy.HI_RES) assert len(elements) > 0 assert elements == EXPECTED_TEXT_OUTPUT - assert elements[0].metadata.filename == os.path.basename(filename) - assert elements[0].metadata.file_directory == os.path.split(filename)[0] + assert elements[0].metadata.filename == os.path.basename(file_path) + assert elements[0].metadata.file_directory == os.path.split(file_path)[0] def test_auto_partition_text_from_file(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt") - with open(filename, "rb") as f: + with open(example_doc_path("fake-text.txt"), "rb") as f: elements = partition(file=f, strategy=PartitionStrategy.HI_RES) assert len(elements) > 0 assert elements == EXPECTED_TEXT_OUTPUT @@ -903,10 +872,8 @@ def test_auto_partition_text_from_file(): # XLS # ================================================================================================ - EXPECTED_XLS_TEXT_LEN = 550 - EXPECTED_XLS_INITIAL_45_CLEAN_TEXT = "MC What is 2+2? 4 correct 3 incorrect MA What" EXPECTED_XLS_TABLE = ( @@ -1054,7 +1021,7 @@ def test_auto_partition_xlsx_from_file(): def test_auto_partition_respects_starting_page_number_argument_for_xlsx(): - elements = partition("example-docs/stanley-cups.xlsx", starting_page_number=3) + elements = partition(example_doc_path("stanley-cups.xlsx"), starting_page_number=3) assert elements[1].metadata.page_number == 3 @@ -1140,9 +1107,10 @@ def test_auto_partition_from_url_without_providing_content_type(): def test_auto_partition_warns_if_header_set_and_not_url(caplog: LogCaptureFixture): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE) partition( - filename=filename, headers={"Accept": "application/pdf"}, strategy=PartitionStrategy.HI_RES + example_doc_path("eml/fake-email.eml"), + headers={"Accept": "application/pdf"}, + strategy=PartitionStrategy.HI_RES, ) assert caplog.records[0].levelname == "WARNING" @@ -1169,22 +1137,22 @@ def test_partition_timeout_gets_routed(): def test_add_chunking_strategy_on_partition_auto(): - filename = "example-docs/example-10k-1p.html" - elements = partition(filename) - chunk_elements = partition(filename, chunking_strategy="by_title") + file_path = example_doc_path("example-10k-1p.html") + elements = partition(file_path) + chunk_elements = partition(file_path, chunking_strategy="by_title") chunks = chunk_by_title(elements) assert chunk_elements != elements assert chunk_elements == chunks def test_add_chunking_strategy_on_partition_auto_respects_max_chars(): - filename = "example-docs/example-10k-1p.html" + file_path = example_doc_path("example-10k-1p.html") # default chunk size in chars is 200 partitioned_table_elements_200_chars = [ e for e in partition( - filename, + file_path, chunking_strategy="by_title", max_characters=200, combine_text_under_n_chars=5, @@ -1195,7 +1163,7 @@ def test_add_chunking_strategy_on_partition_auto_respects_max_chars(): partitioned_table_elements_5_chars = [ e for e in partition( - filename, + file_path, chunking_strategy="by_title", max_characters=5, combine_text_under_n_chars=5, @@ -1203,7 +1171,7 @@ def test_add_chunking_strategy_on_partition_auto_respects_max_chars(): if isinstance(e, (Table, TableChunk)) ] - elements = partition(filename) + elements = partition(file_path) table_elements = [e for e in elements if isinstance(e, Table)] @@ -1224,12 +1192,12 @@ def test_add_chunking_strategy_on_partition_auto_respects_max_chars(): def test_add_chunking_strategy_chars_on_partition_auto_adds_is_continuation(): - filename = "example-docs/example-10k-1p.html" + file_path = example_doc_path("example-10k-1p.html") - table_elements = [e for e in partition(filename) if isinstance(e, Table)] + table_elements = [e for e in partition(file_path) if isinstance(e, Table)] table_chunks = [ e - for e in partition(filename, chunking_strategy="by_title") + for e in partition(file_path, chunking_strategy="by_title") if isinstance(e, (Table, TableChunk)) ] @@ -1249,8 +1217,9 @@ def test_add_chunking_strategy_chars_on_partition_auto_adds_is_continuation(): def test_partition_respects_detect_language_per_element_arg(): - filename = "example-docs/language-docs/eng_spa_mult.txt" - elements = partition(filename=filename, detect_language_per_element=True) + elements = partition( + example_doc_path("language-docs/eng_spa_mult.txt"), detect_language_per_element=True + ) langs = [element.metadata.languages for element in elements] assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]] @@ -1288,9 +1257,10 @@ def test_partition_respects_language_arg(file_extension: str): def test_auto_with_page_breaks(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf") elements = partition( - filename=filename, include_page_breaks=True, strategy=PartitionStrategy.HI_RES + example_doc_path("layout-parser-paper-fast.pdf"), + include_page_breaks=True, + strategy=PartitionStrategy.HI_RES, ) assert "PageBreak" in [elem.category for elem in elements] @@ -1299,36 +1269,39 @@ def test_auto_with_page_breaks(): def test_auto_partition_metadata_filename(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt") - with open(filename, "rb") as f: - elements = partition(file=f, metadata_filename=filename) - assert elements[0].metadata.filename == os.path.split(filename)[-1] + file_path = example_doc_path("fake-text.txt") + with open(file_path, "rb") as f: + elements = partition(file=f, metadata_filename=file_path) + assert elements[0].metadata.filename == os.path.split(file_path)[-1] def test_auto_partition_warns_about_file_filename_deprecation(caplog: LogCaptureFixture): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt") - with open(filename, "rb") as f: - elements = partition(file=f, file_filename=filename) - assert elements[0].metadata.filename == os.path.split(filename)[-1] + file_path = example_doc_path("fake-text.txt") + with open(file_path, "rb") as f: + elements = partition(file=f, file_filename=file_path) + assert elements[0].metadata.filename == os.path.split(file_path)[-1] assert "WARNING" in caplog.text assert "The file_filename kwarg will be deprecated" in caplog.text def test_auto_partition_raises_with_file_and_metadata_filename(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt") - with open(filename, "rb") as f, pytest.raises(ValueError): - partition(file=f, file_filename=filename, metadata_filename=filename) + file_path = example_doc_path("fake-text.txt") + with open(file_path, "rb") as f, pytest.raises(ValueError): + partition(file=f, file_filename=file_path, metadata_filename=file_path) # -- ocr_languages -------------------------------------------------------- def test_auto_partition_formats_languages_for_tesseract(): - filename = "example-docs/chi_sim_image.jpeg" with patch( "unstructured.partition.pdf_image.ocr.process_file_with_ocr", ) as mock_process_file_with_ocr: - partition(filename, strategy=PartitionStrategy.HI_RES, languages=["zh"]) + partition( + example_doc_path("chi_sim_image.jpeg"), + strategy=PartitionStrategy.HI_RES, + languages=["zh"], + ) _, kwargs = mock_process_file_with_ocr.call_args_list[0] assert "ocr_languages" in kwargs assert kwargs["ocr_languages"] == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert" @@ -1338,9 +1311,8 @@ def test_auto_partition_formats_languages_for_tesseract(): def test_auto_partition_ignores_empty_string_for_ocr_languages( languages: list[str], ocr_languages: str ): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "book-war-and-peace-1p.txt") elements = partition( - filename=filename, + example_doc_path("book-war-and-peace-1p.txt"), strategy=PartitionStrategy.OCR_ONLY, ocr_languages=ocr_languages, languages=languages, @@ -1349,8 +1321,9 @@ def test_auto_partition_ignores_empty_string_for_ocr_languages( def test_auto_partition_warns_with_ocr_languages(caplog: LogCaptureFixture): - filename = "example-docs/chevron-page.pdf" - partition(filename=filename, strategy=PartitionStrategy.HI_RES, ocr_languages="eng") + partition( + example_doc_path("chevron-page.pdf"), strategy=PartitionStrategy.HI_RES, ocr_languages="eng" + ) assert "The ocr_languages kwarg will be deprecated" in caplog.text @@ -1463,7 +1436,7 @@ def test_file_specific_produces_correct_filetype(filetype: FileType): fun_name = "partition_" + filetype_module module = import_module(f"unstructured.partition.{filetype_module}") fun = getattr(module, fun_name) - for file in pathlib.Path("example-docs").iterdir(): + for file in pathlib.Path(example_doc_path("")).iterdir(): if file.is_file() and file.suffix == f".{extension}": elements = fun(str(file)) assert all( @@ -1478,8 +1451,11 @@ def test_file_specific_produces_correct_filetype(filetype: FileType): def test_auto_partition_element_metadata_user_provided_languages(): - filename = "example-docs/chevron-page.pdf" - elements = partition(filename=filename, strategy=PartitionStrategy.OCR_ONLY, languages=["eng"]) + elements = partition( + example_doc_path("chevron-page.pdf"), + strategy=PartitionStrategy.OCR_ONLY, + languages=["eng"], + ) assert elements[0].metadata.languages == ["eng"] @@ -1495,8 +1471,7 @@ def test_partition_languages_incorrectly_defaults_to_English(tmp_path: pathlib.P def test_partition_languages_default_to_None(): - filename = "example-docs/handbook-1p.docx" - elements = partition(filename=filename, detect_language_per_element=True) + elements = partition(example_doc_path("handbook-1p.docx"), detect_language_per_element=True) # PageBreak and other elements with no text will have `None` for `languages` none_langs = [element for element in elements if element.metadata.languages is None] assert none_langs[0].text == "" @@ -1508,11 +1483,11 @@ def test_partition_default_does_not_overwrite_other_defaults(): from unstructured.partition.text import partition_text # Use a document that is primarily in a language other than English - filename = "example-docs/language-docs/UDHR_first_article_all.txt" - text_elements = partition_text(filename) + file_path = example_doc_path("language-docs/UDHR_first_article_all.txt") + text_elements = partition_text(file_path) assert text_elements[0].metadata.languages != ["eng"] - auto_elements = partition(filename) + auto_elements = partition(file_path) assert auto_elements[0].metadata.languages != ["eng"] assert auto_elements[0].metadata.languages == text_elements[0].metadata.languages diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 4e6562d61..cca525922 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.11-dev4" # pragma: no cover +__version__ = "0.14.11-dev5" # pragma: no cover