mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-26 17:02:31 +00:00
feat: add ndjson support (#3845)
### Description Add ndjson file type support and treat it the same as json files.
This commit is contained in:
parent
b3a2dd4755
commit
50ea6fe7fc
@ -3,6 +3,7 @@
|
||||
### Enhancements
|
||||
|
||||
- **Prepare auto-partitioning for pluggable partitioners**. Move toward a uniform partitioner call signature so a custom or override partitioner can be registered without code changes.
|
||||
- **Add NDJSON file type support**
|
||||
|
||||
### Features
|
||||
|
||||
|
8
example-docs/simple.ndjson
Normal file
8
example-docs/simple.ndjson
Normal file
@ -0,0 +1,8 @@
|
||||
{"element_id": "a06d2d9e65212d4aa955c3ab32950ffa", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51"}, "text": "These are a few of my favorite things:", "type": "Title"}
|
||||
{"element_id": "b334c93e9b1cbca3b6f6d78ce8bc2484", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "a06d2d9e65212d4aa955c3ab32950ffa"}, "text": "Parrots", "type": "ListItem"}
|
||||
{"element_id": "76469ecb9f1459943c8d8cca1a550b5a", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "a06d2d9e65212d4aa955c3ab32950ffa"}, "text": "Hockey", "type": "ListItem"}
|
||||
{"element_id": "261fac731945a138415adc2dd4434b17", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51"}, "text": "Analysis", "type": "Title"}
|
||||
{"element_id": "95f392d32c5271bfdb30eaef45921e59", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "261fac731945a138415adc2dd4434b17"}, "text": "This is my first thought. This is my second thought.", "type": "NarrativeText"}
|
||||
{"element_id": "0de25bd6f0d74bc4f909f2678f385736", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "261fac731945a138415adc2dd4434b17"}, "text": "This is my third thought.", "type": "NarrativeText"}
|
||||
{"element_id": "f296a3bc8a901f19199fda1da92829b6", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "261fac731945a138415adc2dd4434b17"}, "text": "2023", "type": "UncategorizedText"}
|
||||
{"element_id": "78c62edbc674fdca0f6a0e3ffb459f86", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51"}, "text": "DOYLESTOWN, PA 18901", "type": "Address"}
|
35
example-docs/spring-weather.html.ndjson
Normal file
35
example-docs/spring-weather.html.ndjson
Normal file
@ -0,0 +1,35 @@
|
||||
{"type": "Title", "element_id": "fb902c5b26b38e2d35a70a55d43a5de6", "text": "News Around NOAA", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "Title", "element_id": "100233c72890df3d216e2bc2c36f7153", "text": "National Program", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "Title", "element_id": "88f0bebe7a9cca77675bd8a5db823092", "text": "Are You Weather-Ready for the Spring?", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "Title", "element_id": "568c824acda361cfc270a75e2eca7a23", "text": "Weather.gov >", "metadata": {"link_texts": ["Weather.gov"], "link_urls": ["https://www.weather.gov"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "Title", "element_id": "767e68cdb3d891322eb8b65489f53b4c", "text": "News Around NOAA > Are You Weather-Ready for the Spring?", "metadata": {"link_texts": ["News Around NOAA"], "link_urls": ["https://www.weather.gov/news"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "ListItem", "element_id": "79fb885317b2666481d0a1c31970400d", "text": "Weather Safety Air Quality Beach Hazards Cold Cold Water Drought Floods Fog Heat Hurricanes Lightning Safety Rip Currents Safe Boating Space Weather Sun (Ultraviolet Radiation) Thunderstorms & Tornadoes Tornado Tsunami Wildfire Wind Winter", "metadata": {"link_texts": ["Weather Safety", "Air Quality", "Beach Hazards", "Cold", "Cold Water", "Drought", "Floods", "Fog", "Heat", " Hurricanes", " Lightning Safety", "Rip Currents", "Safe Boating", "Space Weather", "Sun (Ultraviolet Radiation)", " Thunderstorms & Tornadoes", "Tornado", "Tsunami", "Wildfire", "Wind", "Winter"], "link_urls": ["http://www.weather.gov/safetycampaign", "https://www.weather.gov/safety/airquality", "https://www.weather.gov/safety/beachhazards", "https://www.weather.gov/safety/cold", "https://www.weather.gov/safety/coldwater", "https://www.weather.gov/safety/drought", "https://www.weather.gov/safety/flood", "https://www.weather.gov/safety/fog", "https://www.weather.gov/safety/heat", "https://www.weather.gov/safety/hurricane", "https://www.weather.gov/safety/lightning", "https://www.weather.gov/safety/ripcurrent", "https://www.weather.gov/safety/safeboating", "https://www.weather.gov/safety/space", "https://www.weather.gov/safety/heat-uv", "https://www.weather.gov/safety/thunderstorm", "https://www.weather.gov/safety/tornado", "https://www.weather.gov/safety/tsunami", "https://www.weather.gov/safety/wildfire", "https://www.weather.gov/safety/wind", "https://www.weather.gov/safety/winter "], "link_start_indexes": [0, 14, 25, 38, 42, 52, 59, 65, 68, 72, 83, 100, 112, 124, 137, 164, 190, 197, 204, 212, 216], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "ListItem", "element_id": "512e6a00cacb0ab139ede6b0145f441d", "text": "Safety Campaigns Seasonal Safety Campaigns #SafePlaceSelfie Deaf & Hard of Hearing Intellectual Disabilities Spanish-language Content The Great Outdoors", "metadata": {"link_texts": ["Safety Campaigns", "Seasonal Safety Campaigns", "#SafePlaceSelfie", "Deaf & Hard of Hearing", "Intellectual Disabilities", "Spanish-language Content", "The Great Outdoors"], "link_urls": ["https://www.weather.gov/safetycampaign", "https://www.weather.gov/safetycampaign", "https://www.weather.gov/wrn/safeplaceselfie", "https://www.weather.gov/wrn/dhh-safety", "https://www.weather.gov/wrn/intellectualdisabilities", "https://www.weather.gov/wrn/fall2020-espanol-sm", "https://www.noaa.gov/explainers/great-outdoors-weather-safety"], "link_start_indexes": [0, 16, 41, 57, 79, 104, 128], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "ListItem", "element_id": "d4145282089e41261300a9bcf440edb9", "text": "Ambassador About WRN Ambassadors Become an Ambassador Ambassadors of Excellence People of WRN FAQS Tell Your Success Story Success Stories Tri-fold Aviation Current Ambassadors Brochure En Espa\u00f1ol", "metadata": {"link_texts": ["Ambassador", "About WRN Ambassadors", "Become an Ambassador", "Ambassadors of Excellence", "People of WRN", " FAQS", "Tell Your Success Story", " Success Stories", "Tri-fold", "Aviation", " Current Ambassadors", "Brochure", "En Espa\u00f1ol"], "link_urls": ["https://www.weather.gov/wrn/ambassadors", "https://www.weather.gov/wrn/ambassadors", "https://www.weather.gov/wrn/amb-tou", "https://www.weather.gov/wrn/ambassador_recognition", "https://www.weather.gov/people/", "https://www.weather.gov/wrn/amb-faqs", "https://docs.google.com/forms/d/e/1FAIpQLScPHee5WAyC5K1LZ3pWLa2zjaM1HZSKN4_AxGUc6RaCy_gxLA/viewform", " https://www.weather.gov/wrn/success-stories", "http://www.weather.gov/media/wrn/WRN_Ambassador_Trifold.pdf", "https://www.weather.gov/wrn/aviation", " http://www.weather.gov/wrn/current-ambassadors", "http://www.weather.gov/media/wrn/WRN_Ambassador_Flyer.pdf", "https://www.weather.gov/wrn/en-espanol"], "link_start_indexes": [0, 10, 31, 51, 76, 89, 94, 117, 133, 141, 149, 169, 177], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "ListItem", "element_id": "aeee9b1d3904eda123d21c851ce4747d", "text": "Education NWS Education Home Be A Force Of Nature WRN Kids Flyer Wireless Emergency Alerts NOAA Weather Radio Mobile Weather Brochures Hourly Weather Forecast Citizen Science Intellectual Disabilities", "metadata": {"link_texts": ["Education", "NWS Education Home", "Be A Force Of Nature", "WRN Kids Flyer", "Wireless Emergency Alerts", "NOAA Weather Radio", "Mobile Weather", "Brochures", "Hourly Weather Forecast", "Citizen Science", "Intellectual Disabilities"], "link_urls": ["http://www.weather.gov/owlie/", "http://www.weather.gov/owlie/", "https://www.weather.gov/wrn/force", " http://www.weather.gov/media/owlie/nws_kids_fact_sheet2.pdf", "https://www.weather.gov/wrn/wea", "http://www.nws.noaa.gov/nwr/", "https://www.weather.gov/wrn/mobile-phone", "http://www.weather.gov/owlie/publication_brochures", "https://www.weather.gov/wrn/hourly-weather-graph", "http://www.weather.gov/media/wrn/citizen_science_page.pdf", "https://www.weather.gov/wrn/intellectualdisabilities"], "link_start_indexes": [0, 9, 27, 47, 61, 86, 104, 118, 127, 150, 165], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "ListItem", "element_id": "752f5b846e4a24df6d62d9dc014e5aec", "text": "Collaboration Get Involved Social Media WRN Ambassadors \u200b Enterprise Resources StormReady TsunamiReady NWSChat (core partners only) InteractiveNWS (iNWS) (core partners only)\u200b SKYWARN", "metadata": {"link_texts": ["Collaboration", "Get Involved ", "Social Media", "WRN Ambassadors \u200b", "Enterprise Resources", "StormReady", "TsunamiReady", "NWSChat (core partners only)", "InteractiveNWS (iNWS) (core partners only)\u200b", "SKYWARN"], "link_urls": ["https://www.weather.gov/wrn/collaborate", "https://www.weather.gov/wrn/get-involved", "http://www.weather.gov/socialmedia", "https://www.weather.gov/wrn/ambassadors", "https://www.weather.gov/enterprise/", "http://www.weather.gov/stormready/", "https://www.weather.gov/tsunamiready/", "https://nwschat.weather.gov/", "https://inws.ncep.noaa.gov/", "https://www.weather.gov/SKYWARN"], "link_start_indexes": [0, 13, 26, 38, 55, 75, 85, 97, 125, 168], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "ListItem", "element_id": "8729b5380b0f442c0512948bd18de66b", "text": "News & Events Latest News Calendar Meetings & Workshops NWS Aware Newsletter", "metadata": {"link_texts": [" News & Events", "Latest News", "Calendar", "Meetings & Workshops", "NWS Aware Newsletter"], "link_urls": ["http://www.weather.gov/news/", " http://www.weather.gov/news/", "https://www.weather.gov/wrn/calendar", " https://www.weather.gov/wrn/workshops", "https://www.weather.gov/publications/aware"], "link_start_indexes": [0, 14, 25, 33, 53], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "ListItem", "element_id": "ec0f9efa0e7de0d7bbf11f3b8fb2a1ca", "text": "International", "metadata": {"link_texts": ["International"], "link_urls": ["https://www.weather.gov/wrn/wrns"], "link_start_indexes": [0], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "ListItem", "element_id": "f17da617a620de011003a204ecf48752", "text": "About Contact Us What is WRN? WRN FAQ WRN Brochure Hazard Simplification IDSS Brochure Roadmap Strategic Plan WRN International Social Science", "metadata": {"link_texts": ["About", "Contact Us", " What is WRN?", " WRN FAQ", "WRN Brochure", "Hazard Simplification", "IDSS Brochure", "Roadmap", "Strategic Plan", "WRN International", "Social Science"], "link_urls": ["https://www.weather.gov/wrn/about", " https://www.weather.gov/wrn/contact", "https://www.weather.gov/wrn/about", "https://www.weather.gov/wrn/faqs", "http://www.weather.gov/media/wrn/WRN_Ambassador_Flyer.pdf", "https://www.weather.gov/hazardsimplification/", "https://www.weather.gov/media/wrn/2018-IDSS2-Pager.pdf", "http://www.weather.gov/media/wrn/nws_wrn_roadmap_final_april17.pdf", "https://www.weather.gov/media/wrn/NWS_Weather-Ready-Nation_Strategic_Plan_2019-2022.pdf", " https://www.weather.gov/wrn/international", "https://vlab.noaa.gov/web/nws-social-science"], "link_start_indexes": [0, 5, 15, 28, 36, 48, 69, 82, 89, 103, 120], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "NarrativeText", "element_id": "623c25f2247b125d6df5138a7c5ee153", "text": "The spring season is all about change \u2013 a rebirth both literally and figuratively. Even though the spring season doesn\u2019t officially (astronomically, that is) begin until March 20 this year, climatologically, it starts March 1.", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "NarrativeText", "element_id": "c8c953bd87e4571df8e6486e9c467861", "text": "As cold winter nights are replaced by the warmth of longer daylight hours, the National Weather Service invites you to do two important things that may save your life or the life of a loved one.", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "NarrativeText", "element_id": "b6553aef4dc61e5d31e2e28426e56f0b", "text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.", "metadata": {"emphasized_text_contents": ["First, take steps to better prepare for the seasonal hazards weather can throw at you."], "emphasized_text_tags": ["strong"], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "NarrativeText", "element_id": "ac246c4693669d08d274f628c3293a78", "text": "This could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become \u201cweather-ready.\u201d", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "NarrativeText", "element_id": "d1fa2a66a4df9759bdf01f6f1ec51d8e", "text": "Second, encourage others to become Weather-Ready as well. Share the message by taking advantage of our vast array of weather safety content \u2013 everything posted on our Spring Safety website is freely available, and we encourage sharing on social media networks. Also remember those who are most vulnerable, like an elderly family member or neighbor who might have limited mobility or is isolated. Reach out to those who are at higher risk of being impacted by extreme weather, and help them get prepared. This simple act of caring could become heroic.", "metadata": {"emphasized_text_contents": ["Second, encourage others to become Weather-Ready as well."], "emphasized_text_tags": ["strong"], "link_texts": ["Spring Safety website"], "link_urls": ["https://www.weather.gov/wrn/spring-safety"], "link_start_indexes": [167], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "NarrativeText", "element_id": "996f1b86d1cb5a02028bd3816f5790f1", "text": "This spring, the campaign is focused on heat dangers. Heat illness and death can occur even in spring\u2019s moderately warm weather. The majority of all heat-related deaths occur outside of heat waves and roughly a third of child hot car deaths occur outside of the summer months. Learn more by viewing the infographics that are now available.", "metadata": {"link_texts": ["infographics"], "link_urls": ["https://www.weather.gov/wrn/spring-infographics"], "link_start_indexes": [303], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "NarrativeText", "element_id": "90b31790a9b5fd903e6dbaea50e05f45", "text": "Stay safe this spring, and every season, by being informed, prepared, and Weather-Ready.", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "Title", "element_id": "9dcf311a7e6225af9333100c709b7f23", "text": "US Dept of Commerce", "metadata": {"link_texts": ["US Dept of Commerce"], "link_urls": ["http://www.commerce.gov"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "Title", "element_id": "60711b68cb732ecb10f4c05f0f784647", "text": "National Oceanic and Atmospheric Administration", "metadata": {"link_texts": ["National Oceanic and Atmospheric Administration"], "link_urls": ["http://www.noaa.gov"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "Title", "element_id": "55ca4bf03b04ffacb8ea8cb528c22a6f", "text": "National Weather Service", "metadata": {"link_texts": ["National Weather Service"], "link_urls": ["https://www.weather.gov"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "Title", "element_id": "3ebaebb5791662dfa6d2e2b8af436f9d", "text": "News Around NOAA", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "Title", "element_id": "ccf5cdb2984d2ac2d934010960d32aca", "text": "1325 East West Highway", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "Address", "element_id": "64a081cb854ff90dbc668c2b334d0ae8", "text": "Silver Spring, MD 20910", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "Title", "element_id": "6af532045e3aa6fe3764590594dc0dd7", "text": "Comments? Questions? Please Contact Us.", "metadata": {"link_texts": ["Comments? Questions? Please Contact Us."], "link_urls": ["https://www.weather.gov/news/contact"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "Title", "element_id": "a63c69dcc655b1b32bc6157427e9ca8e", "text": "Disclaimer", "metadata": {"link_texts": ["Disclaimer"], "link_urls": ["https://www.weather.gov/disclaimer"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "Title", "element_id": "95054785187bcc0cf98cdb17c135ca1d", "text": "Information Quality", "metadata": {"link_texts": ["Information Quality"], "link_urls": ["http://www.cio.noaa.gov/services_programs/info_quality.html"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "Title", "element_id": "800d660faa52732cd4d361b187bbd6e2", "text": "Help", "metadata": {"link_texts": ["Help"], "link_urls": ["https://www.weather.gov/help"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "Title", "element_id": "718284e0cdf275514b6aa8fb8976a7cc", "text": "Glossary", "metadata": {"link_texts": ["Glossary"], "link_urls": ["http://www.weather.gov/glossary"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "Title", "element_id": "678ef3e5cd635ba851d2dfd7f6f20d0f", "text": "Privacy Policy", "metadata": {"link_texts": ["Privacy Policy"], "link_urls": ["https://www.weather.gov/privacy"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "Title", "element_id": "f66ad83bfffccef0afe60d0aaba55b54", "text": "Freedom of Information Act (FOIA)", "metadata": {"link_texts": ["Freedom of Information Act (FOIA)"], "link_urls": ["https://www.noaa.gov/foia-freedom-of-information-act"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "Title", "element_id": "f50c4a988c7336b9d1100227fa7f03a3", "text": "About Us", "metadata": {"link_texts": ["About Us"], "link_urls": ["https://www.weather.gov/about"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
||||
{"type": "Title", "element_id": "a9a5f8ac29adb68999173b4e65a189bd", "text": "Career Opportunities", "metadata": {"link_texts": ["Career Opportunities"], "link_urls": ["https://www.weather.gov/careers"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
|
@ -22,3 +22,4 @@ tqdm
|
||||
psutil
|
||||
python-oxmsg
|
||||
html5lib
|
||||
ndjson
|
||||
|
@ -4,13 +4,13 @@
|
||||
#
|
||||
# pip-compile ./base.in
|
||||
#
|
||||
anyio==4.6.2.post1
|
||||
anyio==4.7.0
|
||||
# via httpx
|
||||
backoff==2.2.1
|
||||
# via -r ./base.in
|
||||
beautifulsoup4==4.12.3
|
||||
# via -r ./base.in
|
||||
certifi==2024.8.30
|
||||
certifi==2024.12.14
|
||||
# via
|
||||
# httpcore
|
||||
# httpx
|
||||
@ -28,13 +28,13 @@ click==8.1.7
|
||||
# via
|
||||
# nltk
|
||||
# python-oxmsg
|
||||
cryptography==43.0.3
|
||||
cryptography==44.0.0
|
||||
# via unstructured-client
|
||||
dataclasses-json==0.6.7
|
||||
# via
|
||||
# -r ./base.in
|
||||
# unstructured-client
|
||||
deepdiff==8.0.1
|
||||
deepdiff==8.1.1
|
||||
# via unstructured-client
|
||||
emoji==2.14.0
|
||||
# via -r ./base.in
|
||||
@ -46,9 +46,9 @@ h11==0.14.0
|
||||
# via httpcore
|
||||
html5lib==1.1
|
||||
# via -r ./base.in
|
||||
httpcore==1.0.6
|
||||
httpcore==1.0.7
|
||||
# via httpx
|
||||
httpx==0.27.2
|
||||
httpx==0.28.1
|
||||
# via unstructured-client
|
||||
idna==3.10
|
||||
# via
|
||||
@ -64,7 +64,7 @@ langdetect==1.0.9
|
||||
# via -r ./base.in
|
||||
lxml==5.3.0
|
||||
# via -r ./base.in
|
||||
marshmallow==3.23.0
|
||||
marshmallow==3.23.1
|
||||
# via
|
||||
# dataclasses-json
|
||||
# unstructured-client
|
||||
@ -72,6 +72,8 @@ mypy-extensions==1.0.0
|
||||
# via
|
||||
# typing-inspect
|
||||
# unstructured-client
|
||||
ndjson==0.3.1
|
||||
# via -r ./base.in
|
||||
nest-asyncio==1.6.0
|
||||
# via unstructured-client
|
||||
nltk==3.9.1
|
||||
@ -80,9 +82,9 @@ numpy==1.26.4
|
||||
# via -r ./base.in
|
||||
olefile==0.47
|
||||
# via python-oxmsg
|
||||
orderly-set==5.2.2
|
||||
orderly-set==5.2.3
|
||||
# via deepdiff
|
||||
packaging==24.1
|
||||
packaging==24.2
|
||||
# via
|
||||
# marshmallow
|
||||
# unstructured-client
|
||||
@ -90,7 +92,7 @@ psutil==6.1.0
|
||||
# via -r ./base.in
|
||||
pycparser==2.22
|
||||
# via cffi
|
||||
pypdf==5.0.1
|
||||
pypdf==5.1.0
|
||||
# via unstructured-client
|
||||
python-dateutil==2.9.0.post0
|
||||
# via unstructured-client
|
||||
@ -100,9 +102,9 @@ python-magic==0.4.27
|
||||
# via -r ./base.in
|
||||
python-oxmsg==0.0.1
|
||||
# via -r ./base.in
|
||||
rapidfuzz==3.10.1
|
||||
rapidfuzz==3.11.0
|
||||
# via -r ./base.in
|
||||
regex==2024.9.11
|
||||
regex==2024.11.6
|
||||
# via nltk
|
||||
requests==2.32.3
|
||||
# via
|
||||
@ -111,19 +113,17 @@ requests==2.32.3
|
||||
# unstructured-client
|
||||
requests-toolbelt==1.0.0
|
||||
# via unstructured-client
|
||||
six==1.16.0
|
||||
six==1.17.0
|
||||
# via
|
||||
# html5lib
|
||||
# langdetect
|
||||
# python-dateutil
|
||||
# unstructured-client
|
||||
sniffio==1.3.1
|
||||
# via
|
||||
# anyio
|
||||
# httpx
|
||||
# via anyio
|
||||
soupsieve==2.6
|
||||
# via beautifulsoup4
|
||||
tqdm==4.66.5
|
||||
tqdm==4.67.1
|
||||
# via
|
||||
# -r ./base.in
|
||||
# nltk
|
||||
@ -150,5 +150,5 @@ urllib3==1.26.20
|
||||
# unstructured-client
|
||||
webencodings==0.5.1
|
||||
# via html5lib
|
||||
wrapt==1.16.0
|
||||
wrapt==1.17.0
|
||||
# via -r ./base.in
|
||||
|
@ -17,7 +17,7 @@ distlib==0.3.9
|
||||
# via virtualenv
|
||||
filelock==3.16.1
|
||||
# via virtualenv
|
||||
identify==2.6.1
|
||||
identify==2.6.3
|
||||
# via pre-commit
|
||||
importlib-metadata==8.5.0
|
||||
# via
|
||||
@ -25,7 +25,7 @@ importlib-metadata==8.5.0
|
||||
# build
|
||||
nodeenv==1.9.1
|
||||
# via pre-commit
|
||||
packaging==24.1
|
||||
packaging==24.2
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# -c ./test.txt
|
||||
@ -46,16 +46,16 @@ pyyaml==6.0.2
|
||||
# via
|
||||
# -c ./test.txt
|
||||
# pre-commit
|
||||
tomli==2.0.2
|
||||
tomli==2.2.1
|
||||
# via
|
||||
# -c ./test.txt
|
||||
# build
|
||||
# pip-tools
|
||||
virtualenv==20.27.0
|
||||
virtualenv==20.28.0
|
||||
# via pre-commit
|
||||
wheel==0.44.0
|
||||
wheel==0.45.1
|
||||
# via pip-tools
|
||||
zipp==3.20.2
|
||||
zipp==3.21.0
|
||||
# via importlib-metadata
|
||||
|
||||
# The following packages are considered to be unsafe in a requirements file:
|
||||
|
@ -16,7 +16,7 @@ python-dateutil==2.9.0.post0
|
||||
# pandas
|
||||
pytz==2024.2
|
||||
# via pandas
|
||||
six==1.16.0
|
||||
six==1.17.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# python-dateutil
|
||||
|
@ -10,5 +10,5 @@ importlib-metadata==8.5.0
|
||||
# markdown
|
||||
markdown==3.7
|
||||
# via -r ./extra-markdown.in
|
||||
zipp==3.20.2
|
||||
zipp==3.21.0
|
||||
# via importlib-metadata
|
||||
|
@ -4,13 +4,13 @@
|
||||
#
|
||||
# pip-compile ./extra-paddleocr.in
|
||||
#
|
||||
anyio==4.6.2.post1
|
||||
anyio==4.7.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# httpx
|
||||
astor==0.8.1
|
||||
# via paddlepaddle
|
||||
certifi==2024.8.30
|
||||
certifi==2024.12.14
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# httpcore
|
||||
@ -32,17 +32,17 @@ exceptiongroup==1.2.2
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# anyio
|
||||
fonttools==4.54.1
|
||||
fonttools==4.55.3
|
||||
# via matplotlib
|
||||
h11==0.14.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# httpcore
|
||||
httpcore==1.0.6
|
||||
httpcore==1.0.7
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# httpx
|
||||
httpx==0.27.2
|
||||
httpx==0.28.1
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# paddlepaddle
|
||||
@ -52,7 +52,7 @@ idna==3.10
|
||||
# anyio
|
||||
# httpx
|
||||
# requests
|
||||
imageio==2.36.0
|
||||
imageio==2.36.1
|
||||
# via
|
||||
# imgaug
|
||||
# scikit-image
|
||||
@ -64,7 +64,7 @@ kiwisolver==1.4.7
|
||||
# via matplotlib
|
||||
lazy-loader==0.4
|
||||
# via scikit-image
|
||||
matplotlib==3.9.2
|
||||
matplotlib==3.9.4
|
||||
# via imgaug
|
||||
networkx==3.2.1
|
||||
# via
|
||||
@ -94,7 +94,7 @@ opencv-python==4.10.0.84
|
||||
# unstructured-paddleocr
|
||||
opt-einsum==3.3.0
|
||||
# via paddlepaddle
|
||||
packaging==24.1
|
||||
packaging==24.2
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# lazy-loader
|
||||
@ -127,7 +127,7 @@ python-dateutil==2.9.0.post0
|
||||
# matplotlib
|
||||
pyyaml==6.0.2
|
||||
# via unstructured-paddleocr
|
||||
rapidfuzz==3.10.1
|
||||
rapidfuzz==3.11.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# unstructured-paddleocr
|
||||
@ -147,7 +147,7 @@ shapely==2.0.6
|
||||
# via
|
||||
# imgaug
|
||||
# unstructured-paddleocr
|
||||
six==1.16.0
|
||||
six==1.17.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# imgaug
|
||||
@ -156,10 +156,9 @@ sniffio==1.3.1
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# anyio
|
||||
# httpx
|
||||
tifffile==2024.8.30
|
||||
# via scikit-image
|
||||
tqdm==4.66.5
|
||||
tqdm==4.67.1
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# unstructured-paddleocr
|
||||
@ -175,5 +174,5 @@ urllib3==1.26.20
|
||||
# -c ././deps/constraints.txt
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
zipp==3.20.2
|
||||
zipp==3.21.0
|
||||
# via importlib-resources
|
||||
|
@ -8,7 +8,7 @@ antlr4-python3-runtime==4.9.3
|
||||
# via omegaconf
|
||||
cachetools==5.5.0
|
||||
# via google-auth
|
||||
certifi==2024.8.30
|
||||
certifi==2024.12.14
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
@ -25,13 +25,13 @@ coloredlogs==15.0.1
|
||||
# via onnxruntime
|
||||
contourpy==1.3.0
|
||||
# via matplotlib
|
||||
cryptography==43.0.3
|
||||
cryptography==44.0.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# pdfminer-six
|
||||
cycler==0.12.1
|
||||
# via matplotlib
|
||||
deprecated==1.2.14
|
||||
deprecated==1.2.15
|
||||
# via pikepdf
|
||||
effdet==0.4.1
|
||||
# via -r ./extra-pdf-image.in
|
||||
@ -42,32 +42,32 @@ filelock==3.16.1
|
||||
# transformers
|
||||
flatbuffers==24.3.25
|
||||
# via onnxruntime
|
||||
fonttools==4.54.1
|
||||
fonttools==4.55.3
|
||||
# via matplotlib
|
||||
fsspec==2024.10.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
# torch
|
||||
google-api-core[grpc]==2.21.0
|
||||
google-api-core[grpc]==2.24.0
|
||||
# via google-cloud-vision
|
||||
google-auth==2.35.0
|
||||
google-auth==2.37.0
|
||||
# via
|
||||
# google-api-core
|
||||
# google-cloud-vision
|
||||
google-cloud-vision==3.8.0
|
||||
google-cloud-vision==3.9.0
|
||||
# via -r ./extra-pdf-image.in
|
||||
googleapis-common-protos==1.65.0
|
||||
googleapis-common-protos==1.66.0
|
||||
# via
|
||||
# google-api-core
|
||||
# grpcio-status
|
||||
grpcio==1.67.0
|
||||
grpcio==1.68.1
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# google-api-core
|
||||
# grpcio-status
|
||||
grpcio-status==1.62.3
|
||||
# via google-api-core
|
||||
huggingface-hub==0.26.1
|
||||
huggingface-hub==0.27.0
|
||||
# via
|
||||
# timm
|
||||
# tokenizers
|
||||
@ -95,7 +95,7 @@ lxml==5.3.0
|
||||
# pikepdf
|
||||
markupsafe==3.0.2
|
||||
# via jinja2
|
||||
matplotlib==3.9.2
|
||||
matplotlib==3.9.4
|
||||
# via
|
||||
# pycocotools
|
||||
# unstructured-inference
|
||||
@ -130,7 +130,7 @@ opencv-python==4.10.0.84
|
||||
# via
|
||||
# layoutparser
|
||||
# unstructured-inference
|
||||
packaging==24.1
|
||||
packaging==24.2
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# huggingface-hub
|
||||
@ -151,9 +151,9 @@ pdfminer-six==20231228
|
||||
# pdfplumber
|
||||
pdfplumber==0.11.4
|
||||
# via layoutparser
|
||||
pi-heif==0.20.0
|
||||
pi-heif==0.21.0
|
||||
# via -r ./extra-pdf-image.in
|
||||
pikepdf==9.3.0
|
||||
pikepdf==9.4.2
|
||||
# via -r ./extra-pdf-image.in
|
||||
pillow==11.0.0
|
||||
# via
|
||||
@ -165,7 +165,7 @@ pillow==11.0.0
|
||||
# pikepdf
|
||||
# torchvision
|
||||
# unstructured-pytesseract
|
||||
portalocker==2.10.1
|
||||
portalocker==3.0.0
|
||||
# via iopath
|
||||
proto-plus==1.25.0
|
||||
# via
|
||||
@ -195,7 +195,7 @@ pycparser==2.22
|
||||
# cffi
|
||||
pyparsing==3.2.0
|
||||
# via matplotlib
|
||||
pypdf==5.0.1
|
||||
pypdf==5.1.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# -r ./extra-pdf-image.in
|
||||
@ -206,7 +206,7 @@ python-dateutil==2.9.0.post0
|
||||
# -c ./base.txt
|
||||
# matplotlib
|
||||
# pandas
|
||||
python-multipart==0.0.12
|
||||
python-multipart==0.0.20
|
||||
# via unstructured-inference
|
||||
pytz==2024.2
|
||||
# via pandas
|
||||
@ -217,11 +217,11 @@ pyyaml==6.0.2
|
||||
# omegaconf
|
||||
# timm
|
||||
# transformers
|
||||
rapidfuzz==3.10.1
|
||||
rapidfuzz==3.11.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# unstructured-inference
|
||||
regex==2024.9.11
|
||||
regex==2024.11.6
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# transformers
|
||||
@ -239,7 +239,7 @@ safetensors==0.4.5
|
||||
# transformers
|
||||
scipy==1.13.1
|
||||
# via layoutparser
|
||||
six==1.16.0
|
||||
six==1.17.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# python-dateutil
|
||||
@ -247,7 +247,7 @@ sympy==1.13.1
|
||||
# via
|
||||
# onnxruntime
|
||||
# torch
|
||||
timm==1.0.11
|
||||
timm==1.0.12
|
||||
# via
|
||||
# effdet
|
||||
# unstructured-inference
|
||||
@ -255,17 +255,17 @@ tokenizers==0.19.1
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# transformers
|
||||
torch==2.5.0
|
||||
torch==2.5.1
|
||||
# via
|
||||
# effdet
|
||||
# timm
|
||||
# torchvision
|
||||
# unstructured-inference
|
||||
torchvision==0.20.0
|
||||
torchvision==0.20.1
|
||||
# via
|
||||
# effdet
|
||||
# timm
|
||||
tqdm==4.66.5
|
||||
tqdm==4.67.1
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# huggingface-hub
|
||||
@ -291,9 +291,9 @@ urllib3==1.26.20
|
||||
# -c ././deps/constraints.txt
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
wrapt==1.16.0
|
||||
wrapt==1.17.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# deprecated
|
||||
zipp==3.20.2
|
||||
zipp==3.21.0
|
||||
# via importlib-resources
|
||||
|
@ -22,7 +22,7 @@ python-dateutil==2.9.0.post0
|
||||
# pandas
|
||||
pytz==2024.2
|
||||
# via pandas
|
||||
six==1.16.0
|
||||
six==1.17.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# python-dateutil
|
||||
|
@ -4,7 +4,7 @@
|
||||
#
|
||||
# pip-compile ./huggingface.in
|
||||
#
|
||||
certifi==2024.8.30
|
||||
certifi==2024.12.14
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
@ -25,7 +25,7 @@ fsspec==2024.10.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
# torch
|
||||
huggingface-hub==0.26.1
|
||||
huggingface-hub==0.27.0
|
||||
# via
|
||||
# tokenizers
|
||||
# transformers
|
||||
@ -53,7 +53,7 @@ numpy==1.26.4
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# transformers
|
||||
packaging==24.1
|
||||
packaging==24.2
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# huggingface-hub
|
||||
@ -62,7 +62,7 @@ pyyaml==6.0.2
|
||||
# via
|
||||
# huggingface-hub
|
||||
# transformers
|
||||
regex==2024.9.11
|
||||
regex==2024.11.6
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# sacremoses
|
||||
@ -78,7 +78,7 @@ safetensors==0.4.5
|
||||
# via transformers
|
||||
sentencepiece==0.2.0
|
||||
# via -r ./huggingface.in
|
||||
six==1.16.0
|
||||
six==1.17.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# langdetect
|
||||
@ -88,9 +88,9 @@ tokenizers==0.19.1
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# transformers
|
||||
torch==2.5.0
|
||||
torch==2.5.1
|
||||
# via -r ./huggingface.in
|
||||
tqdm==4.66.5
|
||||
tqdm==4.67.1
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# huggingface-hub
|
||||
|
@ -6,19 +6,25 @@
|
||||
#
|
||||
annotated-types==0.7.0
|
||||
# via pydantic
|
||||
anyio==4.6.2.post1
|
||||
anyio==4.7.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# httpx
|
||||
appdirs==1.4.4
|
||||
# via label-studio-sdk
|
||||
attrs==24.2.0
|
||||
# via jsonschema
|
||||
argcomplete==3.5.2
|
||||
# via datamodel-code-generator
|
||||
attrs==24.3.0
|
||||
# via
|
||||
# jsonschema
|
||||
# referencing
|
||||
autoflake==2.3.1
|
||||
# via -r ./test.in
|
||||
black==24.10.0
|
||||
# via -r ./test.in
|
||||
certifi==2024.8.30
|
||||
# via
|
||||
# -r ./test.in
|
||||
# datamodel-code-generator
|
||||
certifi==2024.12.14
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# httpcore
|
||||
@ -33,15 +39,23 @@ click==8.1.7
|
||||
# -c ./base.txt
|
||||
# black
|
||||
# nltk
|
||||
coverage[toml]==7.6.4
|
||||
coverage[toml]==7.6.9
|
||||
# via
|
||||
# -r ./test.in
|
||||
# pytest-cov
|
||||
datamodel-code-generator==0.26.1
|
||||
# via label-studio-sdk
|
||||
dnspython==2.7.0
|
||||
# via email-validator
|
||||
email-validator==2.2.0
|
||||
# via pydantic
|
||||
exceptiongroup==1.2.2
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# anyio
|
||||
# pytest
|
||||
faker==33.1.0
|
||||
# via jsf
|
||||
flake8==7.1.1
|
||||
# via
|
||||
# -r ./test.in
|
||||
@ -50,7 +64,9 @@ flake8-print==5.0.0
|
||||
# via -r ./test.in
|
||||
freezegun==1.5.1
|
||||
# via -r ./test.in
|
||||
grpcio==1.67.0
|
||||
genson==1.3.0
|
||||
# via datamodel-code-generator
|
||||
grpcio==1.68.1
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# -r ./test.in
|
||||
@ -58,11 +74,11 @@ h11==0.14.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# httpcore
|
||||
httpcore==1.0.6
|
||||
httpcore==1.0.7
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# httpx
|
||||
httpx==0.27.2
|
||||
httpx==0.28.1
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# label-studio-sdk
|
||||
@ -70,20 +86,33 @@ idna==3.10
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# anyio
|
||||
# email-validator
|
||||
# httpx
|
||||
# requests
|
||||
# yarl
|
||||
ijson==3.3.0
|
||||
# via label-studio-sdk
|
||||
inflect==5.6.2
|
||||
# via datamodel-code-generator
|
||||
iniconfig==2.0.0
|
||||
# via pytest
|
||||
isort==5.13.2
|
||||
# via datamodel-code-generator
|
||||
jinja2==3.1.4
|
||||
# via datamodel-code-generator
|
||||
joblib==1.4.2
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# nltk
|
||||
jsonschema==3.2.0
|
||||
jsf==0.11.2
|
||||
# via label-studio-sdk
|
||||
label-studio-sdk==1.0.5
|
||||
jsonschema==4.23.0
|
||||
# via
|
||||
# jsf
|
||||
# label-studio-sdk
|
||||
jsonschema-specifications==2024.10.1
|
||||
# via jsonschema
|
||||
label-studio-sdk==1.0.8
|
||||
# via -r ./test.in
|
||||
liccheck==0.9.2
|
||||
# via -r ./test.in
|
||||
@ -91,6 +120,8 @@ lxml==5.3.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# label-studio-sdk
|
||||
markupsafe==3.0.2
|
||||
# via jinja2
|
||||
mccabe==0.7.0
|
||||
# via flake8
|
||||
multidict==6.1.0
|
||||
@ -109,11 +140,13 @@ nltk==3.9.1
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# label-studio-sdk
|
||||
# pandas
|
||||
packaging==24.1
|
||||
packaging==24.2
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# black
|
||||
# datamodel-code-generator
|
||||
# pytest
|
||||
pandas==2.2.3
|
||||
# via label-studio-sdk
|
||||
@ -125,42 +158,49 @@ platformdirs==4.3.6
|
||||
# via black
|
||||
pluggy==1.5.0
|
||||
# via pytest
|
||||
propcache==0.2.0
|
||||
propcache==0.2.1
|
||||
# via yarl
|
||||
pycodestyle==2.12.1
|
||||
# via
|
||||
# flake8
|
||||
# flake8-print
|
||||
pydantic==2.9.2
|
||||
pydantic[email]==2.10.3
|
||||
# via
|
||||
# -r ./test.in
|
||||
# datamodel-code-generator
|
||||
# jsf
|
||||
# label-studio-sdk
|
||||
pydantic-core==2.23.4
|
||||
pydantic-core==2.27.1
|
||||
# via pydantic
|
||||
pyflakes==3.2.0
|
||||
# via
|
||||
# autoflake
|
||||
# flake8
|
||||
pyrsistent==0.20.0
|
||||
# via jsonschema
|
||||
pytest==8.3.3
|
||||
pytest==8.3.4
|
||||
# via
|
||||
# pytest-cov
|
||||
# pytest-mock
|
||||
pytest-cov==5.0.0
|
||||
pytest-cov==6.0.0
|
||||
# via -r ./test.in
|
||||
pytest-mock==3.14.0
|
||||
# via -r ./test.in
|
||||
python-dateutil==2.9.0.post0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# faker
|
||||
# freezegun
|
||||
# pandas
|
||||
pytz==2024.2
|
||||
# via pandas
|
||||
pyyaml==6.0.2
|
||||
# via vcrpy
|
||||
regex==2024.9.11
|
||||
# via
|
||||
# datamodel-code-generator
|
||||
# vcrpy
|
||||
referencing==0.35.1
|
||||
# via
|
||||
# jsonschema
|
||||
# jsonschema-specifications
|
||||
regex==2024.11.6
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# nltk
|
||||
@ -169,42 +209,51 @@ requests==2.32.3
|
||||
# -c ./base.txt
|
||||
# label-studio-sdk
|
||||
# requests-mock
|
||||
# smart-open
|
||||
requests-mock==1.12.1
|
||||
# via label-studio-sdk
|
||||
rpds-py==0.22.3
|
||||
# via
|
||||
# jsonschema
|
||||
# referencing
|
||||
rstr==3.2.2
|
||||
# via jsf
|
||||
ruff==0.8.3
|
||||
# via -r ./test.in
|
||||
semantic-version==2.10.0
|
||||
# via liccheck
|
||||
six==1.16.0
|
||||
six==1.17.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# jsonschema
|
||||
# python-dateutil
|
||||
smart-open[http]==7.1.0
|
||||
# via jsf
|
||||
sniffio==1.3.1
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# anyio
|
||||
# httpx
|
||||
toml==0.10.2
|
||||
# via liccheck
|
||||
tomli==2.0.2
|
||||
# via
|
||||
# datamodel-code-generator
|
||||
# liccheck
|
||||
tomli==2.2.1
|
||||
# via
|
||||
# autoflake
|
||||
# black
|
||||
# coverage
|
||||
# mypy
|
||||
# pytest
|
||||
tqdm==4.66.5
|
||||
tqdm==4.67.1
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# nltk
|
||||
types-click==7.1.8
|
||||
# via -r ./test.in
|
||||
types-markdown==3.7.0.20240822
|
||||
types-markdown==3.7.0.20241204
|
||||
# via -r ./test.in
|
||||
types-requests==2.31.0.6
|
||||
# via -r ./test.in
|
||||
types-tabulate==0.9.0.20240106
|
||||
types-tabulate==0.9.0.20241207
|
||||
# via -r ./test.in
|
||||
types-urllib3==1.26.25.14
|
||||
# via types-requests
|
||||
@ -213,6 +262,8 @@ typing-extensions==4.12.2
|
||||
# -c ./base.txt
|
||||
# anyio
|
||||
# black
|
||||
# faker
|
||||
# jsf
|
||||
# label-studio-sdk
|
||||
# multidict
|
||||
# mypy
|
||||
@ -230,14 +281,12 @@ urllib3==1.26.20
|
||||
# vcrpy
|
||||
vcrpy==6.0.2
|
||||
# via -r ./test.in
|
||||
wrapt==1.16.0
|
||||
wrapt==1.17.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# smart-open
|
||||
# vcrpy
|
||||
xmljson==0.2.1
|
||||
# via label-studio-sdk
|
||||
yarl==1.16.0
|
||||
yarl==1.18.3
|
||||
# via vcrpy
|
||||
|
||||
# The following packages are considered to be unsafe in a requirements file:
|
||||
# setuptools
|
||||
|
@ -90,6 +90,7 @@ def test_it_detects_correct_file_type_for_CFB_and_ZIP_subtypes_detected_by_direc
|
||||
(FileType.WAV, "CantinaBand3.wav", "audio/wav"),
|
||||
(FileType.XML, "factbook.xml", "application/xml"),
|
||||
(FileType.ZIP, "simple.zip", "application/zip"),
|
||||
(FileType.NDJSON, "spring-weather.html.ndjson", "application/x-ndjson"),
|
||||
],
|
||||
)
|
||||
def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_content_type(
|
||||
@ -147,6 +148,17 @@ def test_it_detects_correct_file_type_from_file_no_name_with_correct_asserted_co
|
||||
assert file_type is expected_value
|
||||
|
||||
|
||||
def test_it_identifies_NDJSON_for_file_like_object_with_no_name_but_NDJSON_content_type():
    """An unnamed in-memory stream is detected as NDJSON when that MIME type is asserted."""
    with open(example_doc_path("simple.ndjson"), "rb") as source:
        payload = source.read()
    stream = io.BytesIO(payload)

    detected = detect_filetype(file=stream, content_type=FileType.NDJSON.mime_type)

    assert detected == FileType.NDJSON
|
||||
|
||||
|
||||
# TODO: ideally this test should pass, currently fails
|
||||
# def test_it_identifies_NDJSON_for_file_with_ndjson_extension_but_JSON_content_type():
|
||||
# file_path = example_doc_path("simple.ndjson")
|
||||
# assert detect_filetype(file_path, content_type=FileType.JSON.mime_type) == FileType.NDJSON
|
||||
|
||||
# ================================================================================================
|
||||
# STRATEGY #3 - GUESS MIME-TYPE WITH LIBMAGIC/FILETYPE LIBRARY
|
||||
# ================================================================================================
|
||||
|
299
test_unstructured/partition/test_ndjson.py
Normal file
299
test_unstructured/partition/test_ndjson.py
Normal file
@ -0,0 +1,299 @@
|
||||
"""Test-suite for `unstructured.partition.ndjson` module."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import pathlib
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
from pytest_mock import MockFixture
|
||||
|
||||
from test_unstructured.unit_utils import example_doc_path
|
||||
from unstructured.documents.elements import CompositeElement
|
||||
from unstructured.file_utils.model import FileType
|
||||
from unstructured.partition.email import partition_email
|
||||
from unstructured.partition.html import partition_html
|
||||
from unstructured.partition.ndjson import partition_ndjson
|
||||
from unstructured.partition.text import partition_text
|
||||
from unstructured.partition.xml import partition_xml
|
||||
from unstructured.staging.base import elements_to_ndjson
|
||||
|
||||
# Absolute path of the directory that contains this test module.
DIRECTORY = pathlib.Path(__file__).parent.resolve()

# True when `/.dockerenv` exists, which is used here as the signal for running inside Docker.
is_in_docker = os.path.exists("/.dockerenv")

# Example documents that the round-trip tests partition, serialize to NDJSON and read back.
test_files = [
    "fake-text.txt",
    "fake-html.html",
    "eml/fake-email.eml",
]
|
||||
|
||||
|
||||
def test_it_chunks_elements_when_a_chunking_strategy_is_specified():
    """Passing `chunking_strategy` makes the partitioner return composite chunks."""
    ndjson_path = example_doc_path("spring-weather.html.ndjson")

    chunks = partition_ndjson(ndjson_path, chunking_strategy="basic", max_characters=1500)

    assert len(chunks) == 9
    for chunk in chunks:
        assert isinstance(chunk, CompositeElement)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", test_files)
def test_partition_ndjson_from_filename(filename: str):
    """Elements serialized to an NDJSON file are recovered when partitioning by file path."""
    source_path = example_doc_path(filename)
    filetype = FileType.from_extension(os.path.splitext(source_path)[1])
    # Each source format is partitioned with its own partitioner before serialization.
    partitioner_by_type = (
        (FileType.TXT, partition_text),
        (FileType.HTML, partition_html),
        (FileType.XML, partition_xml),
        (FileType.EML, partition_email),
    )
    elements = []
    for candidate_type, partitioner in partitioner_by_type:
        if filetype == candidate_type:
            elements = partitioner(filename=source_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        ndjson_path = os.path.join(tmpdir, os.path.basename(filename) + ".ndjson")
        elements_to_ndjson(elements, filename=ndjson_path)
        test_elements = partition_ndjson(filename=ndjson_path)

    assert len(elements) > 0
    assert len(str(elements[0])) > 0

    assert len(elements) == len(test_elements)
    expected_filename = filename.split("/")[-1]
    for original, round_tripped in zip(elements, test_elements):
        assert original == round_tripped
        assert original.metadata.filename == expected_filename
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", test_files)
def test_partition_ndjson_from_filename_with_metadata_filename(filename: str):
    """`metadata_filename` overrides the filename recorded on elements partitioned by path."""
    source_path = example_doc_path(filename)
    filetype = FileType.from_extension(os.path.splitext(source_path)[1])
    # Each source format is partitioned with its own partitioner before serialization.
    partitioner_by_type = (
        (FileType.TXT, partition_text),
        (FileType.HTML, partition_html),
        (FileType.XML, partition_xml),
        (FileType.EML, partition_email),
    )
    elements = []
    for candidate_type, partitioner in partitioner_by_type:
        if filetype == candidate_type:
            elements = partitioner(filename=source_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        ndjson_path = os.path.join(tmpdir, os.path.basename(filename) + ".ndjson")
        elements_to_ndjson(elements, filename=ndjson_path)
        test_elements = partition_ndjson(filename=ndjson_path, metadata_filename="test")

    assert len(test_elements) > 0
    assert len(str(test_elements[0])) > 0
    recorded_filenames = [element.metadata.filename for element in test_elements]
    assert all(recorded == "test" for recorded in recorded_filenames)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", test_files)
def test_partition_ndjson_from_file(filename: str):
    """Elements serialized to NDJSON are recovered when partitioning an open binary file."""
    source_path = example_doc_path(filename)
    filetype = FileType.from_extension(os.path.splitext(source_path)[1])
    # Each source format is partitioned with its own partitioner before serialization.
    partitioner_by_type = (
        (FileType.TXT, partition_text),
        (FileType.HTML, partition_html),
        (FileType.XML, partition_xml),
        (FileType.EML, partition_email),
    )
    elements = []
    for candidate_type, partitioner in partitioner_by_type:
        if filetype == candidate_type:
            elements = partitioner(filename=source_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        ndjson_path = os.path.join(tmpdir, os.path.basename(filename) + ".ndjson")
        elements_to_ndjson(elements, filename=ndjson_path)
        with open(ndjson_path, "rb") as ndjson_file:
            test_elements = partition_ndjson(file=ndjson_file)

    assert len(elements) > 0
    assert len(str(elements[0])) > 0
    assert len(elements) == len(test_elements)
    expected_filename = filename.split("/")[-1]
    for original, round_tripped in zip(elements, test_elements):
        assert original == round_tripped
        assert original.metadata.filename == expected_filename
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", test_files)
def test_partition_ndjson_from_file_with_metadata_filename(filename: str):
    """`metadata_filename` overrides the filename recorded on elements read from a file object."""
    source_path = example_doc_path(filename)
    filetype = FileType.from_extension(os.path.splitext(source_path)[1])
    # Each source format is partitioned with its own partitioner before serialization.
    partitioner_by_type = (
        (FileType.TXT, partition_text),
        (FileType.HTML, partition_html),
        (FileType.XML, partition_xml),
        (FileType.EML, partition_email),
    )
    elements = []
    for candidate_type, partitioner in partitioner_by_type:
        if filetype == candidate_type:
            elements = partitioner(filename=source_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        ndjson_path = os.path.join(tmpdir, os.path.basename(filename) + ".ndjson")
        elements_to_ndjson(elements, filename=ndjson_path)
        with open(ndjson_path, "rb") as ndjson_file:
            test_elements = partition_ndjson(file=ndjson_file, metadata_filename="test")

    for element in test_elements:
        assert element.metadata.filename == "test"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", test_files)
def test_partition_ndjson_from_text(filename: str):
    """Elements serialized to NDJSON are recovered when partitioning the raw text."""
    source_path = example_doc_path(filename)
    filetype = FileType.from_extension(os.path.splitext(source_path)[1])
    # Each source format is partitioned with its own partitioner before serialization.
    partitioner_by_type = (
        (FileType.TXT, partition_text),
        (FileType.HTML, partition_html),
        (FileType.XML, partition_xml),
        (FileType.EML, partition_email),
    )
    elements = []
    for candidate_type, partitioner in partitioner_by_type:
        if filetype == candidate_type:
            elements = partitioner(filename=source_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        ndjson_path = os.path.join(tmpdir, os.path.basename(filename) + ".ndjson")
        elements_to_ndjson(elements, filename=ndjson_path)
        with open(ndjson_path) as ndjson_file:
            text = ndjson_file.read()
        test_elements = partition_ndjson(text=text)

    assert len(elements) > 0
    assert len(str(elements[0])) > 0
    assert len(elements) == len(test_elements)
    expected_filename = filename.split("/")[-1]
    for original, round_tripped in zip(elements, test_elements):
        assert original == round_tripped
        assert original.metadata.filename == expected_filename
|
||||
|
||||
|
||||
def test_partition_json_raises_with_none_specified():
    """Calling the NDJSON partitioner with no input source at all raises `ValueError`."""
    expected_error = ValueError
    with pytest.raises(expected_error):
        partition_ndjson()
|
||||
|
||||
|
||||
def test_partition_ndjson_works_with_empty_string():
    """Empty text produces an empty element list."""
    elements = partition_ndjson(text="")
    assert elements == []
|
||||
|
||||
|
||||
def test_partition_ndjson_works_with_empty_list():
    """Text consisting of a single empty JSON object (`{}`) produces an empty element list."""
    elements = partition_ndjson(text="{}")
    assert elements == []
|
||||
|
||||
|
||||
def test_partition_ndjson_raises_with_too_many_specified():
    """Supplying more than one of `filename`, `file` and `text` raises `ValueError`."""
    source_path = example_doc_path("fake-text.txt")
    filetype = FileType.from_extension(os.path.splitext(source_path)[1])
    # Each source format is partitioned with its own partitioner before serialization.
    partitioner_by_type = (
        (FileType.TXT, partition_text),
        (FileType.HTML, partition_html),
        (FileType.XML, partition_xml),
        (FileType.EML, partition_email),
    )
    elements = []
    for candidate_type, partitioner in partitioner_by_type:
        if filetype == candidate_type:
            elements = partitioner(filename=source_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        test_path = os.path.join(tmpdir, "fake-text.txt.ndjson")
        elements_to_ndjson(elements, filename=test_path)
        with open(test_path, "rb") as f:
            text = f.read().decode("utf-8")

    # Every combination of two or more input sources must be rejected.
    conflicting_inputs = (
        {"filename": test_path, "file": f},
        {"filename": test_path, "text": text},
        {"file": f, "text": text},
        {"filename": test_path, "file": f, "text": text},
    )
    for kwargs in conflicting_inputs:
        with pytest.raises(ValueError):
            partition_ndjson(**kwargs)
|
||||
|
||||
|
||||
# -- .metadata.last_modified ---------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_ndjson_from_file_path_gets_last_modified_from_filesystem(mocker: MockFixture):
    """Without an override, `last_modified` is the value returned by the patched lookup."""
    expected_last_modified = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.ndjson.get_last_modified_date",
        return_value=expected_last_modified,
    )
    ndjson_path = example_doc_path("spring-weather.html.ndjson")

    elements = partition_ndjson(ndjson_path)

    for element in elements:
        assert element.metadata.last_modified == expected_last_modified
|
||||
|
||||
|
||||
def test_partition_ndjson_from_file_gets_last_modified_None():
    """Elements partitioned from a file object carry no `last_modified` value."""
    ndjson_path = example_doc_path("spring-weather.html.ndjson")
    with open(ndjson_path, "rb") as ndjson_file:
        elements = partition_ndjson(file=ndjson_file)

    for element in elements:
        assert element.metadata.last_modified is None
|
||||
|
||||
|
||||
def test_partition_ndjson_from_text_gets_last_modified_None():
    """Elements partitioned from raw text carry no `last_modified` value."""
    ndjson_path = example_doc_path("spring-weather.html.ndjson")
    with open(ndjson_path) as ndjson_file:
        text = ndjson_file.read()

    elements = partition_ndjson(text=text)

    for element in elements:
        assert element.metadata.last_modified is None
|
||||
|
||||
|
||||
def test_partition_ndjson_from_file_path_prefers_metadata_last_modified(mocker: MockFixture):
    """An explicit `metadata_last_modified` wins over the patched filesystem lookup."""
    metadata_last_modified = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.ndjson.get_last_modified_date",
        return_value="2029-07-05T09:24:28",
    )
    ndjson_path = example_doc_path("spring-weather.html.ndjson")

    elements = partition_ndjson(ndjson_path, metadata_last_modified=metadata_last_modified)

    for element in elements:
        assert element.metadata.last_modified == metadata_last_modified
|
||||
|
||||
|
||||
def test_partition_ndjson_from_file_prefers_metadata_last_modified():
    """An explicit `metadata_last_modified` is applied to elements read from a file object."""
    expected_last_modified = "2020-07-05T09:24:28"
    ndjson_path = example_doc_path("spring-weather.html.ndjson")
    with open(ndjson_path, "rb") as ndjson_file:
        elements = partition_ndjson(
            file=ndjson_file, metadata_last_modified=expected_last_modified
        )

    for element in elements:
        assert element.metadata.last_modified == expected_last_modified
|
||||
|
||||
|
||||
def test_partition_ndjson_from_text_prefers_metadata_last_modified():
    """An explicit `metadata_last_modified` is applied to elements partitioned from raw text."""
    expected_last_modified = "2020-07-05T09:24:28"
    ndjson_path = example_doc_path("spring-weather.html.ndjson")
    with open(ndjson_path) as ndjson_file:
        text = ndjson_file.read()

    elements = partition_ndjson(text=text, metadata_last_modified=expected_last_modified)

    for element in elements:
        assert element.metadata.last_modified == expected_last_modified
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_json_raises_with_invalid_json():
    """Malformed input (here, an unbalanced closing bracket) raises `ValueError`."""
    malformed_text = '[{"hi": "there"}]]'
    expected_error = ValueError
    with pytest.raises(expected_error):
        partition_ndjson(text=malformed_text)
|
@ -46,7 +46,7 @@ from unstructured.documents.elements import Element
|
||||
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
|
||||
from unstructured.file_utils.model import FileType
|
||||
from unstructured.logger import logger
|
||||
from unstructured.nlp.patterns import EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN
|
||||
from unstructured.nlp.patterns import DICT_PATTERN, EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN
|
||||
from unstructured.partition.common.common import add_element_metadata, exactly_one
|
||||
from unstructured.partition.common.metadata import set_element_hierarchy
|
||||
from unstructured.utils import get_call_args_applying_defaults, lazyproperty
|
||||
@ -89,7 +89,7 @@ def detect_filetype(
|
||||
Raises:
|
||||
ValueError: when:
|
||||
- `file_path` is specified but does not correspond to a file on the
|
||||
fileesystem.
|
||||
filesystem.
|
||||
- Neither `file_path` nor `file` were specified.
|
||||
"""
|
||||
ctx = _FileTypeDetectionContext.new(
|
||||
@ -123,6 +123,27 @@ def is_json_processable(
|
||||
return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
|
||||
|
||||
|
||||
def is_ndjson_processable(
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    file_text: Optional[str] = None,
    encoding: Optional[str] = "utf-8",
) -> bool:
    """True when file looks like newline-delimited JSON (NDJSON) objects.

    The input is given as one of `filename`, `file`, or `file_text`; the combination is validated
    by `exactly_one()`. When `file_text` is not supplied, the head of the file is read via
    `_FileTypeDetectionContext` using `encoding`.

    Uses regex (`DICT_PATTERN`) on a file prefix, so not entirely reliable but good enough if you
    already know the file is NDJSON.
    """
    exactly_one(filename=filename, file=file, file_text=file_text)

    if file_text is None:
        file_text = _FileTypeDetectionContext.new(
            file_path=filename, file=file, encoding=encoding
        ).text_head

    # -- `re.match()` anchors at the start, so only the beginning of the text is inspected --
    return re.match(DICT_PATTERN, file_text) is not None
|
||||
|
||||
|
||||
class _FileTypeDetector:
|
||||
"""Determines file type from a variety of possible inputs."""
|
||||
|
||||
|
@ -288,6 +288,15 @@ class FileType(enum.Enum):
|
||||
"application/vnd.ms-outlook",
|
||||
cast(list[str], []),
|
||||
)
|
||||
NDJSON = (
|
||||
"ndjson",
|
||||
"ndjson",
|
||||
["ndjson"],
|
||||
None,
|
||||
[".ndjson"],
|
||||
"application/x-ndjson",
|
||||
cast(list[str], []),
|
||||
)
|
||||
ODT = (
|
||||
"odt",
|
||||
"odt",
|
||||
|
@ -120,6 +120,8 @@ ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
|
||||
# format for document elements
|
||||
LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{?"
|
||||
|
||||
# -- Start of an NDJSON document: optional leading whitespace followed by the opening brace of
# -- the first JSON object. The brace is required (an optional `{?` would match every string and
# -- make the check meaningless); empty or whitespace-only text is still accepted via `\Z` so
# -- empty input keeps being treated as "no elements" rather than as a schema violation.
DICT_PATTERN = r"\A\s*(?:\{|\Z)"
|
||||
|
||||
# (?s) dot all (including newline characters)
|
||||
# \{(?=.*:) opening brace and at least one colon
|
||||
# .*? any characters (non-greedy)
|
||||
|
@ -11,7 +11,11 @@ import requests
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
from unstructured.documents.elements import DataSourceMetadata, Element
|
||||
from unstructured.file_utils.filetype import detect_filetype, is_json_processable
|
||||
from unstructured.file_utils.filetype import (
|
||||
detect_filetype,
|
||||
is_json_processable,
|
||||
is_ndjson_processable,
|
||||
)
|
||||
from unstructured.file_utils.model import FileType
|
||||
from unstructured.logger import logger
|
||||
from unstructured.partition.common import UnsupportedFileFormatError
|
||||
@ -244,6 +248,16 @@ def partition(
|
||||
elements = partition_json(filename=filename, file=file, **kwargs)
|
||||
return augment_metadata(elements)
|
||||
|
||||
if file_type == FileType.NDJSON:
|
||||
if not is_ndjson_processable(filename=filename, file=file):
|
||||
raise ValueError(
|
||||
"Detected an NDJSON file that does not conform to the Unstructured schema. "
|
||||
"partition_json currently only processes serialized Unstructured output.",
|
||||
)
|
||||
partition_ndjson = partitioner_loader.get(file_type)
|
||||
elements = partition_ndjson(filename=filename, file=file, **kwargs)
|
||||
return augment_metadata(elements)
|
||||
|
||||
# -- EMPTY is also a special case because while we can't determine the file type, we can be
|
||||
# -- sure it doesn't contain any elements.
|
||||
if file_type == FileType.EMPTY:
|
||||
|
85
unstructured/partition/ndjson.py
Normal file
85
unstructured/partition/ndjson.py
Normal file
@ -0,0 +1,85 @@
|
||||
"""Provides `partition_ndjson()`.
|
||||
|
||||
Note this does not partition arbitrary NDJSON. Its only use-case is to "rehydrate" unstructured
|
||||
document elements serialized to NDJSON, essentially the same function as `elements_from_json()`, but
|
||||
this allows a document of already-partitioned elements to be combined transparently with other
|
||||
documents in a partitioning run. It also allows multiple (low-cost) chunking runs to be performed on
|
||||
a document while only incurring partitioning cost once.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import IO, Any, Optional
|
||||
|
||||
import ndjson
|
||||
|
||||
from unstructured.chunking import add_chunking_strategy
|
||||
from unstructured.documents.elements import Element, process_metadata
|
||||
from unstructured.file_utils.filetype import (
|
||||
FileType,
|
||||
add_metadata_with_filetype,
|
||||
is_ndjson_processable,
|
||||
)
|
||||
from unstructured.partition.common.common import exactly_one
|
||||
from unstructured.partition.common.metadata import get_last_modified_date
|
||||
from unstructured.staging.base import elements_from_dicts
|
||||
|
||||
|
||||
@process_metadata()
@add_metadata_with_filetype(FileType.NDJSON)
@add_chunking_strategy
def partition_ndjson(
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    text: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    **kwargs: Any,
) -> list[Element]:
    """Partitions serialized Unstructured output (NDJSON) into its constituent elements.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object as bytes --> open(filename, "rb").
    text
        The string representation of the .ndjson document.
    metadata_last_modified
        The last modified date for the document.

    Raises
    ------
    ValueError
        When the input does not pass `is_ndjson_processable()` or cannot be decoded as NDJSON.
    """
    # -- blank `text` with no other source is treated as a document with no elements --
    if text is not None and text.strip() == "" and not file and not filename:
        return []

    exactly_one(filename=filename, file=file, text=text)

    last_modified = get_last_modified_date(filename) if filename else None
    file_text = ""
    if filename is not None:
        with open(filename, encoding="utf8") as f:
            file_text = f.read()

    elif file is not None:
        file_content = file.read()
        # -- tolerate a text-mode file object even though the annotation says bytes --
        file_text = file_content if isinstance(file_content, str) else file_content.decode()
        # -- rewind so the caller (or a decorator) can read the file again --
        file.seek(0)

    elif text is not None:
        file_text = str(text)

    if not is_ndjson_processable(file_text=file_text):
        raise ValueError(
            "NDJSON cannot be partitioned. Schema does not match the Unstructured schema.",
        )

    try:
        element_dicts = ndjson.loads(file_text)
        elements = elements_from_dicts(element_dicts)
    except json.JSONDecodeError as e:
        # -- chain explicitly so the decode error (line/column) stays visible as the cause --
        raise ValueError("Not a valid ndjson") from e

    # -- a caller-supplied value wins over the date read from the file on disk --
    for element in elements:
        element.metadata.last_modified = metadata_last_modified or last_modified

    return elements
|
@ -9,6 +9,8 @@ from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from typing import Any, Iterable, Optional, Sequence, cast
|
||||
|
||||
import ndjson
|
||||
|
||||
from unstructured.documents.coordinates import PixelSpace
|
||||
from unstructured.documents.elements import (
|
||||
TYPE_TO_TEXT_ELEMENT_MAP,
|
||||
@ -152,6 +154,29 @@ def elements_to_json(
|
||||
return json_str
|
||||
|
||||
|
||||
def elements_to_ndjson(
    elements: Iterable[Element],
    filename: Optional[str] = None,
    encoding: str = "utf-8",
) -> str:
    """Serialize `elements` to NDJSON (newline-delimited JSON).

    Also writes the NDJSON to `filename` if it is provided, encoded using `encoding`.

    The NDJSON is returned as a string.
    """
    # -- serialize `elements` as NDJSON (str), keys sorted within each object --
    precision_adjusted_elements = _fix_metadata_field_precision(elements)
    element_dicts = elements_to_dicts(precision_adjusted_elements)
    ndjson_str = ndjson.dumps(element_dicts, sort_keys=True)

    if filename is not None:
        with open(filename, "w", encoding=encoding) as f:
            f.write(ndjson_str)

    return ndjson_str
|
||||
|
||||
|
||||
def _fix_metadata_field_precision(elements: Iterable[Element]) -> list[Element]:
|
||||
out_elements: list[Element] = []
|
||||
for element in elements:
|
||||
|
Loading…
x
Reference in New Issue
Block a user