feat: add ndjson support (#3845)

### Description
Add ndjson file type support and treat it the same as json files.
This commit is contained in:
Roman Isecke 2024-12-19 09:39:26 -05:00 committed by GitHub
parent b3a2dd4755
commit 50ea6fe7fc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
21 changed files with 670 additions and 110 deletions

View File

@ -3,6 +3,7 @@
### Enhancements
- **Prepare auto-partitioning for pluggable partitioners**. Move toward a uniform partitioner call signature so a custom or override partitioner can be registered without code changes.
- **Add NDJSON file type support**
### Features

View File

@ -0,0 +1,8 @@
{"element_id": "a06d2d9e65212d4aa955c3ab32950ffa", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51"}, "text": "These are a few of my favorite things:", "type": "Title"}
{"element_id": "b334c93e9b1cbca3b6f6d78ce8bc2484", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "a06d2d9e65212d4aa955c3ab32950ffa"}, "text": "Parrots", "type": "ListItem"}
{"element_id": "76469ecb9f1459943c8d8cca1a550b5a", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "a06d2d9e65212d4aa955c3ab32950ffa"}, "text": "Hockey", "type": "ListItem"}
{"element_id": "261fac731945a138415adc2dd4434b17", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51"}, "text": "Analysis", "type": "Title"}
{"element_id": "95f392d32c5271bfdb30eaef45921e59", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "261fac731945a138415adc2dd4434b17"}, "text": "This is my first thought. This is my second thought.", "type": "NarrativeText"}
{"element_id": "0de25bd6f0d74bc4f909f2678f385736", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "261fac731945a138415adc2dd4434b17"}, "text": "This is my third thought.", "type": "NarrativeText"}
{"element_id": "f296a3bc8a901f19199fda1da92829b6", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "261fac731945a138415adc2dd4434b17"}, "text": "2023", "type": "UncategorizedText"}
{"element_id": "78c62edbc674fdca0f6a0e3ffb459f86", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51"}, "text": "DOYLESTOWN, PA 18901", "type": "Address"}

View File

@ -0,0 +1,35 @@
{"type": "Title", "element_id": "fb902c5b26b38e2d35a70a55d43a5de6", "text": "News Around NOAA", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "Title", "element_id": "100233c72890df3d216e2bc2c36f7153", "text": "National Program", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "Title", "element_id": "88f0bebe7a9cca77675bd8a5db823092", "text": "Are You Weather-Ready for the Spring?", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "Title", "element_id": "568c824acda361cfc270a75e2eca7a23", "text": "Weather.gov >", "metadata": {"link_texts": ["Weather.gov"], "link_urls": ["https://www.weather.gov"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "Title", "element_id": "767e68cdb3d891322eb8b65489f53b4c", "text": "News Around NOAA > Are You Weather-Ready for the Spring?", "metadata": {"link_texts": ["News Around NOAA"], "link_urls": ["https://www.weather.gov/news"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "ListItem", "element_id": "79fb885317b2666481d0a1c31970400d", "text": "Weather Safety Air Quality Beach Hazards Cold Cold Water Drought Floods Fog Heat Hurricanes Lightning Safety Rip Currents Safe Boating Space Weather Sun (Ultraviolet Radiation) Thunderstorms & Tornadoes Tornado Tsunami Wildfire Wind Winter", "metadata": {"link_texts": ["Weather Safety", "Air Quality", "Beach Hazards", "Cold", "Cold Water", "Drought", "Floods", "Fog", "Heat", " Hurricanes", " Lightning Safety", "Rip Currents", "Safe Boating", "Space Weather", "Sun (Ultraviolet Radiation)", " Thunderstorms & Tornadoes", "Tornado", "Tsunami", "Wildfire", "Wind", "Winter"], "link_urls": ["http://www.weather.gov/safetycampaign", "https://www.weather.gov/safety/airquality", "https://www.weather.gov/safety/beachhazards", "https://www.weather.gov/safety/cold", "https://www.weather.gov/safety/coldwater", "https://www.weather.gov/safety/drought", "https://www.weather.gov/safety/flood", "https://www.weather.gov/safety/fog", "https://www.weather.gov/safety/heat", "https://www.weather.gov/safety/hurricane", "https://www.weather.gov/safety/lightning", "https://www.weather.gov/safety/ripcurrent", "https://www.weather.gov/safety/safeboating", "https://www.weather.gov/safety/space", "https://www.weather.gov/safety/heat-uv", "https://www.weather.gov/safety/thunderstorm", "https://www.weather.gov/safety/tornado", "https://www.weather.gov/safety/tsunami", "https://www.weather.gov/safety/wildfire", "https://www.weather.gov/safety/wind", "https://www.weather.gov/safety/winter "], "link_start_indexes": [0, 14, 25, 38, 42, 52, 59, 65, 68, 72, 83, 100, 112, 124, 137, 164, 190, 197, 204, 212, 216], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "ListItem", "element_id": "512e6a00cacb0ab139ede6b0145f441d", "text": "Safety Campaigns Seasonal Safety Campaigns #SafePlaceSelfie Deaf & Hard of Hearing Intellectual Disabilities Spanish-language Content The Great Outdoors", "metadata": {"link_texts": ["Safety Campaigns", "Seasonal Safety Campaigns", "#SafePlaceSelfie", "Deaf & Hard of Hearing", "Intellectual Disabilities", "Spanish-language Content", "The Great Outdoors"], "link_urls": ["https://www.weather.gov/safetycampaign", "https://www.weather.gov/safetycampaign", "https://www.weather.gov/wrn/safeplaceselfie", "https://www.weather.gov/wrn/dhh-safety", "https://www.weather.gov/wrn/intellectualdisabilities", "https://www.weather.gov/wrn/fall2020-espanol-sm", "https://www.noaa.gov/explainers/great-outdoors-weather-safety"], "link_start_indexes": [0, 16, 41, 57, 79, 104, 128], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "ListItem", "element_id": "d4145282089e41261300a9bcf440edb9", "text": "Ambassador About WRN Ambassadors Become an Ambassador Ambassadors of Excellence People of WRN FAQS Tell Your Success Story Success Stories Tri-fold Aviation Current Ambassadors Brochure En Espa\u00f1ol", "metadata": {"link_texts": ["Ambassador", "About WRN Ambassadors", "Become an Ambassador", "Ambassadors of Excellence", "People of WRN", " FAQS", "Tell Your Success Story", " Success Stories", "Tri-fold", "Aviation", " Current Ambassadors", "Brochure", "En Espa\u00f1ol"], "link_urls": ["https://www.weather.gov/wrn/ambassadors", "https://www.weather.gov/wrn/ambassadors", "https://www.weather.gov/wrn/amb-tou", "https://www.weather.gov/wrn/ambassador_recognition", "https://www.weather.gov/people/", "https://www.weather.gov/wrn/amb-faqs", "https://docs.google.com/forms/d/e/1FAIpQLScPHee5WAyC5K1LZ3pWLa2zjaM1HZSKN4_AxGUc6RaCy_gxLA/viewform", " https://www.weather.gov/wrn/success-stories", "http://www.weather.gov/media/wrn/WRN_Ambassador_Trifold.pdf", "https://www.weather.gov/wrn/aviation", " http://www.weather.gov/wrn/current-ambassadors", "http://www.weather.gov/media/wrn/WRN_Ambassador_Flyer.pdf", "https://www.weather.gov/wrn/en-espanol"], "link_start_indexes": [0, 10, 31, 51, 76, 89, 94, 117, 133, 141, 149, 169, 177], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "ListItem", "element_id": "aeee9b1d3904eda123d21c851ce4747d", "text": "Education NWS Education Home Be A Force Of Nature WRN Kids Flyer Wireless Emergency Alerts NOAA Weather Radio Mobile Weather Brochures Hourly Weather Forecast Citizen Science Intellectual Disabilities", "metadata": {"link_texts": ["Education", "NWS Education Home", "Be A Force Of Nature", "WRN Kids Flyer", "Wireless Emergency Alerts", "NOAA Weather Radio", "Mobile Weather", "Brochures", "Hourly Weather Forecast", "Citizen Science", "Intellectual Disabilities"], "link_urls": ["http://www.weather.gov/owlie/", "http://www.weather.gov/owlie/", "https://www.weather.gov/wrn/force", " http://www.weather.gov/media/owlie/nws_kids_fact_sheet2.pdf", "https://www.weather.gov/wrn/wea", "http://www.nws.noaa.gov/nwr/", "https://www.weather.gov/wrn/mobile-phone", "http://www.weather.gov/owlie/publication_brochures", "https://www.weather.gov/wrn/hourly-weather-graph", "http://www.weather.gov/media/wrn/citizen_science_page.pdf", "https://www.weather.gov/wrn/intellectualdisabilities"], "link_start_indexes": [0, 9, 27, 47, 61, 86, 104, 118, 127, 150, 165], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "ListItem", "element_id": "752f5b846e4a24df6d62d9dc014e5aec", "text": "Collaboration Get Involved Social Media WRN Ambassadors \u200b Enterprise Resources StormReady TsunamiReady NWSChat (core partners only) InteractiveNWS (iNWS) (core partners only)\u200b SKYWARN", "metadata": {"link_texts": ["Collaboration", "Get Involved ", "Social Media", "WRN Ambassadors \u200b", "Enterprise Resources", "StormReady", "TsunamiReady", "NWSChat (core partners only)", "InteractiveNWS (iNWS) (core partners only)\u200b", "SKYWARN"], "link_urls": ["https://www.weather.gov/wrn/collaborate", "https://www.weather.gov/wrn/get-involved", "http://www.weather.gov/socialmedia", "https://www.weather.gov/wrn/ambassadors", "https://www.weather.gov/enterprise/", "http://www.weather.gov/stormready/", "https://www.weather.gov/tsunamiready/", "https://nwschat.weather.gov/", "https://inws.ncep.noaa.gov/", "https://www.weather.gov/SKYWARN"], "link_start_indexes": [0, 13, 26, 38, 55, 75, 85, 97, 125, 168], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "ListItem", "element_id": "8729b5380b0f442c0512948bd18de66b", "text": "News & Events Latest News Calendar Meetings & Workshops NWS Aware Newsletter", "metadata": {"link_texts": [" News & Events", "Latest News", "Calendar", "Meetings & Workshops", "NWS Aware Newsletter"], "link_urls": ["http://www.weather.gov/news/", " http://www.weather.gov/news/", "https://www.weather.gov/wrn/calendar", " https://www.weather.gov/wrn/workshops", "https://www.weather.gov/publications/aware"], "link_start_indexes": [0, 14, 25, 33, 53], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "ListItem", "element_id": "ec0f9efa0e7de0d7bbf11f3b8fb2a1ca", "text": "International", "metadata": {"link_texts": ["International"], "link_urls": ["https://www.weather.gov/wrn/wrns"], "link_start_indexes": [0], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "ListItem", "element_id": "f17da617a620de011003a204ecf48752", "text": "About Contact Us What is WRN? WRN FAQ WRN Brochure Hazard Simplification IDSS Brochure Roadmap Strategic Plan WRN International Social Science", "metadata": {"link_texts": ["About", "Contact Us", " What is WRN?", " WRN FAQ", "WRN Brochure", "Hazard Simplification", "IDSS Brochure", "Roadmap", "Strategic Plan", "WRN International", "Social Science"], "link_urls": ["https://www.weather.gov/wrn/about", " https://www.weather.gov/wrn/contact", "https://www.weather.gov/wrn/about", "https://www.weather.gov/wrn/faqs", "http://www.weather.gov/media/wrn/WRN_Ambassador_Flyer.pdf", "https://www.weather.gov/hazardsimplification/", "https://www.weather.gov/media/wrn/2018-IDSS2-Pager.pdf", "http://www.weather.gov/media/wrn/nws_wrn_roadmap_final_april17.pdf", "https://www.weather.gov/media/wrn/NWS_Weather-Ready-Nation_Strategic_Plan_2019-2022.pdf", " https://www.weather.gov/wrn/international", "https://vlab.noaa.gov/web/nws-social-science"], "link_start_indexes": [0, 5, 15, 28, 36, 48, 69, 82, 89, 103, 120], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "NarrativeText", "element_id": "623c25f2247b125d6df5138a7c5ee153", "text": "The spring season is all about change \u2013 a rebirth both literally and figuratively. Even though the spring season doesn\u2019t officially (astronomically, that is) begin until March 20 this year, climatologically, it starts March 1.", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "NarrativeText", "element_id": "c8c953bd87e4571df8e6486e9c467861", "text": "As cold winter nights are replaced by the warmth of longer daylight hours, the National Weather Service invites you to do two important things that may save your life or the life of a loved one.", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "NarrativeText", "element_id": "b6553aef4dc61e5d31e2e28426e56f0b", "text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.", "metadata": {"emphasized_text_contents": ["First, take steps to better prepare for the seasonal hazards weather can throw at you."], "emphasized_text_tags": ["strong"], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "NarrativeText", "element_id": "ac246c4693669d08d274f628c3293a78", "text": "This could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become \u201cweather-ready.\u201d", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "NarrativeText", "element_id": "d1fa2a66a4df9759bdf01f6f1ec51d8e", "text": "Second, encourage others to become Weather-Ready as well. Share the message by taking advantage of our vast array of weather safety content \u2013 everything posted on our Spring Safety website is freely available, and we encourage sharing on social media networks. Also remember those who are most vulnerable, like an elderly family member or neighbor who might have limited mobility or is isolated. Reach out to those who are at higher risk of being impacted by extreme weather, and help them get prepared. This simple act of caring could become heroic.", "metadata": {"emphasized_text_contents": ["Second, encourage others to become Weather-Ready as well."], "emphasized_text_tags": ["strong"], "link_texts": ["Spring Safety website"], "link_urls": ["https://www.weather.gov/wrn/spring-safety"], "link_start_indexes": [167], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "NarrativeText", "element_id": "996f1b86d1cb5a02028bd3816f5790f1", "text": "This spring, the campaign is focused on heat dangers. Heat illness and death can occur even in spring\u2019s moderately warm weather. The majority of all heat-related deaths occur outside of heat waves and roughly a third of child hot car deaths occur outside of the summer months. Learn more by viewing the infographics that are now available.", "metadata": {"link_texts": ["infographics"], "link_urls": ["https://www.weather.gov/wrn/spring-infographics"], "link_start_indexes": [303], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "NarrativeText", "element_id": "90b31790a9b5fd903e6dbaea50e05f45", "text": "Stay safe this spring, and every season, by being informed, prepared, and Weather-Ready.", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "Title", "element_id": "9dcf311a7e6225af9333100c709b7f23", "text": "US Dept of Commerce", "metadata": {"link_texts": ["US Dept of Commerce"], "link_urls": ["http://www.commerce.gov"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "Title", "element_id": "60711b68cb732ecb10f4c05f0f784647", "text": "National Oceanic and Atmospheric Administration", "metadata": {"link_texts": ["National Oceanic and Atmospheric Administration"], "link_urls": ["http://www.noaa.gov"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "Title", "element_id": "55ca4bf03b04ffacb8ea8cb528c22a6f", "text": "National Weather Service", "metadata": {"link_texts": ["National Weather Service"], "link_urls": ["https://www.weather.gov"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "Title", "element_id": "3ebaebb5791662dfa6d2e2b8af436f9d", "text": "News Around NOAA", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "Title", "element_id": "ccf5cdb2984d2ac2d934010960d32aca", "text": "1325 East West Highway", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "Address", "element_id": "64a081cb854ff90dbc668c2b334d0ae8", "text": "Silver Spring, MD 20910", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "Title", "element_id": "6af532045e3aa6fe3764590594dc0dd7", "text": "Comments? Questions? Please Contact Us.", "metadata": {"link_texts": ["Comments? Questions? Please Contact Us."], "link_urls": ["https://www.weather.gov/news/contact"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "Title", "element_id": "a63c69dcc655b1b32bc6157427e9ca8e", "text": "Disclaimer", "metadata": {"link_texts": ["Disclaimer"], "link_urls": ["https://www.weather.gov/disclaimer"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "Title", "element_id": "95054785187bcc0cf98cdb17c135ca1d", "text": "Information Quality", "metadata": {"link_texts": ["Information Quality"], "link_urls": ["http://www.cio.noaa.gov/services_programs/info_quality.html"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "Title", "element_id": "800d660faa52732cd4d361b187bbd6e2", "text": "Help", "metadata": {"link_texts": ["Help"], "link_urls": ["https://www.weather.gov/help"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "Title", "element_id": "718284e0cdf275514b6aa8fb8976a7cc", "text": "Glossary", "metadata": {"link_texts": ["Glossary"], "link_urls": ["http://www.weather.gov/glossary"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "Title", "element_id": "678ef3e5cd635ba851d2dfd7f6f20d0f", "text": "Privacy Policy", "metadata": {"link_texts": ["Privacy Policy"], "link_urls": ["https://www.weather.gov/privacy"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "Title", "element_id": "f66ad83bfffccef0afe60d0aaba55b54", "text": "Freedom of Information Act (FOIA)", "metadata": {"link_texts": ["Freedom of Information Act (FOIA)"], "link_urls": ["https://www.noaa.gov/foia-freedom-of-information-act"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "Title", "element_id": "f50c4a988c7336b9d1100227fa7f03a3", "text": "About Us", "metadata": {"link_texts": ["About Us"], "link_urls": ["https://www.weather.gov/about"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}
{"type": "Title", "element_id": "a9a5f8ac29adb68999173b4e65a189bd", "text": "Career Opportunities", "metadata": {"link_texts": ["Career Opportunities"], "link_urls": ["https://www.weather.gov/careers"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}}

View File

@ -22,3 +22,4 @@ tqdm
psutil
python-oxmsg
html5lib
ndjson

View File

@ -4,13 +4,13 @@
#
# pip-compile ./base.in
#
anyio==4.6.2.post1
anyio==4.7.0
# via httpx
backoff==2.2.1
# via -r ./base.in
beautifulsoup4==4.12.3
# via -r ./base.in
certifi==2024.8.30
certifi==2024.12.14
# via
# httpcore
# httpx
@ -28,13 +28,13 @@ click==8.1.7
# via
# nltk
# python-oxmsg
cryptography==43.0.3
cryptography==44.0.0
# via unstructured-client
dataclasses-json==0.6.7
# via
# -r ./base.in
# unstructured-client
deepdiff==8.0.1
deepdiff==8.1.1
# via unstructured-client
emoji==2.14.0
# via -r ./base.in
@ -46,9 +46,9 @@ h11==0.14.0
# via httpcore
html5lib==1.1
# via -r ./base.in
httpcore==1.0.6
httpcore==1.0.7
# via httpx
httpx==0.27.2
httpx==0.28.1
# via unstructured-client
idna==3.10
# via
@ -64,7 +64,7 @@ langdetect==1.0.9
# via -r ./base.in
lxml==5.3.0
# via -r ./base.in
marshmallow==3.23.0
marshmallow==3.23.1
# via
# dataclasses-json
# unstructured-client
@ -72,6 +72,8 @@ mypy-extensions==1.0.0
# via
# typing-inspect
# unstructured-client
ndjson==0.3.1
# via -r ./base.in
nest-asyncio==1.6.0
# via unstructured-client
nltk==3.9.1
@ -80,9 +82,9 @@ numpy==1.26.4
# via -r ./base.in
olefile==0.47
# via python-oxmsg
orderly-set==5.2.2
orderly-set==5.2.3
# via deepdiff
packaging==24.1
packaging==24.2
# via
# marshmallow
# unstructured-client
@ -90,7 +92,7 @@ psutil==6.1.0
# via -r ./base.in
pycparser==2.22
# via cffi
pypdf==5.0.1
pypdf==5.1.0
# via unstructured-client
python-dateutil==2.9.0.post0
# via unstructured-client
@ -100,9 +102,9 @@ python-magic==0.4.27
# via -r ./base.in
python-oxmsg==0.0.1
# via -r ./base.in
rapidfuzz==3.10.1
rapidfuzz==3.11.0
# via -r ./base.in
regex==2024.9.11
regex==2024.11.6
# via nltk
requests==2.32.3
# via
@ -111,19 +113,17 @@ requests==2.32.3
# unstructured-client
requests-toolbelt==1.0.0
# via unstructured-client
six==1.16.0
six==1.17.0
# via
# html5lib
# langdetect
# python-dateutil
# unstructured-client
sniffio==1.3.1
# via
# anyio
# httpx
# via anyio
soupsieve==2.6
# via beautifulsoup4
tqdm==4.66.5
tqdm==4.67.1
# via
# -r ./base.in
# nltk
@ -150,5 +150,5 @@ urllib3==1.26.20
# unstructured-client
webencodings==0.5.1
# via html5lib
wrapt==1.16.0
wrapt==1.17.0
# via -r ./base.in

View File

@ -17,7 +17,7 @@ distlib==0.3.9
# via virtualenv
filelock==3.16.1
# via virtualenv
identify==2.6.1
identify==2.6.3
# via pre-commit
importlib-metadata==8.5.0
# via
@ -25,7 +25,7 @@ importlib-metadata==8.5.0
# build
nodeenv==1.9.1
# via pre-commit
packaging==24.1
packaging==24.2
# via
# -c ./base.txt
# -c ./test.txt
@ -46,16 +46,16 @@ pyyaml==6.0.2
# via
# -c ./test.txt
# pre-commit
tomli==2.0.2
tomli==2.2.1
# via
# -c ./test.txt
# build
# pip-tools
virtualenv==20.27.0
virtualenv==20.28.0
# via pre-commit
wheel==0.44.0
wheel==0.45.1
# via pip-tools
zipp==3.20.2
zipp==3.21.0
# via importlib-metadata
# The following packages are considered to be unsafe in a requirements file:

View File

@ -16,7 +16,7 @@ python-dateutil==2.9.0.post0
# pandas
pytz==2024.2
# via pandas
six==1.16.0
six==1.17.0
# via
# -c ./base.txt
# python-dateutil

View File

@ -10,5 +10,5 @@ importlib-metadata==8.5.0
# markdown
markdown==3.7
# via -r ./extra-markdown.in
zipp==3.20.2
zipp==3.21.0
# via importlib-metadata

View File

@ -4,13 +4,13 @@
#
# pip-compile ./extra-paddleocr.in
#
anyio==4.6.2.post1
anyio==4.7.0
# via
# -c ./base.txt
# httpx
astor==0.8.1
# via paddlepaddle
certifi==2024.8.30
certifi==2024.12.14
# via
# -c ./base.txt
# httpcore
@ -32,17 +32,17 @@ exceptiongroup==1.2.2
# via
# -c ./base.txt
# anyio
fonttools==4.54.1
fonttools==4.55.3
# via matplotlib
h11==0.14.0
# via
# -c ./base.txt
# httpcore
httpcore==1.0.6
httpcore==1.0.7
# via
# -c ./base.txt
# httpx
httpx==0.27.2
httpx==0.28.1
# via
# -c ./base.txt
# paddlepaddle
@ -52,7 +52,7 @@ idna==3.10
# anyio
# httpx
# requests
imageio==2.36.0
imageio==2.36.1
# via
# imgaug
# scikit-image
@ -64,7 +64,7 @@ kiwisolver==1.4.7
# via matplotlib
lazy-loader==0.4
# via scikit-image
matplotlib==3.9.2
matplotlib==3.9.4
# via imgaug
networkx==3.2.1
# via
@ -94,7 +94,7 @@ opencv-python==4.10.0.84
# unstructured-paddleocr
opt-einsum==3.3.0
# via paddlepaddle
packaging==24.1
packaging==24.2
# via
# -c ./base.txt
# lazy-loader
@ -127,7 +127,7 @@ python-dateutil==2.9.0.post0
# matplotlib
pyyaml==6.0.2
# via unstructured-paddleocr
rapidfuzz==3.10.1
rapidfuzz==3.11.0
# via
# -c ./base.txt
# unstructured-paddleocr
@ -147,7 +147,7 @@ shapely==2.0.6
# via
# imgaug
# unstructured-paddleocr
six==1.16.0
six==1.17.0
# via
# -c ./base.txt
# imgaug
@ -156,10 +156,9 @@ sniffio==1.3.1
# via
# -c ./base.txt
# anyio
# httpx
tifffile==2024.8.30
# via scikit-image
tqdm==4.66.5
tqdm==4.67.1
# via
# -c ./base.txt
# unstructured-paddleocr
@ -175,5 +174,5 @@ urllib3==1.26.20
# -c ././deps/constraints.txt
# -c ./base.txt
# requests
zipp==3.20.2
zipp==3.21.0
# via importlib-resources

View File

@ -8,7 +8,7 @@ antlr4-python3-runtime==4.9.3
# via omegaconf
cachetools==5.5.0
# via google-auth
certifi==2024.8.30
certifi==2024.12.14
# via
# -c ./base.txt
# requests
@ -25,13 +25,13 @@ coloredlogs==15.0.1
# via onnxruntime
contourpy==1.3.0
# via matplotlib
cryptography==43.0.3
cryptography==44.0.0
# via
# -c ./base.txt
# pdfminer-six
cycler==0.12.1
# via matplotlib
deprecated==1.2.14
deprecated==1.2.15
# via pikepdf
effdet==0.4.1
# via -r ./extra-pdf-image.in
@ -42,32 +42,32 @@ filelock==3.16.1
# transformers
flatbuffers==24.3.25
# via onnxruntime
fonttools==4.54.1
fonttools==4.55.3
# via matplotlib
fsspec==2024.10.0
# via
# huggingface-hub
# torch
google-api-core[grpc]==2.21.0
google-api-core[grpc]==2.24.0
# via google-cloud-vision
google-auth==2.35.0
google-auth==2.37.0
# via
# google-api-core
# google-cloud-vision
google-cloud-vision==3.8.0
google-cloud-vision==3.9.0
# via -r ./extra-pdf-image.in
googleapis-common-protos==1.65.0
googleapis-common-protos==1.66.0
# via
# google-api-core
# grpcio-status
grpcio==1.67.0
grpcio==1.68.1
# via
# -c ././deps/constraints.txt
# google-api-core
# grpcio-status
grpcio-status==1.62.3
# via google-api-core
huggingface-hub==0.26.1
huggingface-hub==0.27.0
# via
# timm
# tokenizers
@ -95,7 +95,7 @@ lxml==5.3.0
# pikepdf
markupsafe==3.0.2
# via jinja2
matplotlib==3.9.2
matplotlib==3.9.4
# via
# pycocotools
# unstructured-inference
@ -130,7 +130,7 @@ opencv-python==4.10.0.84
# via
# layoutparser
# unstructured-inference
packaging==24.1
packaging==24.2
# via
# -c ./base.txt
# huggingface-hub
@ -151,9 +151,9 @@ pdfminer-six==20231228
# pdfplumber
pdfplumber==0.11.4
# via layoutparser
pi-heif==0.20.0
pi-heif==0.21.0
# via -r ./extra-pdf-image.in
pikepdf==9.3.0
pikepdf==9.4.2
# via -r ./extra-pdf-image.in
pillow==11.0.0
# via
@ -165,7 +165,7 @@ pillow==11.0.0
# pikepdf
# torchvision
# unstructured-pytesseract
portalocker==2.10.1
portalocker==3.0.0
# via iopath
proto-plus==1.25.0
# via
@ -195,7 +195,7 @@ pycparser==2.22
# cffi
pyparsing==3.2.0
# via matplotlib
pypdf==5.0.1
pypdf==5.1.0
# via
# -c ./base.txt
# -r ./extra-pdf-image.in
@ -206,7 +206,7 @@ python-dateutil==2.9.0.post0
# -c ./base.txt
# matplotlib
# pandas
python-multipart==0.0.12
python-multipart==0.0.20
# via unstructured-inference
pytz==2024.2
# via pandas
@ -217,11 +217,11 @@ pyyaml==6.0.2
# omegaconf
# timm
# transformers
rapidfuzz==3.10.1
rapidfuzz==3.11.0
# via
# -c ./base.txt
# unstructured-inference
regex==2024.9.11
regex==2024.11.6
# via
# -c ./base.txt
# transformers
@ -239,7 +239,7 @@ safetensors==0.4.5
# transformers
scipy==1.13.1
# via layoutparser
six==1.16.0
six==1.17.0
# via
# -c ./base.txt
# python-dateutil
@ -247,7 +247,7 @@ sympy==1.13.1
# via
# onnxruntime
# torch
timm==1.0.11
timm==1.0.12
# via
# effdet
# unstructured-inference
@ -255,17 +255,17 @@ tokenizers==0.19.1
# via
# -c ././deps/constraints.txt
# transformers
torch==2.5.0
torch==2.5.1
# via
# effdet
# timm
# torchvision
# unstructured-inference
torchvision==0.20.0
torchvision==0.20.1
# via
# effdet
# timm
tqdm==4.66.5
tqdm==4.67.1
# via
# -c ./base.txt
# huggingface-hub
@ -291,9 +291,9 @@ urllib3==1.26.20
# -c ././deps/constraints.txt
# -c ./base.txt
# requests
wrapt==1.16.0
wrapt==1.17.0
# via
# -c ./base.txt
# deprecated
zipp==3.20.2
zipp==3.21.0
# via importlib-resources

View File

@ -22,7 +22,7 @@ python-dateutil==2.9.0.post0
# pandas
pytz==2024.2
# via pandas
six==1.16.0
six==1.17.0
# via
# -c ./base.txt
# python-dateutil

View File

@ -4,7 +4,7 @@
#
# pip-compile ./huggingface.in
#
certifi==2024.8.30
certifi==2024.12.14
# via
# -c ./base.txt
# requests
@ -25,7 +25,7 @@ fsspec==2024.10.0
# via
# huggingface-hub
# torch
huggingface-hub==0.26.1
huggingface-hub==0.27.0
# via
# tokenizers
# transformers
@ -53,7 +53,7 @@ numpy==1.26.4
# via
# -c ./base.txt
# transformers
packaging==24.1
packaging==24.2
# via
# -c ./base.txt
# huggingface-hub
@ -62,7 +62,7 @@ pyyaml==6.0.2
# via
# huggingface-hub
# transformers
regex==2024.9.11
regex==2024.11.6
# via
# -c ./base.txt
# sacremoses
@ -78,7 +78,7 @@ safetensors==0.4.5
# via transformers
sentencepiece==0.2.0
# via -r ./huggingface.in
six==1.16.0
six==1.17.0
# via
# -c ./base.txt
# langdetect
@ -88,9 +88,9 @@ tokenizers==0.19.1
# via
# -c ././deps/constraints.txt
# transformers
torch==2.5.0
torch==2.5.1
# via -r ./huggingface.in
tqdm==4.66.5
tqdm==4.67.1
# via
# -c ./base.txt
# huggingface-hub

View File

@ -6,19 +6,25 @@
#
annotated-types==0.7.0
# via pydantic
anyio==4.6.2.post1
anyio==4.7.0
# via
# -c ./base.txt
# httpx
appdirs==1.4.4
# via label-studio-sdk
attrs==24.2.0
# via jsonschema
argcomplete==3.5.2
# via datamodel-code-generator
attrs==24.3.0
# via
# jsonschema
# referencing
autoflake==2.3.1
# via -r ./test.in
black==24.10.0
# via -r ./test.in
certifi==2024.8.30
# via
# -r ./test.in
# datamodel-code-generator
certifi==2024.12.14
# via
# -c ./base.txt
# httpcore
@ -33,15 +39,23 @@ click==8.1.7
# -c ./base.txt
# black
# nltk
coverage[toml]==7.6.4
coverage[toml]==7.6.9
# via
# -r ./test.in
# pytest-cov
datamodel-code-generator==0.26.1
# via label-studio-sdk
dnspython==2.7.0
# via email-validator
email-validator==2.2.0
# via pydantic
exceptiongroup==1.2.2
# via
# -c ./base.txt
# anyio
# pytest
faker==33.1.0
# via jsf
flake8==7.1.1
# via
# -r ./test.in
@ -50,7 +64,9 @@ flake8-print==5.0.0
# via -r ./test.in
freezegun==1.5.1
# via -r ./test.in
grpcio==1.67.0
genson==1.3.0
# via datamodel-code-generator
grpcio==1.68.1
# via
# -c ././deps/constraints.txt
# -r ./test.in
@ -58,11 +74,11 @@ h11==0.14.0
# via
# -c ./base.txt
# httpcore
httpcore==1.0.6
httpcore==1.0.7
# via
# -c ./base.txt
# httpx
httpx==0.27.2
httpx==0.28.1
# via
# -c ./base.txt
# label-studio-sdk
@ -70,20 +86,33 @@ idna==3.10
# via
# -c ./base.txt
# anyio
# email-validator
# httpx
# requests
# yarl
ijson==3.3.0
# via label-studio-sdk
inflect==5.6.2
# via datamodel-code-generator
iniconfig==2.0.0
# via pytest
isort==5.13.2
# via datamodel-code-generator
jinja2==3.1.4
# via datamodel-code-generator
joblib==1.4.2
# via
# -c ./base.txt
# nltk
jsonschema==3.2.0
jsf==0.11.2
# via label-studio-sdk
label-studio-sdk==1.0.5
jsonschema==4.23.0
# via
# jsf
# label-studio-sdk
jsonschema-specifications==2024.10.1
# via jsonschema
label-studio-sdk==1.0.8
# via -r ./test.in
liccheck==0.9.2
# via -r ./test.in
@ -91,6 +120,8 @@ lxml==5.3.0
# via
# -c ./base.txt
# label-studio-sdk
markupsafe==3.0.2
# via jinja2
mccabe==0.7.0
# via flake8
multidict==6.1.0
@ -109,11 +140,13 @@ nltk==3.9.1
numpy==1.26.4
# via
# -c ./base.txt
# label-studio-sdk
# pandas
packaging==24.1
packaging==24.2
# via
# -c ./base.txt
# black
# datamodel-code-generator
# pytest
pandas==2.2.3
# via label-studio-sdk
@ -125,42 +158,49 @@ platformdirs==4.3.6
# via black
pluggy==1.5.0
# via pytest
propcache==0.2.0
propcache==0.2.1
# via yarl
pycodestyle==2.12.1
# via
# flake8
# flake8-print
pydantic==2.9.2
pydantic[email]==2.10.3
# via
# -r ./test.in
# datamodel-code-generator
# jsf
# label-studio-sdk
pydantic-core==2.23.4
pydantic-core==2.27.1
# via pydantic
pyflakes==3.2.0
# via
# autoflake
# flake8
pyrsistent==0.20.0
# via jsonschema
pytest==8.3.3
pytest==8.3.4
# via
# pytest-cov
# pytest-mock
pytest-cov==5.0.0
pytest-cov==6.0.0
# via -r ./test.in
pytest-mock==3.14.0
# via -r ./test.in
python-dateutil==2.9.0.post0
# via
# -c ./base.txt
# faker
# freezegun
# pandas
pytz==2024.2
# via pandas
pyyaml==6.0.2
# via vcrpy
regex==2024.9.11
# via
# datamodel-code-generator
# vcrpy
referencing==0.35.1
# via
# jsonschema
# jsonschema-specifications
regex==2024.11.6
# via
# -c ./base.txt
# nltk
@ -169,42 +209,51 @@ requests==2.32.3
# -c ./base.txt
# label-studio-sdk
# requests-mock
# smart-open
requests-mock==1.12.1
# via label-studio-sdk
rpds-py==0.22.3
# via
# jsonschema
# referencing
rstr==3.2.2
# via jsf
ruff==0.8.3
# via -r ./test.in
semantic-version==2.10.0
# via liccheck
six==1.16.0
six==1.17.0
# via
# -c ./base.txt
# jsonschema
# python-dateutil
smart-open[http]==7.1.0
# via jsf
sniffio==1.3.1
# via
# -c ./base.txt
# anyio
# httpx
toml==0.10.2
# via liccheck
tomli==2.0.2
# via
# datamodel-code-generator
# liccheck
tomli==2.2.1
# via
# autoflake
# black
# coverage
# mypy
# pytest
tqdm==4.66.5
tqdm==4.67.1
# via
# -c ./base.txt
# nltk
types-click==7.1.8
# via -r ./test.in
types-markdown==3.7.0.20240822
types-markdown==3.7.0.20241204
# via -r ./test.in
types-requests==2.31.0.6
# via -r ./test.in
types-tabulate==0.9.0.20240106
types-tabulate==0.9.0.20241207
# via -r ./test.in
types-urllib3==1.26.25.14
# via types-requests
@ -213,6 +262,8 @@ typing-extensions==4.12.2
# -c ./base.txt
# anyio
# black
# faker
# jsf
# label-studio-sdk
# multidict
# mypy
@ -230,14 +281,12 @@ urllib3==1.26.20
# vcrpy
vcrpy==6.0.2
# via -r ./test.in
wrapt==1.16.0
wrapt==1.17.0
# via
# -c ./base.txt
# smart-open
# vcrpy
xmljson==0.2.1
# via label-studio-sdk
yarl==1.16.0
yarl==1.18.3
# via vcrpy
# The following packages are considered to be unsafe in a requirements file:
# setuptools

View File

@ -90,6 +90,7 @@ def test_it_detects_correct_file_type_for_CFB_and_ZIP_subtypes_detected_by_direc
(FileType.WAV, "CantinaBand3.wav", "audio/wav"),
(FileType.XML, "factbook.xml", "application/xml"),
(FileType.ZIP, "simple.zip", "application/zip"),
(FileType.NDJSON, "spring-weather.html.ndjson", "application/x-ndjson"),
],
)
def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_content_type(
@ -147,6 +148,17 @@ def test_it_detects_correct_file_type_from_file_no_name_with_correct_asserted_co
assert file_type is expected_value
def test_it_identifies_NDJSON_for_file_like_object_with_no_name_but_NDJSON_content_type():
    """A nameless in-memory file is detected as NDJSON when the NDJSON MIME-type is asserted."""
    with open(example_doc_path("simple.ndjson"), "rb") as src:
        payload = src.read()

    # -- `io.BytesIO` has no `.name`, so only the asserted content-type identifies the file --
    detected = detect_filetype(
        file=io.BytesIO(payload), content_type=FileType.NDJSON.mime_type
    )

    assert detected == FileType.NDJSON
# TODO: ideally this test should pass, currently fails
# def test_it_identifies_NDJSON_for_file_with_ndjson_extension_but_JSON_content_type():
# file_path = example_doc_path("simple.ndjson")
# assert detect_filetype(file_path, content_type=FileType.JSON.mime_type) == FileType.NDJSON
# ================================================================================================
# STRATEGY #3 - GUESS MIME-TYPE WITH LIBMAGIC/FILETYPE LIBRARY
# ================================================================================================

View File

@ -0,0 +1,299 @@
"""Test-suite for `unstructured.partition.ndjson` module."""
from __future__ import annotations
import os
import pathlib
import tempfile
import pytest
from pytest_mock import MockFixture
from test_unstructured.unit_utils import example_doc_path
from unstructured.documents.elements import CompositeElement
from unstructured.file_utils.model import FileType
from unstructured.partition.email import partition_email
from unstructured.partition.html import partition_html
from unstructured.partition.ndjson import partition_ndjson
from unstructured.partition.text import partition_text
from unstructured.partition.xml import partition_xml
from unstructured.staging.base import elements_to_ndjson
# -- absolute path of the directory containing this test module --
DIRECTORY = pathlib.Path(__file__).parent.resolve()

# -- True when the `/.dockerenv` marker file exists, i.e. the suite runs inside a container --
is_in_docker = os.path.exists("/.dockerenv")

# -- example documents used as the `filename` parameter of the parametrized tests in this module --
test_files = [
    "fake-text.txt",
    "fake-html.html",
    "eml/fake-email.eml",
]
def test_it_chunks_elements_when_a_chunking_strategy_is_specified():
    """`partition_ndjson()` returns chunks rather than elements when a strategy is given."""
    ndjson_path = example_doc_path("spring-weather.html.ndjson")

    chunks = partition_ndjson(ndjson_path, chunking_strategy="basic", max_characters=1500)

    assert len(chunks) == 9
    for chunk in chunks:
        assert isinstance(chunk, CompositeElement)
@pytest.mark.parametrize("filename", test_files)
def test_partition_ndjson_from_filename(filename: str):
    """Elements written to an NDJSON file come back unchanged when partitioned by path."""
    source_path = example_doc_path(filename)
    # -- pick the partitioner that matches the example document's extension --
    partitioners = {
        FileType.TXT: partition_text,
        FileType.HTML: partition_html,
        FileType.XML: partition_xml,
        FileType.EML: partition_email,
    }
    partitioner = partitioners.get(FileType.from_extension(os.path.splitext(source_path)[1]))
    elements = partitioner(filename=source_path) if partitioner is not None else []

    with tempfile.TemporaryDirectory() as tmpdir:
        ndjson_path = os.path.join(tmpdir, os.path.basename(filename) + ".ndjson")
        elements_to_ndjson(elements, filename=ndjson_path)
        test_elements = partition_ndjson(filename=ndjson_path)

    assert len(elements) > 0
    assert len(str(elements[0])) > 0
    assert len(elements) == len(test_elements)
    expected_filename = filename.split("/")[-1]
    for original, rehydrated in zip(elements, test_elements):
        assert original == rehydrated
        assert original.metadata.filename == expected_filename
@pytest.mark.parametrize("filename", test_files)
def test_partition_ndjson_from_filename_with_metadata_filename(filename: str):
    """`metadata_filename` overrides the filename recorded on every element (path input)."""
    source_path = example_doc_path(filename)
    # -- pick the partitioner that matches the example document's extension --
    partitioners = {
        FileType.TXT: partition_text,
        FileType.HTML: partition_html,
        FileType.XML: partition_xml,
        FileType.EML: partition_email,
    }
    partitioner = partitioners.get(FileType.from_extension(os.path.splitext(source_path)[1]))
    elements = partitioner(filename=source_path) if partitioner is not None else []

    with tempfile.TemporaryDirectory() as tmpdir:
        ndjson_path = os.path.join(tmpdir, os.path.basename(filename) + ".ndjson")
        elements_to_ndjson(elements, filename=ndjson_path)
        test_elements = partition_ndjson(filename=ndjson_path, metadata_filename="test")

    assert len(test_elements) > 0
    assert len(str(test_elements[0])) > 0
    for rehydrated in test_elements:
        assert rehydrated.metadata.filename == "test"
@pytest.mark.parametrize("filename", test_files)
def test_partition_ndjson_from_file(filename: str):
    """Elements written to an NDJSON file come back unchanged when read from a binary handle."""
    source_path = example_doc_path(filename)
    # -- pick the partitioner that matches the example document's extension --
    partitioners = {
        FileType.TXT: partition_text,
        FileType.HTML: partition_html,
        FileType.XML: partition_xml,
        FileType.EML: partition_email,
    }
    partitioner = partitioners.get(FileType.from_extension(os.path.splitext(source_path)[1]))
    elements = partitioner(filename=source_path) if partitioner is not None else []

    with tempfile.TemporaryDirectory() as tmpdir:
        ndjson_path = os.path.join(tmpdir, os.path.basename(filename) + ".ndjson")
        elements_to_ndjson(elements, filename=ndjson_path)
        with open(ndjson_path, "rb") as ndjson_file:
            test_elements = partition_ndjson(file=ndjson_file)

    assert len(elements) > 0
    assert len(str(elements[0])) > 0
    assert len(elements) == len(test_elements)
    expected_filename = filename.split("/")[-1]
    for original, rehydrated in zip(elements, test_elements):
        assert original == rehydrated
        assert original.metadata.filename == expected_filename
@pytest.mark.parametrize("filename", test_files)
def test_partition_ndjson_from_file_with_metadata_filename(filename: str):
    """`metadata_filename` overrides the filename recorded on every element (file input)."""
    source_path = example_doc_path(filename)
    # -- pick the partitioner that matches the example document's extension --
    partitioners = {
        FileType.TXT: partition_text,
        FileType.HTML: partition_html,
        FileType.XML: partition_xml,
        FileType.EML: partition_email,
    }
    partitioner = partitioners.get(FileType.from_extension(os.path.splitext(source_path)[1]))
    elements = partitioner(filename=source_path) if partitioner is not None else []

    with tempfile.TemporaryDirectory() as tmpdir:
        ndjson_path = os.path.join(tmpdir, os.path.basename(filename) + ".ndjson")
        elements_to_ndjson(elements, filename=ndjson_path)
        with open(ndjson_path, "rb") as ndjson_file:
            test_elements = partition_ndjson(file=ndjson_file, metadata_filename="test")

    for rehydrated in test_elements:
        assert rehydrated.metadata.filename == "test"
@pytest.mark.parametrize("filename", test_files)
def test_partition_ndjson_from_text(filename: str):
    """Elements written to an NDJSON file come back unchanged when passed as a `text` string."""
    source_path = example_doc_path(filename)
    # -- pick the partitioner that matches the example document's extension --
    partitioners = {
        FileType.TXT: partition_text,
        FileType.HTML: partition_html,
        FileType.XML: partition_xml,
        FileType.EML: partition_email,
    }
    partitioner = partitioners.get(FileType.from_extension(os.path.splitext(source_path)[1]))
    elements = partitioner(filename=source_path) if partitioner is not None else []

    with tempfile.TemporaryDirectory() as tmpdir:
        ndjson_path = os.path.join(tmpdir, os.path.basename(filename) + ".ndjson")
        elements_to_ndjson(elements, filename=ndjson_path)
        with open(ndjson_path) as ndjson_file:
            text = ndjson_file.read()

    test_elements = partition_ndjson(text=text)

    assert len(elements) > 0
    assert len(str(elements[0])) > 0
    assert len(elements) == len(test_elements)
    expected_filename = filename.split("/")[-1]
    for original, rehydrated in zip(elements, test_elements):
        assert original == rehydrated
        assert original.metadata.filename == expected_filename
def test_partition_json_raises_with_none_specified():
    """`partition_ndjson()` raises `ValueError` when no filename, file or text is given."""
    with pytest.raises(ValueError):
        partition_ndjson()
def test_partition_ndjson_works_with_empty_string():
    """Empty `text` produces an empty element list rather than an error."""
    elements = partition_ndjson(text="")

    assert elements == []
def test_partition_ndjson_works_with_empty_list():
    """A single empty JSON object (`{}`) produces an empty element list."""
    elements = partition_ndjson(text="{}")

    assert elements == []
def test_partition_ndjson_raises_with_too_many_specified():
    """Supplying more than one of `filename`, `file` and `text` raises `ValueError`."""
    source_path = example_doc_path("fake-text.txt")
    # -- pick the partitioner that matches the example document's extension --
    partitioners = {
        FileType.TXT: partition_text,
        FileType.HTML: partition_html,
        FileType.XML: partition_xml,
        FileType.EML: partition_email,
    }
    partitioner = partitioners.get(FileType.from_extension(os.path.splitext(source_path)[1]))
    elements = partitioner(filename=source_path) if partitioner is not None else []

    with tempfile.TemporaryDirectory() as tmpdir:
        test_path = os.path.join(tmpdir, "fake-text.txt.ndjson")
        elements_to_ndjson(elements, filename=test_path)

        with open(test_path, "rb") as f:
            text = f.read().decode("utf-8")

            # -- every combination of two or more sources must be rejected --
            conflicting_sources = (
                {"filename": test_path, "file": f},
                {"filename": test_path, "text": text},
                {"file": f, "text": text},
                {"filename": test_path, "file": f, "text": text},
            )
            for sources in conflicting_sources:
                with pytest.raises(ValueError):
                    partition_ndjson(**sources)
# -- .metadata.last_modified ---------------------------------------------------------------------
def test_partition_ndjson_from_file_path_gets_last_modified_from_filesystem(mocker: MockFixture):
    """With a file path and no override, `last_modified` is the filesystem date."""
    filesystem_last_modified = "2029-07-05T09:24:28"
    # -- stub the filesystem lookup used by the ndjson partitioner --
    mocker.patch(
        "unstructured.partition.ndjson.get_last_modified_date",
        return_value=filesystem_last_modified,
    )

    elements = partition_ndjson(example_doc_path("spring-weather.html.ndjson"))

    for element in elements:
        assert element.metadata.last_modified == filesystem_last_modified
def test_partition_ndjson_from_file_gets_last_modified_None():
    """With a file object and no override, `last_modified` is None on every element."""
    ndjson_path = example_doc_path("spring-weather.html.ndjson")

    with open(ndjson_path, "rb") as ndjson_file:
        elements = partition_ndjson(file=ndjson_file)

    for element in elements:
        assert element.metadata.last_modified is None
def test_partition_ndjson_from_text_gets_last_modified_None():
    """With `text` input and no override, `last_modified` is None on every element."""
    ndjson_path = example_doc_path("spring-weather.html.ndjson")
    with open(ndjson_path) as ndjson_file:
        text = ndjson_file.read()

    elements = partition_ndjson(text=text)

    for element in elements:
        assert element.metadata.last_modified is None
def test_partition_ndjson_from_file_path_prefers_metadata_last_modified(mocker: MockFixture):
    """An explicit `metadata_last_modified` wins over the filesystem date (path input)."""
    filesystem_last_modified = "2029-07-05T09:24:28"
    metadata_last_modified = "2020-07-05T09:24:28"
    # -- stub the filesystem lookup so it returns a value that differs from the override --
    mocker.patch(
        "unstructured.partition.ndjson.get_last_modified_date",
        return_value=filesystem_last_modified,
    )
    ndjson_path = example_doc_path("spring-weather.html.ndjson")

    elements = partition_ndjson(ndjson_path, metadata_last_modified=metadata_last_modified)

    for element in elements:
        assert element.metadata.last_modified == metadata_last_modified
def test_partition_ndjson_from_file_prefers_metadata_last_modified():
    """An explicit `metadata_last_modified` is applied to every element (file input)."""
    metadata_last_modified = "2020-07-05T09:24:28"
    ndjson_path = example_doc_path("spring-weather.html.ndjson")

    with open(ndjson_path, "rb") as ndjson_file:
        elements = partition_ndjson(
            file=ndjson_file, metadata_last_modified=metadata_last_modified
        )

    for element in elements:
        assert element.metadata.last_modified == metadata_last_modified
def test_partition_ndjson_from_text_prefers_metadata_last_modified():
    """An explicit `metadata_last_modified` is applied to every element (text input)."""
    metadata_last_modified = "2020-07-05T09:24:28"
    ndjson_path = example_doc_path("spring-weather.html.ndjson")
    with open(ndjson_path) as ndjson_file:
        text = ndjson_file.read()

    elements = partition_ndjson(text=text, metadata_last_modified=metadata_last_modified)

    for element in elements:
        assert element.metadata.last_modified == metadata_last_modified
# ------------------------------------------------------------------------------------------------
def test_partition_json_raises_with_invalid_json():
    """Malformed input (here an unbalanced closing bracket) raises `ValueError`."""
    malformed = '[{"hi": "there"}]]'

    with pytest.raises(ValueError):
        partition_ndjson(text=malformed)

View File

@ -46,7 +46,7 @@ from unstructured.documents.elements import Element
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
from unstructured.file_utils.model import FileType
from unstructured.logger import logger
from unstructured.nlp.patterns import EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN
from unstructured.nlp.patterns import DICT_PATTERN, EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN
from unstructured.partition.common.common import add_element_metadata, exactly_one
from unstructured.partition.common.metadata import set_element_hierarchy
from unstructured.utils import get_call_args_applying_defaults, lazyproperty
@ -89,7 +89,7 @@ def detect_filetype(
Raises:
ValueError: when:
- `file_path` is specified but does not correspond to a file on the
fileesystem.
filesystem.
- Neither `file_path` nor `file` were specified.
"""
ctx = _FileTypeDetectionContext.new(
@ -123,6 +123,27 @@ def is_json_processable(
return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
def is_ndjson_processable(
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    file_text: Optional[str] = None,
    encoding: Optional[str] = "utf-8",
) -> bool:
    """True when the start of the input matches `DICT_PATTERN`.

    Exactly one of `filename`, `file` and `file_text` must be provided. When `file_text` is not
    given, only the text head of the file is inspected, so this is a quick regex heuristic rather
    than a full parse; it is good enough if you already know the file is NDJSON.
    """
    exactly_one(filename=filename, file=file, file_text=file_text)

    head = file_text
    if head is None:
        # -- no text supplied, so read the leading text of the path or file-like object --
        ctx = _FileTypeDetectionContext.new(file_path=filename, file=file, encoding=encoding)
        head = ctx.text_head

    return bool(re.match(DICT_PATTERN, head))
class _FileTypeDetector:
"""Determines file type from a variety of possible inputs."""

View File

@ -288,6 +288,15 @@ class FileType(enum.Enum):
"application/vnd.ms-outlook",
cast(list[str], []),
)
NDJSON = (
"ndjson",
"ndjson",
["ndjson"],
None,
[".ndjson"],
"application/x-ndjson",
cast(list[str], []),
)
ODT = (
"odt",
"odt",

View File

@ -120,6 +120,8 @@ ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
# format for document elements
LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{?"
# -- Start of an NDJSON document of element dicts: optional leading whitespace followed by an
# -- opening brace. Blank input is also accepted so an empty document is not rejected. The brace
# -- must not be optional (`{?`), otherwise the pattern matches every string.
DICT_PATTERN = r"\A\s*(?:\{|\Z)"
# (?s) dot all (including newline characters)
# \{(?=.*:) opening brace and at least one colon
# .*? any characters (non-greedy)

View File

@ -11,7 +11,11 @@ import requests
from typing_extensions import TypeAlias
from unstructured.documents.elements import DataSourceMetadata, Element
from unstructured.file_utils.filetype import detect_filetype, is_json_processable
from unstructured.file_utils.filetype import (
detect_filetype,
is_json_processable,
is_ndjson_processable,
)
from unstructured.file_utils.model import FileType
from unstructured.logger import logger
from unstructured.partition.common import UnsupportedFileFormatError
@ -244,6 +248,16 @@ def partition(
elements = partition_json(filename=filename, file=file, **kwargs)
return augment_metadata(elements)
if file_type == FileType.NDJSON:
if not is_ndjson_processable(filename=filename, file=file):
raise ValueError(
"Detected an NDJSON file that does not conform to the Unstructured schema. "
"partition_json currently only processes serialized Unstructured output.",
)
partition_ndjson = partitioner_loader.get(file_type)
elements = partition_ndjson(filename=filename, file=file, **kwargs)
return augment_metadata(elements)
# -- EMPTY is also a special case because while we can't determine the file type, we can be
# -- sure it doesn't contain any elements.
if file_type == FileType.EMPTY:

View File

@ -0,0 +1,85 @@
"""Provides `partition_ndjson()`.
Note this does not partition arbitrary NDJSON. Its only use-case is to "rehydrate" unstructured
document elements serialized to NDJSON, essentially the same function as `elements_from_json()`, but
this allows a document of already-partitioned elements to be combined transparently with other
documents in a partitioning run. It also allows multiple (low-cost) chunking runs to be performed on
a document while only incurring partitioning cost once.
"""
from __future__ import annotations
import json
from typing import IO, Any, Optional
import ndjson
from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import (
FileType,
add_metadata_with_filetype,
is_ndjson_processable,
)
from unstructured.partition.common.common import exactly_one
from unstructured.partition.common.metadata import get_last_modified_date
from unstructured.staging.base import elements_from_dicts
@process_metadata()
@add_metadata_with_filetype(FileType.NDJSON)
@add_chunking_strategy
def partition_ndjson(
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    text: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    **kwargs: Any,
) -> list[Element]:
    """Partitions serialized Unstructured output (NDJSON) into its constituent elements.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object as bytes --> open(filename, "rb").
    text
        The string representation of the .ndjson document.
    metadata_last_modified
        The last modified date for the document.

    Returns
    -------
    The elements rebuilt from the NDJSON records, each with `metadata.last_modified` set to
    `metadata_last_modified` when provided, otherwise to the filesystem date of `filename` (None
    for `file` and `text` input).

    Raises
    ------
    ValueError
        When the input does not pass `is_ndjson_processable()` or cannot be decoded as NDJSON.
    """
    # -- blank text with no other source is treated as an empty document, not an error --
    if text is not None and text.strip() == "" and not file and not filename:
        return []

    exactly_one(filename=filename, file=file, text=text)

    last_modified = get_last_modified_date(filename) if filename else None
    file_text = ""
    if filename is not None:
        with open(filename, encoding="utf8") as f:
            file_text = f.read()
    elif file is not None:
        file_content = file.read()
        file_text = file_content if isinstance(file_content, str) else file_content.decode()
        # -- rewind so the caller's file object can be read again --
        file.seek(0)
    elif text is not None:
        file_text = str(text)

    if not is_ndjson_processable(file_text=file_text):
        raise ValueError(
            "NDJSON cannot be partitioned. Schema does not match the Unstructured schema.",
        )

    try:
        element_dicts = ndjson.loads(file_text)
        elements = elements_from_dicts(element_dicts)
    except json.JSONDecodeError as e:
        # -- chain the decode error so the offending line/column stays in the traceback --
        raise ValueError("Not a valid ndjson") from e

    # -- an explicit `metadata_last_modified` takes precedence over the filesystem date --
    effective_last_modified = metadata_last_modified or last_modified
    for element in elements:
        element.metadata.last_modified = effective_last_modified

    return elements

View File

@ -9,6 +9,8 @@ from copy import deepcopy
from datetime import datetime
from typing import Any, Iterable, Optional, Sequence, cast
import ndjson
from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import (
TYPE_TO_TEXT_ELEMENT_MAP,
@ -152,6 +154,29 @@ def elements_to_json(
return json_str
def elements_to_ndjson(
    elements: Iterable[Element],
    filename: Optional[str] = None,
    encoding: str = "utf-8",
) -> str:
    """Serialize `elements` to an NDJSON string.

    The element dicts are serialized with `ndjson.dumps()` using sorted keys. When `filename` is
    provided the string is also written to that path, encoded using `encoding`.

    The NDJSON is returned as a string in both cases.
    """
    # -- adjust metadata field precision, then convert the elements to plain dicts --
    serializable = elements_to_dicts(_fix_metadata_field_precision(elements))
    payload = ndjson.dumps(serializable, sort_keys=True)

    if filename is None:
        return payload

    with open(filename, "w", encoding=encoding) as out:
        out.write(payload)

    return payload
def _fix_metadata_field_precision(elements: Iterable[Element]) -> list[Element]:
out_elements: list[Element] = []
for element in elements: