mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-02 19:13:13 +00:00
feat: pdf auto strategy groups broken numbered and bullet list items(#1393)
**Summary** Adds logic to combine broken numbered list items for the pdf fast strategy. **Details** Previously, the numbered list items in the `layout-parser-paper-fast.pdf` file were read as: ``` '1. An off-the-shelf toolkit for applying DL models for layout detection, character' 'recognition, and other DIA tasks (Section 3)' '2. A rich repository of pre-trained neural network models (Model Zoo) that' 'underlies the off-the-shelf usage' '3. Comprehensive tools for efficient document image data annotation and model' 'tuning to support different levels of customization' '4. A DL model hub and community platform for the easy sharing, distribu- tion, and discussion of DIA models and pipelines, to promote reusability, reproducibility, and extensibility (Section 4)' ``` Now they are read as: ``` '1. An off-the-shelf toolkit for applying DL models for layout detection, character recognition, and other DIA tasks (Section 3)' '2. A rich repository of pre-trained neural network models (Model Zoo) that underlies the off-the-shelf usage' '3. Comprehensive tools for efficient document image data annotation and model tuning to support different levels of customization' '4. A DL model hub and community platform for the easy sharing, distribu- tion, and discussion of DIA models and pipelines, to promote reusability, reproducibility, and extensibility (Section 4)' ``` The added logic leverages `ElementType` and `coordinates` to determine whether each following line is part of the previously detected `ListItem` or not. **Test** Adds a test that checks that the number of elements is smaller than in the original version with the broken numbered list. The test also checks that the first detected numbered list item ends with the text of the previously broken-off line. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: Klaijan <Klaijan@users.noreply.github.com>
This commit is contained in:
parent
d87c83d7b6
commit
00181b88df
@ -2,6 +2,7 @@
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Adds grouping of numbered ListItem elements, using coordinates, when pdfminer breaks them up line by line
|
||||
* Use text-based classification when elements come back uncategorized from PDF/Image partitioning
|
||||
* Updated HTML Partitioning to extract tables
|
||||
* Create and add `add_chunking_strategy` decorator to partition functions
|
||||
|
||||
@ -11,6 +11,7 @@ from unstructured.documents.coordinates import PixelSpace
|
||||
from unstructured.documents.elements import (
|
||||
CoordinatesMetadata,
|
||||
ElementMetadata,
|
||||
ListItem,
|
||||
NarrativeText,
|
||||
Text,
|
||||
Title,
|
||||
@ -835,3 +836,18 @@ def test_partition_categorization_backup():
|
||||
# Should have changed the element class from Text to Title
|
||||
assert isinstance(elements[0], Title)
|
||||
assert elements[0].text == text
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"filename",
|
||||
["example-docs/layout-parser-paper-fast.pdf"],
|
||||
)
|
||||
def test_combine_numbered_list(filename):
|
||||
elements = pdf.partition_pdf(filename=filename, strategy="auto")
|
||||
first_list_element = None
|
||||
for element in elements:
|
||||
if isinstance(element, ListItem):
|
||||
first_list_element = element
|
||||
break
|
||||
assert len(elements) < 28
|
||||
assert first_list_element.text.endswith("(Section 3)")
|
||||
|
||||
@ -290,7 +290,7 @@
|
||||
"text": "Suggested high-level core skills include:"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "8f90f5970c85f335b1bf50af611ce5c5",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -300,7 +300,7 @@
|
||||
"text": "1. General biomedical subject matter knowledge: biomedical data scientists should have a general working knowledge of the principles of biology, bioinformatics, and basic clinical science;"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "d1a5bb898aee8de0fbdf048c7a9fb01d",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -320,7 +320,7 @@
|
||||
"text": "least one programming language (typically R and/or Python);"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "c6be5389b7bd00746d39b7bac468dea0",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -330,7 +330,7 @@
|
||||
"text": "3. Predictive analytics, modeling, and machine learning: while a range of statistical methods may be useful, predictive analytics, modeling, and machine learning emerged as especially important skills in biomedical data science;"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "1b8039583cbc15f654c89f2141eb6e10",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -340,7 +340,7 @@
|
||||
"text": "4. Team science and scientific communication: “soft” skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "2f87757b1d497a32c077be543632ed7d",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
|
||||
@ -72,7 +72,7 @@
|
||||
"text": "The TELNET Protocol is built upon three main ideas: first, the concept of a \"Network Virtual Terminal\"; second, the principle of negotiated options; and third, a symmetric view of terminals and processes."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "f2612ca5f313c20b7f83b278ae4a7f8c",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -117,7 +117,7 @@
|
||||
"text": "applicable even in terminal-to-terminal or process-to-process communications, the \"user\" host is the host which initiated the communication."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "319d6358cf6e682f16e83ac6d588a9bc",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -144,7 +144,7 @@
|
||||
"text": "The syntax of option negotiation has been set up so that if both parties request an option simultaneously, each will see the other's request as the positive acknowledgment of its own."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "d38748f70330c37b70fa1dc04cf1a3a1",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -351,7 +351,7 @@
|
||||
"text": "options are negotiated to the contrary, the following default conditions pertain to the transmission of data over the TELNET connection:"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "8aa60fbfa7fdb5de8d562608d3ccb069",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -369,7 +369,7 @@
|
||||
"text": "The motivation for this rule is the high cost, to some hosts, of processing network input interrupts, coupled with the default NVT specification that \"echoes\" do not traverse the network. Thus, it is reasonable to buffer some amount of data at its source. Many systems take some processing action at the end of each input line (even line printers or card punches frequently tend to work this way), so the transmission should be triggered at the end of a line. On the other hand, a user or process may sometimes find it necessary or desirable to provide data which does not terminate at the end of a line; therefore implementers are cautioned to provide methods of locally signaling that all buffered data should be transmitted immediately."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "24c6236ce1666a2ef74bd5883a9d95d5",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -765,7 +765,7 @@
|
||||
"text": "By convention the sequence [IP, Synch] is to be used as such a signal. For example, suppose that some other protocol, which uses TELNET, defines the character string STOP analogously to the TELNET command AO. Imagine that a user of this protocol wishes a server to process the STOP string, but the connection is blocked because the server is processing other commands. The user should instruct his system to:"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "ListItem",
|
||||
"element_id": "8459ff8791582f54cd1435d6b9d770db",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -774,7 +774,7 @@
|
||||
"text": "1. Send the TELNET IP character;"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "e0bfbeb5fb2c2920562cfd6eace856e5",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -792,7 +792,7 @@
|
||||
"text": "Send the Data Mark (DM) as the only character in a TCP urgent mode send operation."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "82bd9d7fe0aa06d113db3be28abbda8e",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -801,7 +801,7 @@
|
||||
"text": "3. Send the character string STOP; and"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "618808c48ecd9bdc1ac5a97e633131b9",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
|
||||
@ -1530,7 +1530,7 @@
|
||||
"text": "O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "ListItem",
|
||||
"element_id": "a80826543c9e0d0e9f6c2108ae3c3f73",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
|
||||
@ -320,7 +320,7 @@
|
||||
"text": "be used for the comparison."
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "ListItem",
|
||||
"element_id": "c2b2b778d53cc9a1cb4dc340476bc5aa",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -600,7 +600,7 @@
|
||||
"text": "must"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "2d6b506bd58a7dd7bbf1c8599ef630c8",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -960,7 +960,7 @@
|
||||
"text": "i , the start"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "ListItem",
|
||||
"element_id": "764eef872135149aaf95224bab69c844",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
|
||||
@ -200,7 +200,7 @@
|
||||
"text": "LayoutParser provides a unified toolkit to support DL-based document image analysis and processing. To address the aforementioned challenges, LayoutParser is built with the following components:"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "074b2bd4ba1bf0caf3dbf1973217416a",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -210,7 +210,7 @@
|
||||
"text": "1. An off-the-shelf toolkit for applying DL models for layout detection, character"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "ListItem",
|
||||
"element_id": "569ce8891b02bc38f50a0cde0039e951",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -220,7 +220,7 @@
|
||||
"text": "2. A rich repository of pre-trained neural network models (Model Zoo) that"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "ListItem",
|
||||
"element_id": "18dcbc2839f9783d2c91cbce75d3e685",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -230,7 +230,7 @@
|
||||
"text": "3. Comprehensive tools for efficient document image data annotation and model"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "efe6ba3afae54e3c7a05d81583543296",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
|
||||
@ -9,7 +9,7 @@
|
||||
"text": "RULES AND INSTRUCTIONS"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "ba41648485acf4a8e7dd7d183b764811",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -36,7 +36,7 @@
|
||||
"text": "안녕하세요, 저 희 는 YGEAS 그룹 TREASUREMH HARUTOM| 2] 팬 입니다. 팬 으 로서, HARUTO 씨 받 는 대 우 에 대해 의 구 심 과 불 공 평 함 을 LRU, 이 메 일 을 통해 저 희 의 의 혹 을 전 달 하여 귀 사 의 진지한 고 민 과 적극적인 답 변 을 받을 수 있 기 를 바랍니다."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "95c5b9d0d081bc45be0e50a109924191",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -45,7 +45,7 @@
|
||||
"text": "3. CC Harutonations@gmail.com so we can keep track of how many emails were successfully sent"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "c37248b6d436997d36acc7852f502a8e",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
"text": "This report provides recommendations for a minimal set of core skills for biomedical data scientists based on analysis that draws on opinions of data scientists, curricula for existing biomedical data science programs, and requirements for biomedical data science jobs. Suggested high-level core skills include:"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "8f90f5970c85f335b1bf50af611ce5c5",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -70,7 +70,7 @@
|
||||
"text": "1. General biomedical subject matter knowledge: biomedical data scientists should have a general working knowledge of the principles of biology, bioinformatics, and basic clinical science;"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "d1a5bb898aee8de0fbdf048c7a9fb01d",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -90,7 +90,7 @@
|
||||
"text": "least one programming language (typically R and/or Python);"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "c6be5389b7bd00746d39b7bac468dea0",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -100,7 +100,7 @@
|
||||
"text": "3. Predictive analytics, modeling, and machine learning: while a range of statistical methods may be useful, predictive analytics, modeling, and machine learning emerged as especially important skills in biomedical data science;"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "1b8039583cbc15f654c89f2141eb6e10",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -110,7 +110,7 @@
|
||||
"text": "4. Team science and scientific communication: “soft” skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "2f87757b1d497a32c077be543632ed7d",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
|
||||
@ -330,7 +330,7 @@
|
||||
"text": "nature of inhibition of metals."
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "ListItem",
|
||||
"element_id": "c2b2b778d53cc9a1cb4dc340476bc5aa",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -1430,7 +1430,7 @@
|
||||
"text": "O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "ListItem",
|
||||
"element_id": "a80826543c9e0d0e9f6c2108ae3c3f73",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
|
||||
@ -310,7 +310,7 @@
|
||||
"text": "be used for the comparison."
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "ListItem",
|
||||
"element_id": "c2b2b778d53cc9a1cb4dc340476bc5aa",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -560,24 +560,14 @@
|
||||
"text": "a ls"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"type": "ListItem",
|
||||
"element_id": "2d6b506bd58a7dd7bbf1c8599ef630c8",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "1. Each schedule should start and end at the same depot. 2. Each trip should be covered by only one vehicle. 3. The number of schedules that start from a depot should not exceed the number of vehicles at"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "e46a5a30f05d06e82d8b7d10448de683",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "the depot."
|
||||
"text": "1. Each schedule should start and end at the same depot. 2. Each trip should be covered by only one vehicle. 3. The number of schedules that start from a depot should not exceed the number of vehicles at the depot."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
@ -900,7 +890,7 @@
|
||||
"text": "i , the start"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "ListItem",
|
||||
"element_id": "764eef872135149aaf95224bab69c844",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
|
||||
@ -57,17 +57,7 @@
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "The balance of risks remains tilted to the downside, but adverse risks have moderated since the October 2022"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "74180a93b38b6808f8cff7439e5d16d2",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "WEO. On the upside, a stronger boost from pent-up demand in numerous economies or a faster fall in inflation are plausible. On the downside, severe health outcomes in China could hold back the recovery, Russia’s war in Ukraine could escalate, and tighter global financing conditions could worsen debt distress. Financial markets could also suddenly reprice in response to adverse inflation news, while further geopolitical fragmentation could hamper economic progress."
|
||||
"text": "The balance of risks remains tilted to the downside, but adverse risks have moderated since the October 2022 WEO. On the upside, a stronger boost from pent-up demand in numerous economies or a faster fall in inflation are plausible. On the downside, severe health outcomes in China could hold back the recovery, Russia’s war in Ukraine could escalate, and tighter global financing conditions could worsen debt distress. Financial markets could also suddenly reprice in response to adverse inflation news, while further geopolitical fragmentation could hamper economic progress."
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
@ -77,17 +67,7 @@
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "In most economies, amid the cost-of-living crisis, the priority remains achieving sustained disinflation. With"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "9f5a3fe548f011e304fda9067caa0824",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "tighter monetary conditions and lower growth potentially affecting financial and debt stability, it is necessary to deploy macroprudential tools and strengthen debt restructuring frameworks. Accelerating COVID-19 vaccinations in China would safeguard the recovery, with positive cross-border spillovers. Fiscal support should be better targeted at those most affected by elevated food and energy prices, and broad-based fiscal relief measures should be withdrawn. Stronger multilateral cooperation is essential to preserve the gains from the rules-based multilateral system and to mitigate climate change by limiting emissions and raising green investment."
|
||||
"text": "In most economies, amid the cost-of-living crisis, the priority remains achieving sustained disinflation. With tighter monetary conditions and lower growth potentially affecting financial and debt stability, it is necessary to deploy macroprudential tools and strengthen debt restructuring frameworks. Accelerating COVID-19 vaccinations in China would safeguard the recovery, with positive cross-border spillovers. Fiscal support should be better targeted at those most affected by elevated food and energy prices, and broad-based fiscal relief measures should be withdrawn. Stronger multilateral cooperation is essential to preserve the gains from the rules-based multilateral system and to mitigate climate change by limiting emissions and raising green investment."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
@ -210,7 +190,7 @@
|
||||
"text": "Jan. 2019"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "ListItem",
|
||||
"element_id": "63e35649dd179389ecc7251e1503489a",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -220,7 +200,7 @@
|
||||
"text": "1. Headline Inflation"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "ListItem",
|
||||
"element_id": "b790ab5fcad28bbedb50b568b3adeca2",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -517,17 +497,7 @@
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 5
|
||||
},
|
||||
"text": "Growth in the euro area is projected to bottom out at 0.7 percent in 2023 before rising to 1.6"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "cdcaed7d1296edd658256d603cb3828c",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 5
|
||||
},
|
||||
"text": "percent in 2024. The 0.2 percentage point upward revision to the forecast for 2023 reflects the effects of faster rate hikes by the European Central Bank and eroding real incomes, offset by the carryover from the 2022 outturn, lower wholesale energy prices, and additional announcements of fiscal purchasing power support in the form of energy price controls and cash transfers."
|
||||
"text": "Growth in the euro area is projected to bottom out at 0.7 percent in 2023 before rising to 1.6 percent in 2024. The 0.2 percentage point upward revision to the forecast for 2023 reflects the effects of faster rate hikes by the European Central Bank and eroding real incomes, offset by the carryover from the 2022 outturn, lower wholesale energy prices, and additional announcements of fiscal purchasing power support in the form of energy price controls and cash transfers."
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
@ -537,17 +507,7 @@
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 5
|
||||
},
|
||||
"text": "Growth in the United Kingdom is projected to be –0.6 percent in 2023, a 0.9 percentage point"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "7e32067b6a4662d72b1244a3aac91be5",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 5
|
||||
},
|
||||
"text": "downward revision from October, reflecting tighter fiscal and monetary policies and financial conditions and still-high energy retail prices weighing on household budgets."
|
||||
"text": "Growth in the United Kingdom is projected to be –0.6 percent in 2023, a 0.9 percentage point downward revision from October, reflecting tighter fiscal and monetary policies and financial conditions and still-high energy retail prices weighing on household budgets."
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
@ -557,17 +517,7 @@
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 5
|
||||
},
|
||||
"text": "Growth in Japan is projected to rise to 1.8 percent in 2023, with continued monetary and fiscal"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "f8b94e8d9a593a1debae96fce2040db7",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 5
|
||||
},
|
||||
"text": "policy support. High corporate profits from a depreciated yen and earlier delays in implementing previous projects will support business investment. In 2024, growth is expected to decline to 0.9 percent as the effects of past stimulus dissipate."
|
||||
"text": "Growth in Japan is projected to rise to 1.8 percent in 2023, with continued monetary and fiscal policy support. High corporate profits from a depreciated yen and earlier delays in implementing previous projects will support business investment. In 2024, growth is expected to decline to 0.9 percent as the effects of past stimulus dissipate."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
@ -587,17 +537,7 @@
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 5
|
||||
},
|
||||
"text": "Growth in emerging and developing Asia is expected to rise in 2023 and 2024 to 5.3 percent and 5.2"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "237bc02ecaaf27f074be0c466b31cc09",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 5
|
||||
},
|
||||
"text": "percent, respectively, after the deeper-than-expected slowdown in 2022 to 4.3 percent attributable to China’s economy. China’s real GDP slowdown in the fourth quarter of 2022 implies a 0.2 percentage point downgrade for 2022 growth to 3.0 percent—the first time in more than 40 years with China’s growth below the global average. Growth in China is projected to rise to 5.2 percent in 2023, reflecting rapidly improving mobility, and to fall to 4.5 percent in 2024 before settling at below 4 percent over the medium term amid declining business dynamism and slow progress on structural reforms. Growth in India is set to decline from 6.8 percent in 2022 to 6.1 percent in 2023 before picking up to 6.8 percent in 2024, with resilient domestic demand despite external headwinds. Growth in the ASEAN-5 countries (Indonesia, Malaysia, Philippines, Singapore, Thailand) is similarly projected to slow to 4.3 percent in 2023 and then pick up to 4.7 percent in 2024."
|
||||
"text": "Growth in emerging and developing Asia is expected to rise in 2023 and 2024 to 5.3 percent and 5.2 percent, respectively, after the deeper-than-expected slowdown in 2022 to 4.3 percent attributable to China’s economy. China’s real GDP slowdown in the fourth quarter of 2022 implies a 0.2 percentage point downgrade for 2022 growth to 3.0 percent—the first time in more than 40 years with China’s growth below the global average. Growth in China is projected to rise to 5.2 percent in 2023, reflecting rapidly improving mobility, and to fall to 4.5 percent in 2024 before settling at below 4 percent over the medium term amid declining business dynamism and slow progress on structural reforms. Growth in India is set to decline from 6.8 percent in 2022 to 6.1 percent in 2023 before picking up to 6.8 percent in 2024, with resilient domestic demand despite external headwinds. Growth in the ASEAN-5 countries (Indonesia, Malaysia, Philippines, Singapore, Thailand) is similarly projected to slow to 4.3 percent in 2023 and then pick up to 4.7 percent in 2024."
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
@ -1647,17 +1587,7 @@
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 7
|
||||
},
|
||||
"text": "Pent-up demand boost: Fueled by the stock of excess private savings from the pandemic fiscal"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "814da89798a14bd42df7575af3ffee55",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 7
|
||||
},
|
||||
"text": "support and, in many cases, still-tight labor markets and solid wage growth, pent-up demand remains an upside risk to the growth outlook. In some advanced economies, recent data show that households are still on net adding to their stock of excess savings (as in some euro area countries and the United Kingdom) or have ample savings left (as in the United States). This leaves scope for a further boost to consumption—particularly of services, including tourism."
|
||||
"text": "Pent-up demand boost: Fueled by the stock of excess private savings from the pandemic fiscal support and, in many cases, still-tight labor markets and solid wage growth, pent-up demand remains an upside risk to the growth outlook. In some advanced economies, recent data show that households are still on net adding to their stock of excess savings (as in some euro area countries and the United Kingdom) or have ample savings left (as in the United States). This leaves scope for a further boost to consumption—particularly of services, including tourism."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
@ -1677,17 +1607,7 @@
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 8
|
||||
},
|
||||
"text": "Faster disinflation: An easing in labor market pressures in some advanced economies due to"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "3f9155fad634c620bd9b820132e20935",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 8
|
||||
},
|
||||
"text": "falling vacancies could cool wage inflation without necessarily increasing unemployment. A sharp fall in the prices of goods, as consumers shift back to services, could further push down inflation. Such developments could imply a “softer” landing with less monetary tightening."
|
||||
"text": "Faster disinflation: An easing in labor market pressures in some advanced economies due to falling vacancies could cool wage inflation without necessarily increasing unemployment. A sharp fall in the prices of goods, as consumers shift back to services, could further push down inflation. Such developments could imply a “softer” landing with less monetary tightening."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
@ -1707,17 +1627,7 @@
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 8
|
||||
},
|
||||
"text": "China’s recovery stalling: Amid still-low population immunity levels and insufficient hospital"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "692a9a6c2925baa45a1884cbfe510240",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 8
|
||||
},
|
||||
"text": "capacity, especially outside the major urban areas, significant health consequences could hamper the recovery. A deepening crisis in the real estate market remains a major source of vulnerability, with risks of widespread defaults by developers and resulting financial sector instability. Spillovers to the rest of the world would operate primarily through lower demand and potentially renewed supply chain problems."
|
||||
"text": "China’s recovery stalling: Amid still-low population immunity levels and insufficient hospital capacity, especially outside the major urban areas, significant health consequences could hamper the recovery. A deepening crisis in the real estate market remains a major source of vulnerability, with risks of widespread defaults by developers and resulting financial sector instability. Spillovers to the rest of the world would operate primarily through lower demand and potentially renewed supply chain problems."
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
@ -1727,17 +1637,7 @@
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 8
|
||||
},
|
||||
"text": "War in Ukraine escalating: An escalation of the war in Ukraine remains a major source of"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "fdb59d523afa92db3942dabc88d94fc4",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 8
|
||||
},
|
||||
"text": "vulnerability, particularly for Europe and lower-income countries. Europe is facing lower-than- anticipated gas prices, having stored enough gas to make shortages unlikely this winter. However, refilling storage with much-diminished Russian flows will be challenging ahead of next winter, particularly if it is a very cold one and China’s energy demand picks up, causing price spikes. A possible increase in food prices from a failed extension of the Black Sea grain initiative would put further pressure on lower-income countries that are experiencing food insecurity and have limited budgetary room to cushion the impact on households and businesses. With elevated food and fuel prices, social unrest may increase."
|
||||
"text": "War in Ukraine escalating: An escalation of the war in Ukraine remains a major source of vulnerability, particularly for Europe and lower-income countries. Europe is facing lower-than- anticipated gas prices, having stored enough gas to make shortages unlikely this winter. However, refilling storage with much-diminished Russian flows will be challenging ahead of next winter, particularly if it is a very cold one and China’s energy demand picks up, causing price spikes. A possible increase in food prices from a failed extension of the Black Sea grain initiative would put further pressure on lower-income countries that are experiencing food insecurity and have limited budgetary room to cushion the impact on households and businesses. With elevated food and fuel prices, social unrest may increase."
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
@ -1777,17 +1677,7 @@
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 8
|
||||
},
|
||||
"text": "Geopolitical fragmentation: The war in Ukraine and the related international sanctions aimed at pressuring Russia to end hostilities are splitting the world economy into blocs and reinforcing"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "810e5a86eae657e179ac8da86f317a62",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 8
|
||||
},
|
||||
"text": "earlier geopolitical tensions, such as those associated with the US-China trade dispute."
|
||||
"text": "Geopolitical fragmentation: The war in Ukraine and the related international sanctions aimed at pressuring Russia to end hostilities are splitting the world economy into blocs and reinforcing earlier geopolitical tensions, such as those associated with the US-China trade dispute."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
@ -1897,17 +1787,7 @@
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 10
|
||||
},
|
||||
"text": "Restraining the pandemic: Global coordination is needed to resolve bottlenecks in the global"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "add6f9f296b6a99cf0ef86162b3c9cfc",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 10
|
||||
},
|
||||
"text": "distribution of vaccines and treatments. Public support for the development of new vaccine technologies and the design of systematic responses to future epidemics also remains essential. Addressing debt distress: Progress has been made for countries that requested debt treatment under the Group of Twenty’s Common Framework initiative, and more will be needed to strengthen it. It is also necessary to agree on mechanisms to resolve debt in a broader set of economies, including middle-income countries that are not eligible under the Common Framework. Non– Paris Club and private creditors have a crucial role to play in ensuring coordinated, effective, and timely debt resolution processes."
|
||||
"text": "Restraining the pandemic: Global coordination is needed to resolve bottlenecks in the global distribution of vaccines and treatments. Public support for the development of new vaccine technologies and the design of systematic responses to future epidemics also remains essential. Addressing debt distress: Progress has been made for countries that requested debt treatment under the Group of Twenty’s Common Framework initiative, and more will be needed to strengthen it. It is also necessary to agree on mechanisms to resolve debt in a broader set of economies, including middle-income countries that are not eligible under the Common Framework. Non– Paris Club and private creditors have a crucial role to play in ensuring coordinated, effective, and timely debt resolution processes."
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
@ -1917,17 +1797,7 @@
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 10
|
||||
},
|
||||
"text": "Strengthening global trade: Strengthening the global trading system would address risks associated"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "1a3ec03df542f6e1c2576eba1adb11d9",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 10
|
||||
},
|
||||
"text": "with trade fragmentation. This can be achieved by rolling back restrictions on food exports and other essential items such as medicine, upgrading World Trade Organization (WTO) rules in critical areas such as agricultural and industrial subsidies, concluding and implementing new WTO-based agreements, and fully restoring the WTO dispute settlement system."
|
||||
"text": "Strengthening global trade: Strengthening the global trading system would address risks associated with trade fragmentation. This can be achieved by rolling back restrictions on food exports and other essential items such as medicine, upgrading World Trade Organization (WTO) rules in critical areas such as agricultural and industrial subsidies, concluding and implementing new WTO-based agreements, and fully restoring the WTO dispute settlement system."
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
@ -1947,17 +1817,7 @@
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 10
|
||||
},
|
||||
"text": "Speeding the green transition: To meet governments’ climate change goals, it is necessary to swiftly"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "77ac1fdd449fba59a90d978745964463",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 10
|
||||
},
|
||||
"text": "implement credible mitigation policies. International coordination on carbon pricing or equivalent policies would facilitate faster decarbonization. Global cooperation is needed to build resilience to climate shocks, including through aid to vulnerable countries."
|
||||
"text": "Speeding the green transition: To meet governments’ climate change goals, it is necessary to swiftly implement credible mitigation policies. International coordination on carbon pricing or equivalent policies would facilitate faster decarbonization. Global cooperation is needed to build resilience to climate shocks, including through aid to vulnerable countries."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
@ -2260,7 +2120,7 @@
|
||||
"text": "6"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "ListItem",
|
||||
"element_id": "7d4f55875c970d850a152ba1d5ba02a5",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -2270,7 +2130,7 @@
|
||||
"text": "1. United States"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "ListItem",
|
||||
"element_id": "8e655408cf212df5f74df13e05cdf02c",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
|
||||
@ -240,7 +240,7 @@
|
||||
"text": "Jan. 2019"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "ListItem",
|
||||
"element_id": "63e35649dd179389ecc7251e1503489a",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -250,7 +250,7 @@
|
||||
"text": "1. Headline Inflation"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "ListItem",
|
||||
"element_id": "b790ab5fcad28bbedb50b568b3adeca2",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -2930,7 +2930,7 @@
|
||||
"text": "Apr. 23"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "ListItem",
|
||||
"element_id": "7d4f55875c970d850a152ba1d5ba02a5",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
@ -3040,7 +3040,7 @@
|
||||
"text": "October 2022 GFSR"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "ListItem",
|
||||
"element_id": "8e655408cf212df5f74df13e05cdf02c",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
|
||||
@ -145,3 +145,8 @@ IMAGE_URL_PATTERN = (
|
||||
r"(?:/[a-z0-9$_@.&+!*\\(\\),%-]*)*"
|
||||
r"\.(?:jpg|jpeg|png|gif|bmp)"
|
||||
)
|
||||
|
||||
# NOTE(klaijan) - only supports single-level numbered lists for now
|
||||
# e.g. 1. 2. 3. or 1) 2) 3), not 1.1 1.2 1.3
|
||||
NUMBERED_LIST_PATTERN = r"^\d+(\.|\))\s(.+)"
|
||||
NUMBERED_LIST_RE = re.compile(NUMBERED_LIST_PATTERN)
|
||||
|
||||
@ -18,6 +18,7 @@ from unstructured.documents.elements import (
|
||||
Element,
|
||||
ElementMetadata,
|
||||
Image,
|
||||
ListItem,
|
||||
PageBreak,
|
||||
Text,
|
||||
process_metadata,
|
||||
@ -43,7 +44,10 @@ from unstructured.partition.lang import (
|
||||
from unstructured.partition.strategies import determine_pdf_or_image_strategy
|
||||
from unstructured.partition.text import element_from_text, partition_text
|
||||
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT
|
||||
from unstructured.partition.utils.sorting import sort_page_elements
|
||||
from unstructured.partition.utils.sorting import (
|
||||
coord_has_valid_points,
|
||||
sort_page_elements,
|
||||
)
|
||||
from unstructured.utils import requires_dependencies
|
||||
|
||||
RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL)
|
||||
@ -475,6 +479,50 @@ def _process_pdfminer_pages(
|
||||
last_modified=metadata_last_modified,
|
||||
)
|
||||
page_elements.append(element)
|
||||
list_item = 0
|
||||
updated_page_elements = [] # type: ignore
|
||||
coordinate_system = PixelSpace(width=width, height=height)
|
||||
for page_element in page_elements:
|
||||
if isinstance(page_element, ListItem):
|
||||
list_item += 1
|
||||
list_page_element = page_element
|
||||
list_item_text = page_element.text
|
||||
list_item_coords = page_element.metadata.coordinates
|
||||
elif list_item > 0 and check_coords_within_boundary(
|
||||
page_element.metadata.coordinates,
|
||||
list_item_coords,
|
||||
):
|
||||
text = page_element.text # type: ignore
|
||||
list_item_text = list_item_text + " " + text
|
||||
x1 = min(
|
||||
list_page_element.metadata.coordinates.points[0][0],
|
||||
page_element.metadata.coordinates.points[0][0],
|
||||
)
|
||||
x2 = max(
|
||||
list_page_element.metadata.coordinates.points[2][0],
|
||||
page_element.metadata.coordinates.points[2][0],
|
||||
)
|
||||
y1 = min(
|
||||
list_page_element.metadata.coordinates.points[0][1],
|
||||
page_element.metadata.coordinates.points[0][1],
|
||||
)
|
||||
y2 = max(
|
||||
list_page_element.metadata.coordinates.points[1][1],
|
||||
page_element.metadata.coordinates.points[1][1],
|
||||
)
|
||||
points = ((x1, y1), (x1, y2), (x2, y2), (x2, y1))
|
||||
list_page_element.text = list_item_text
|
||||
list_page_element.metadata.coordinates = CoordinatesMetadata(
|
||||
points=points,
|
||||
system=coordinate_system,
|
||||
)
|
||||
page_element = list_page_element
|
||||
updated_page_elements.pop()
|
||||
|
||||
updated_page_elements.append(page_element)
|
||||
|
||||
page_elements = updated_page_elements
|
||||
del updated_page_elements
|
||||
|
||||
# NOTE(crag, christine): always do the basic sort first for deterministic order across
|
||||
# python versions.
|
||||
@ -650,3 +698,44 @@ def _partition_pdf_or_image_with_ocr(
|
||||
if include_page_breaks:
|
||||
elements.append(PageBreak(text=""))
|
||||
return elements
|
||||
|
||||
|
||||
def check_coords_within_boundary(
    coordinates: CoordinatesMetadata,
    boundary: CoordinatesMetadata,
    horizontal_threshold: float = 0.2,
    vertical_threshold: float = 0.3,
) -> bool:
    """Checks if the coordinates are within boundary thresholds.

    Only the first corner of `coordinates` (``points[0]``) and its right edge
    (``points[2][0]``) are compared against `boundary`. Both inputs are read with the
    same point layout: ``points[0]`` supplies (x_min, y_min), ``points[1][1]`` supplies
    y_max and ``points[2][0]`` supplies x_max.

    Parameters
    ----------
    coordinates
        a CoordinatesMetadata input
    boundary
        a CoordinatesMetadata to compare against
    horizontal_threshold
        a float in the range [0, 1]; the fraction of the boundary width by which the
        horizontal (x-axis) limits are relaxed
    vertical_threshold
        a float in the range [0, 1]; the fraction of the boundary height by which the
        lower vertical (y-axis) limit is relaxed

    Returns
    -------
    bool
        True when the coordinates satisfy both the horizontal and the vertical check.

    Raises
    ------
    ValueError
        If either `coordinates` or `boundary` fails `coord_has_valid_points`.
    """
    # The comparison below dereferences the points of BOTH arguments, so the call must be
    # rejected as soon as any one of them is invalid, not only when both are.
    if not (coord_has_valid_points(coordinates) and coord_has_valid_points(boundary)):
        raise ValueError("Invalid coordinates.")

    boundary_x_min = boundary.points[0][0]
    boundary_x_max = boundary.points[2][0]
    boundary_y_min = boundary.points[0][1]
    boundary_y_max = boundary.points[1][1]

    # Tolerances are expressed relative to the size of the boundary box.
    x_slack = horizontal_threshold * (boundary_x_max - boundary_x_min)
    y_slack = vertical_threshold * (boundary_y_max - boundary_y_min)

    x_start = coordinates.points[0][0]
    x_end = coordinates.points[2][0]
    y_start = coordinates.points[0][1]

    # The left edge must start at or after the boundary's left edge, but no further right
    # than the slack allows; the right edge may overshoot the boundary by at most the slack.
    x_within_boundary = (
        boundary_x_min <= x_start < boundary_x_min + x_slack
        and x_end < boundary_x_max + x_slack
    )
    # The top must lie strictly below the boundary's top and no further down than the
    # boundary's bottom plus the slack.
    y_within_boundary = boundary_y_min < y_start < boundary_y_max + y_slack

    return x_within_boundary and y_within_boundary
|
||||
|
||||
@ -32,6 +32,7 @@ from unstructured.partition.text_type import (
|
||||
is_bulleted_text,
|
||||
is_email_address,
|
||||
is_possible_narrative_text,
|
||||
is_possible_numbered_list,
|
||||
is_possible_title,
|
||||
is_us_city_state_zip,
|
||||
)
|
||||
@ -277,6 +278,12 @@ def element_from_text(
|
||||
coordinates=coordinates,
|
||||
coordinate_system=coordinate_system,
|
||||
)
|
||||
elif is_possible_numbered_list(text):
|
||||
return ListItem(
|
||||
text=text,
|
||||
coordinates=coordinates,
|
||||
coordinate_system=coordinate_system,
|
||||
)
|
||||
elif is_possible_narrative_text(text):
|
||||
return NarrativeText(
|
||||
text=text,
|
||||
|
||||
@ -15,6 +15,7 @@ from unstructured.nlp.english_words import ENGLISH_WORDS
|
||||
from unstructured.nlp.patterns import (
|
||||
EMAIL_ADDRESS_PATTERN_RE,
|
||||
ENDS_IN_PUNCT_RE,
|
||||
NUMBERED_LIST_RE,
|
||||
UNICODE_BULLETS_RE,
|
||||
US_CITY_STATE_ZIP_RE,
|
||||
US_PHONE_NUMBERS_RE,
|
||||
@ -308,3 +309,8 @@ def is_us_city_state_zip(text) -> bool:
|
||||
def is_email_address(text) -> bool:
|
||||
"""Check if the given text is the email address"""
|
||||
return EMAIL_ADDRESS_PATTERN_RE.match(text.strip()) is not None
|
||||
|
||||
|
||||
def is_possible_numbered_list(text) -> bool:
    """Report whether the text looks like an item of a numbered list.

    Surrounding whitespace is stripped from the text, and the result is then matched from
    its start against NUMBERED_LIST_RE.
    """
    stripped_text = text.strip()
    numbered_match = NUMBERED_LIST_RE.match(stripped_text)
    return numbered_match is not None
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user