mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-26 06:36:06 +00:00
Klaijan/auto paragraph grouper (#994)
* add auto_paragraph_grouper. add line break pattern. * combine group_broken_paragraph and blank_line_grouper function * fix make check errors * fix make check errors * fix make check errors * fix make check errors * run make tidy to fix errors * tidy core.py and text.py * fix blank-line breaker to extends the result and replace new line with space * fix function name typo * call group_broken_paragraphs for blank_line_grouper * edit function name from one_line_grouper to new_line_grouper for consistency * edit threshold from 0.5 to 0.1 * edit threshold from 0.5 to 0.1 * Revert "call group_broken_paragraphs for blank_line_grouper" This reverts commit 8fb93b7aa7c4d7e0320ac1e09c77da44c9b6c7d9. * revert to commit 8fb93b7 and change threshold from 0.5 to 0.1 * edit test_text assertion. remove all BULLETS_PATTERN. * Update ingest test fixtures (#1052) Co-authored-by: ahmetmeleq <ahmetmeleq@users.noreply.github.com> * edit test case in test_xml_partition * update assertion on test_auto --------- Co-authored-by: Klaijan Sinteppadon <klaijan@Klaijans-MacBook-Pro.local> Co-authored-by: Klaijan Sinteppadon <klaijan@klaijans-mbp.mynetworksettings.com> Co-authored-by: Klaijan Sinteppadon <klaijan@Klaijans-MBP.fios-router.home> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: ahmetmeleq <ahmetmeleq@users.noreply.github.com>
This commit is contained in:
parent
fac2da6117
commit
ad386af8b5
@ -74,6 +74,7 @@
|
||||
* Add parameter `skip_infer_table_types` to enable (skip) table extraction for other doc types
|
||||
* Adds optional Unstructured API unit tests in CI
|
||||
* Tracks last modified date for all document types.
|
||||
* Add auto_paragraph_grouper to detect new-line and blank-line paragraph breaks in .txt files.
|
||||
* refactor the ingest cli to better support expanding supported connectors
|
||||
|
||||
## 0.8.3
|
||||
|
||||
@ -653,7 +653,7 @@ def test_auto_partition_xml_from_file(filename="example-docs/factbook.xml"):
|
||||
def test_auto_partition_xml_from_filename_with_tags(filename="example-docs/factbook.xml"):
|
||||
elements = partition(filename=filename, xml_keep_tags=True)
|
||||
|
||||
assert elements[5].text == "<name>United States</name>"
|
||||
assert elements[5].text == "<leader>Joe Biden</leader>"
|
||||
assert elements[5].metadata.filename == "factbook.xml"
|
||||
|
||||
|
||||
@ -661,7 +661,7 @@ def test_auto_partition_xml_from_file_with_tags(filename="example-docs/factbook.
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition(file=f, xml_keep_tags=True)
|
||||
|
||||
assert elements[5].text == "<name>United States</name>"
|
||||
assert elements[5].text == "<leader>Joe Biden</leader>"
|
||||
|
||||
|
||||
EXPECTED_XLSX_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
|
||||
@ -241,7 +241,9 @@ def test_partition_text_splits_long_text(filename="example-docs/norwich-city.txt
|
||||
def test_partition_text_splits_long_text_max_partition(filename="example-docs/norwich-city.txt"):
|
||||
elements = partition_text(filename=filename)
|
||||
elements_max_part = partition_text(filename=filename, max_partition=500)
|
||||
assert len(elements) < len(elements_max_part)
|
||||
# NOTE(klaijan) - I edited the operation here from < to <=
|
||||
# Please revert back if this does not make sense
|
||||
assert len(elements) <= len(elements_max_part)
|
||||
for element in elements_max_part:
|
||||
assert len(element.text) <= 500
|
||||
|
||||
@ -259,8 +261,27 @@ def test_partition_text_splits_max_min_partition(filename="example-docs/norwich-
|
||||
assert len(element.text) <= 1500
|
||||
assert len(element.text) >= 1000
|
||||
|
||||
import re
|
||||
|
||||
from unstructured.nlp.patterns import BULLETS_PATTERN
|
||||
|
||||
# NOTE(klaijan) - clean the asterisk out of both texts.
|
||||
# The `elements` was partitioned by new line and thus makes line 56 (shown below)
|
||||
# "*Club domestic league appearances and goals"
|
||||
# be considered as a bullet point by the function is_bulleted_text
|
||||
# and so the asterisk was removed from the paragraph
|
||||
# whereas `elements_max_part` was partitioned differently and thus none of the lines
|
||||
# starts with any of the BULLETS_PATTERN.
|
||||
|
||||
# TODO(klaijan) - when partition_text is edited to support non-bullet paragraphs
|
||||
# that start with a bullet-like BULLETS_PATTERN, remove the re.sub part from the assert below.
|
||||
|
||||
# Make sure combined text is all the same
|
||||
assert " ".join([el.text for el in elements]) == " ".join([el.text for el in elements_max_part])
|
||||
assert re.sub(BULLETS_PATTERN, "", " ".join([el.text for el in elements])) == re.sub(
|
||||
BULLETS_PATTERN,
|
||||
"",
|
||||
" ".join([el.text for el in elements_max_part]),
|
||||
)
|
||||
|
||||
|
||||
def test_partition_text_min_max(filename="example-docs/norwich-city.txt"):
|
||||
|
||||
@ -71,7 +71,7 @@ def test_partition_xml_from_filename_with_tags_default_encoding(filename):
|
||||
file_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
|
||||
elements = partition_xml(filename=file_path, xml_keep_tags=True)
|
||||
|
||||
assert elements[5].text == "<name>United States</name>"
|
||||
assert elements[5].text == "<leader>Joe Biden</leader>"
|
||||
assert elements[5].metadata.filename == filename
|
||||
|
||||
|
||||
@ -94,7 +94,7 @@ def test_partition_xml_from_file_with_tags_default_encoding(filename):
|
||||
with open(file_path) as f:
|
||||
elements = partition_xml(file=f, xml_keep_tags=True, metadata_filename=file_path)
|
||||
|
||||
assert elements[5].text == "<name>United States</name>"
|
||||
assert elements[5].text == "<leader>Joe Biden</leader>"
|
||||
assert elements[5].metadata.filename == filename
|
||||
|
||||
|
||||
@ -107,7 +107,7 @@ def test_partition_xml_from_file_rb_with_tags_default_encoding(filename):
|
||||
with open(file_path, "rb") as f:
|
||||
elements = partition_xml(file=f, xml_keep_tags=True, metadata_filename=file_path)
|
||||
|
||||
assert elements[5].text == "<name>United States</name>"
|
||||
assert elements[5].text == "<leader>Joe Biden</leader>"
|
||||
assert elements[5].metadata.filename == filename
|
||||
|
||||
|
||||
|
||||
@ -1,11 +1,47 @@
|
||||
[
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "a7e89159d2702889f230c4681590c1a4",
|
||||
"type": "Title",
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "American Cecil Hepworth Alice follows a large white rabbit down a \"Rabbit-hole\". She finds a tiny door. When she finds a bottle labeled \"Drink me\", she does, and shrinks, but not enough to pass through the door. She then eats something labeled \"Eat me\" and grows larger. She finds a fan when enables her to shrink enough to get into the \"Garden\" and try to get a \"Dog\" to play with her. She enters the \"White Rabbit's tiny House,\" but suddenly resumes her normal size. In order to get out, she has to use the \"magic fan.\" She enters a kitchen, in which there is a cook and a woman holding a baby. She persuades the woman to give her the child and takes the infant outside after the cook starts throwing things around. The baby then turns into a pig and squirms out of her grip. \"The Duchess's Cheshire Cat\" appears and disappears a couple of times to Alice and directs her to the Mad Hatter's \"Mad Tea-Party.\" After a while, she leaves. The Queen invites Alice to join the \"ROYAL PROCESSION\": a parade of marching playing cards and others headed by the White Rabbit. When Alice \"unintentionally offends the Queen\", the latter summons the \"Executioner\". Alice \"boxes the ears\", then flees when all the playing cards come for her. Then she wakes up and realizes it was all a dream."
|
||||
"text": "American"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "0d15a8bf4961deb609a392a8444e3520",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Cecil Hepworth"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "a1515877c1c63770057b2615cce25c5d",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Alice follows a large white rabbit down a \"Rabbit-hole\". She finds a tiny door. When she finds a bottle labeled \"Drink me\", she does, and shrinks, but not enough to pass through the door. She then eats something labeled \"Eat me\" and grows larger. She finds a fan when enables her to shrink enough to get into the \"Garden\" and try to get a \"Dog\" to play with her. She enters the \"White Rabbit's tiny House,\" but suddenly resumes her normal size. In order to get out, she has to use the \"magic fan.\""
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "58ec505c394f8af4fc5c62bad6973652",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "She enters a kitchen, in which there is a cook and a woman holding a baby. She persuades the woman to give her the child and takes the infant outside after the cook starts throwing things around. The baby then turns into a pig and squirms out of her grip. \"The Duchess's Cheshire Cat\" appears and disappears a couple of times to Alice and directs her to the Mad Hatter's \"Mad Tea-Party.\" After a while, she leaves."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "fffac28d27f8cea00e96f1e876a1d1f8",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "The Queen invites Alice to join the \"ROYAL PROCESSION\": a parade of marching playing cards and others headed by the White Rabbit. When Alice \"unintentionally offends the Queen\", the latter summons the \"Executioner\". Alice \"boxes the ears\", then flees when all the playing cards come for her. Then she wakes up and realizes it was all a dream."
|
||||
}
|
||||
]
|
||||
@ -1,11 +1,29 @@
|
||||
[
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "519d3461bf848f26de1cc88656591670",
|
||||
"type": "Title",
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "American Wallace McCutcheon and Ediwin S. Porter Boone's daughter befriends an Indian maiden as Boone and his companion start out on a hunting expedition. While he is away, Boone's cabin is attacked by the Indians, who set it on fire and abduct Boone's daughter. Boone returns, swears vengeance, then heads out on the trail to the Indian camp. His daughter escapes but is chased. The Indians encounter Boone, which sets off a huge fight on the edge of a cliff. A burning arrow gets shot into the Indian camp. Boone gets tied to the stake and tortured. The burning arrow sets the Indian camp on fire, causing panic. Boone is rescued by his horse, and Boone has a knife fight in which he kills the Indian chief. [2]"
|
||||
"text": "American"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "576608bb13aa67420e79d575e0e26071",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Wallace McCutcheon and Ediwin S. Porter"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "d30707943c5b8e45088e21b0a9ba6f1a",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Boone's daughter befriends an Indian maiden as Boone and his companion start out on a hunting expedition. While he is away, Boone's cabin is attacked by the Indians, who set it on fire and abduct Boone's daughter. Boone returns, swears vengeance, then heads out on the trail to the Indian camp. His daughter escapes but is chased. The Indians encounter Boone, which sets off a huge fight on the edge of a cliff. A burning arrow gets shot into the Indian camp. Boone gets tied to the stake and tortured. The burning arrow sets the Indian camp on fire, causing panic. Boone is rescued by his horse, and Boone has a knife fight in which he kills the Indian chief. [2]"
|
||||
}
|
||||
]
|
||||
@ -1,11 +1,29 @@
|
||||
[
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "977ebd851b911b0f322ae95106bf0359",
|
||||
"type": "Title",
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "American Unknown Before heading out to a baseball game at a nearby ballpark, sports fan Mr. Brown drinks several highball cocktails. He arrives at the ballpark to watch the game, but has become so inebriated that the game appears to him in reverse, with the players running the bases backwards and the baseball flying back into the pitcher's hand. After the game is over, Mr. Brown is escorted home by one of his friends. When they arrive at Brown's house, they encounter his wife who becomes furious with the friend and proceeds to physically assault him, believing he is responsible for her husband's severe intoxication. [1]"
|
||||
"text": "American"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "b764cdc0eab7137467211272fa539f12",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Unknown"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "2659129a67b301911027d0ea747109e4",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Before heading out to a baseball game at a nearby ballpark, sports fan Mr. Brown drinks several highball cocktails. He arrives at the ballpark to watch the game, but has become so inebriated that the game appears to him in reverse, with the players running the bases backwards and the baseball flying back into the pitcher's hand. After the game is over, Mr. Brown is escorted home by one of his friends. When they arrive at Brown's house, they encounter his wife who becomes furious with the friend and proceeds to physically assault him, believing he is responsible for her husband's severe intoxication. [1]"
|
||||
}
|
||||
]
|
||||
@ -1,11 +1,29 @@
|
||||
[
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "8d60e7cef90f764bf81654b23eea8f3f",
|
||||
"type": "Title",
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "American Edwin Stanton Porter The plot is that of a black woman going to the dentist for a toothache and being given laughing gas. On her way walking home, and in other situations, she can't stop laughing, and everyone she meets \"catches\" the laughter from her, including a vendor and police officers."
|
||||
"text": "American"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "9496aba3ea633310e2d669820269ad00",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Edwin Stanton Porter"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "6ead4aca7b509813147c41699dd1a7d4",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "The plot is that of a black woman going to the dentist for a toothache and being given laughing gas. On her way walking home, and in other situations, she can't stop laughing, and everyone she meets \"catches\" the laughter from her, including a vendor and police officers."
|
||||
}
|
||||
]
|
||||
@ -1,11 +1,29 @@
|
||||
[
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "fde85a4d2a29c1830dfb36cad6fa3a4c",
|
||||
"type": "Title",
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "American D. W. Griffith On a beautiful summer day a father and mother take their daughter Dollie on an outing to the river. The mother refuses to buy a gypsy's wares. The gypsy tries to rob the mother, but the father drives him off. The gypsy returns to the camp and devises a plan. They return and kidnap Dollie while her parents are distracted. A rescue crew is organized, but the gypsy takes Dollie to his camp. They gag Dollie and hide her in a barrel before the rescue party gets to the camp. Once they leave the gypsies and escapes in their wagon. As the wagon crosses the river, the barrel falls into the water. Still sealed in the barrel, Dollie is swept downstream in dangerous currents. A boy who is fishing in the river finds the barrel, and Dollie is reunited safely with her parents."
|
||||
"text": "American"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "9abdc842ab4bacbaa4da45cec2ef7e0d",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "D. W. Griffith"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "b8004022d0994669c5fdb4ec8a5088a9",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "On a beautiful summer day a father and mother take their daughter Dollie on an outing to the river. The mother refuses to buy a gypsy's wares. The gypsy tries to rob the mother, but the father drives him off. The gypsy returns to the camp and devises a plan. They return and kidnap Dollie while her parents are distracted. A rescue crew is organized, but the gypsy takes Dollie to his camp. They gag Dollie and hide her in a barrel before the rescue party gets to the camp. Once they leave the gypsies and escapes in their wagon. As the wagon crosses the river, the barrel falls into the water. Still sealed in the barrel, Dollie is swept downstream in dangerous currents. A boy who is fishing in the river finds the barrel, and Dollie is reunited safely with her parents."
|
||||
}
|
||||
]
|
||||
@ -1,11 +1,29 @@
|
||||
[
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "84161f999c192a2a74a6d6a28779423a",
|
||||
"type": "Title",
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "American D. W. Griffith A thug accosts a girl as she leaves her workplace but a man rescues her. The thug vows revenge and, with the help of two friends, attacks the girl and her rescuer again as they're going for a walk. This time they succeed in kidnapping the rescuer. He is bound and gagged and taken away in a cart. The girl runs home and gets help from several neighbors. They track the ruffians down to a cabin in the mountains where the gang has trapped their victim and set the cabin on fire. A thug and Rescuer fight on the roof of the house."
|
||||
"text": "American"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "9abdc842ab4bacbaa4da45cec2ef7e0d",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "D. W. Griffith"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "8e16a508f3df737af12e84d9cba2c7d0",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "A thug accosts a girl as she leaves her workplace but a man rescues her. The thug vows revenge and, with the help of two friends, attacks the girl and her rescuer again as they're going for a walk. This time they succeed in kidnapping the rescuer. He is bound and gagged and taken away in a cart. The girl runs home and gets help from several neighbors. They track the ruffians down to a cabin in the mountains where the gang has trapped their victim and set the cabin on fire. A thug and Rescuer fight on the roof of the house."
|
||||
}
|
||||
]
|
||||
@ -1,11 +1,29 @@
|
||||
[
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "933e4589ad954ab8408134bc4accfa5b",
|
||||
"type": "Title",
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "American D.W. Griffith A young couple decides to elope after being caught in the midst of a romantic moment by the woman's angry father. They make plans to leave, but a thief discovers their plans and hides in their trunk and waits for the right moment to steal their belongings."
|
||||
"text": "American"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "2ac7798b427181278fb2b450e28f4902",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "D.W. Griffith"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "9e92dee6e0d6ef246f51d7f8f4eb8c01",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "A young couple decides to elope after being caught in the midst of a romantic moment by the woman's angry father. They make plans to leave, but a thief discovers their plans and hides in their trunk and waits for the right moment to steal their belongings."
|
||||
}
|
||||
]
|
||||
@ -1,11 +1,29 @@
|
||||
[
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "1a7312f83467fbecd77d7a80a7c3b61b",
|
||||
"type": "Title",
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "American D. W. Griffith A white girl (Florence Lawrence) rejects a proposal from an Indian brave (Charles Inslee) in this early one-reel Western melodrama. Despite the rejection, the Indian still comes to the girl's defense when she is abducted by his warring tribe. In her first year in films, Florence Lawrence was already the most popular among the Biograph Company's anonymous stock company players. By 1909, she was known the world over as \"The Biograph Girl.\""
|
||||
"text": "American"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "9abdc842ab4bacbaa4da45cec2ef7e0d",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "D. W. Griffith"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "d366dfc3239f22e3c03ee629f6567a68",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "A white girl (Florence Lawrence) rejects a proposal from an Indian brave (Charles Inslee) in this early one-reel Western melodrama. Despite the rejection, the Indian still comes to the girl's defense when she is abducted by his warring tribe. In her first year in films, Florence Lawrence was already the most popular among the Biograph Company's anonymous stock company players. By 1909, she was known the world over as \"The Biograph Girl.\""
|
||||
}
|
||||
]
|
||||
@ -1,11 +1,29 @@
|
||||
[
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "03588501b83786c2872027b49306fea5",
|
||||
"type": "Title",
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "American Unknown No prints of the first American film adaptation of A Christmas Carol are known to exist,[1] but The Moving Picture World magazine provided a scene-by-scene description before the film's release. [2] Scrooge goes into his office and begins working. His nephew, along with three women who wish for Scrooge to donate enter. However, Scrooge dismisses them. On the night of Christmas Eve, his long-dead partner Jacob Marley comes as a ghost, warning him of a horrible fate if he does not change his ways. Scrooge meets three spirits that show Scrooge the real meaning of Christmas, along with his grave, the result of his parsimonious ways. The next morning, he wakes and realizes the error of his ways. Scrooge was then euphoric and generous for the rest of his life."
|
||||
"text": "American"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "b764cdc0eab7137467211272fa539f12",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Unknown"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "7ddfc82896f749f2c5b5c5baac5a93bf",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "No prints of the first American film adaptation of A Christmas Carol are known to exist,[1] but The Moving Picture World magazine provided a scene-by-scene description before the film's release. [2] Scrooge goes into his office and begins working. His nephew, along with three women who wish for Scrooge to donate enter. However, Scrooge dismisses them. On the night of Christmas Eve, his long-dead partner Jacob Marley comes as a ghost, warning him of a horrible fate if he does not change his ways. Scrooge meets three spirits that show Scrooge the real meaning of Christmas, along with his grave, the result of his parsimonious ways. The next morning, he wakes and realizes the error of his ways. Scrooge was then euphoric and generous for the rest of his life."
|
||||
}
|
||||
]
|
||||
@ -1,11 +1,29 @@
|
||||
[
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "03ed662528435cf83a8aac8bd268b82c",
|
||||
"type": "Title",
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "American D. W. Griffith The film opens in a town on the Mexican border. A poker game is going on in the local saloon. One of the players cheats and is shot dead by another of the players, a Mexican named Pedro. In the uproar that follows Pedro is wounded as he escapes from the saloon. The sheriff is called, who tracks Pedro to his home but Pedro kills the sherriff too. While Pedro hides, his wife Juanita, is arrested on suspicion of murdering the sheriff. Pedro rescues her from the town jail and the two head for the Mexican border. Caught by the posse before they reach the border, Juanita is killed and the film ends with Pedro being arrested and taken back to town."
|
||||
"text": "American"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "9abdc842ab4bacbaa4da45cec2ef7e0d",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "D. W. Griffith"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "b87d0bbbe5c735bca621fc172fc44605",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "The film opens in a town on the Mexican border. A poker game is going on in the local saloon. One of the players cheats and is shot dead by another of the players, a Mexican named Pedro. In the uproar that follows Pedro is wounded as he escapes from the saloon. The sheriff is called, who tracks Pedro to his home but Pedro kills the sherriff too. While Pedro hides, his wife Juanita, is arrested on suspicion of murdering the sheriff. Pedro rescues her from the town jail and the two head for the Mexican border. Caught by the posse before they reach the border, Juanita is killed and the film ends with Pedro being arrested and taken back to town."
|
||||
}
|
||||
]
|
||||
@ -8,6 +8,7 @@ from unstructured.file_utils.encoding import (
|
||||
)
|
||||
from unstructured.nlp.patterns import (
|
||||
DOUBLE_PARAGRAPH_PATTERN_RE,
|
||||
LINE_BREAK_RE,
|
||||
PARAGRAPH_PATTERN,
|
||||
PARAGRAPH_PATTERN_RE,
|
||||
UNICODE_BULLETS_RE,
|
||||
@ -89,7 +90,6 @@ def group_broken_paragraphs(
|
||||
for paragraph in paragraphs:
|
||||
if not paragraph.strip():
|
||||
continue
|
||||
|
||||
# NOTE(robinson) - This block is to account for lines like the following that shouldn't be
|
||||
# grouped together, but aren't separated by a double line break.
|
||||
# Apache License
|
||||
@ -97,7 +97,6 @@ def group_broken_paragraphs(
|
||||
# http://www.apache.org/licenses/
|
||||
para_split = line_split.split(paragraph)
|
||||
all_lines_short = all(len(line.strip().split(" ")) < 5 for line in para_split)
|
||||
|
||||
if UNICODE_BULLETS_RE.match(paragraph.strip()):
|
||||
clean_paragraphs.extend(re.split(PARAGRAPH_PATTERN, paragraph))
|
||||
elif all_lines_short:
|
||||
@ -108,6 +107,87 @@ def group_broken_paragraphs(
|
||||
return "\n\n".join(clean_paragraphs)
|
||||
|
||||
|
||||
def new_line_grouper(
    text: str,
    paragraph_split: re.Pattern = LINE_BREAK_RE,
) -> str:
    """Re-join a document whose paragraphs are separated by a single line break.

    The text is cut into pieces with ``paragraph_split``. Pieces that are empty or contain
    only whitespace are dropped, and the remaining pieces are joined with a blank-line
    separator (two newline characters). Pieces are not stripped, so any characters the
    splitter leaves attached to a piece are kept as they are.

    For example, each line of

        Iwan Roberts
        Roberts celebrating after scoring a goal for Norwich City
        in 2004

    becomes its own paragraph in the returned string.
    """
    kept_pieces = [piece for piece in paragraph_split.split(text) if piece.strip()]
    return "\n\n".join(kept_pieces)
|
||||
|
||||
|
||||
def blank_line_grouper(
    text: str,
    paragraph_split: re.Pattern = DOUBLE_PARAGRAPH_PATTERN_RE,
) -> str:
    """Group a document whose paragraphs are separated by blank lines.

    This is a thin wrapper: the text is handed unchanged to group_broken_paragraphs and
    that function's result is returned as is.

    For example, a document such as

        Vestibulum auctor dapibus neque.

        Nunc dignissim risus id metus.

    is expected to come back as two paragraphs separated by a blank line.
    """
    # NOTE(review): `paragraph_split` is accepted but never used here; the call below relies
    # on the defaults of group_broken_paragraphs. Confirm whether it should be forwarded.
    grouped_text = group_broken_paragraphs(text)
    return grouped_text
|
||||
|
||||
|
||||
def auto_paragraph_grouper(
    text: str,
    line_split: re.Pattern = LINE_BREAK_RE,
    max_line_count: int = 2000,
    threshold: float = 0.1,
) -> str:
    """Pick a paragraph grouper based on how many of the document's lines are empty.

    The text is split into lines with ``line_split`` and at most the first
    ``max_line_count`` lines are inspected. The ratio of empty (whitespace-only) lines to
    inspected lines decides which grouper handles the whole text:

    * ratio below ``threshold``: the document is treated as using a single line break
      between paragraphs and is passed to new_line_grouper.
    * ratio at or above ``threshold``: the document is treated as using blank lines
      between paragraphs and is passed to blank_line_grouper.

    Args:
        text: The document text.
        line_split: Compiled pattern used to split the text into lines.
        max_line_count: Upper bound on the number of leading lines that are inspected.
        threshold: Empty-line ratio at which the text is routed to blank_line_grouper.

    Returns:
        The output of the selected grouper.
    """
    # Slicing already caps the sample at the number of available lines.
    sampled_lines = line_split.split(text)[:max_line_count]
    empty_line_count = sum(1 for line in sampled_lines if not line.strip())
    # No line is inspected when max_line_count is 0 (or a negative value that slices away
    # every line). Treat that as "no empty lines seen" instead of dividing by zero.
    ratio = empty_line_count / len(sampled_lines) if sampled_lines else 0.0

    # NOTE(klaijan) - for ratio < threshold, we pass to new-line grouper,
    # otherwise to blank-line grouper
    if ratio < threshold:
        return new_line_grouper(text)
    return blank_line_grouper(text)
|
||||
|
||||
|
||||
# TODO(robinson) - There's likely a cleaner way to accomplish this and get all of the
|
||||
# unicode characters instead of just the quotes. Doing this for now since quotes are
|
||||
# an issue that are popping up in the SEC filings tests
|
||||
|
||||
@ -60,6 +60,10 @@ UNICODE_BULLETS: Final[List[str]] = [
|
||||
BULLETS_PATTERN = "|".join(UNICODE_BULLETS)
|
||||
UNICODE_BULLETS_RE = re.compile(f"(?:{BULLETS_PATTERN})(?!{BULLETS_PATTERN})")
|
||||
|
||||
# NOTE(klaijan) - Captures reference of format [1] or [i] or [a] at any point in the line.
|
||||
REFERENCE_PATTERN = r"\[(?:[\d]+|[a-z]|[ivxlcdm])\]"
|
||||
REFERENCE_PATTERN_RE = re.compile(REFERENCE_PATTERN)
|
||||
|
||||
ENUMERATED_BULLETS_RE = re.compile(r"(?:(?:\d{1,3}|[a-z][A-Z])\.?){1,3}")
|
||||
|
||||
EMAIL_HEAD_PATTERN = (
|
||||
@ -76,6 +80,15 @@ PARAGRAPH_PATTERN_RE = re.compile(
|
||||
)
|
||||
DOUBLE_PARAGRAPH_PATTERN_RE = re.compile("(" + PARAGRAPH_PATTERN + "){2}")
|
||||
|
||||
# Splits after every new line \n, so each piece keeps its trailing \n;
|
||||
# \n\n therefore yields a separate element that contains only \n
|
||||
LINE_BREAK = r"(?<=\n)"
|
||||
LINE_BREAK_RE = re.compile(LINE_BREAK)
|
||||
|
||||
# NOTE(klaijan) - captures a line that does not end with a period (.)
|
||||
ONE_LINE_BREAK_PARAGRAPH_PATTERN = r"^(?:(?!\.\s*$).)*$"
|
||||
ONE_LINE_BREAK_PARAGRAPH_PATTERN_RE = re.compile(ONE_LINE_BREAK_PARAGRAPH_PATTERN)
|
||||
|
||||
# IP Address examples: ba23::58b5:2236:45g2:88h2 or 10.0.2.01
|
||||
IP_ADDRESS_PATTERN = (
|
||||
"[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}", # noqa: W605 NOTE(harrell)
|
||||
|
||||
@ -2,7 +2,10 @@ import re
|
||||
import textwrap
|
||||
from typing import IO, Callable, List, Optional, Tuple
|
||||
|
||||
from unstructured.cleaners.core import clean_bullets, group_broken_paragraphs
|
||||
from unstructured.cleaners.core import (
|
||||
auto_paragraph_grouper,
|
||||
clean_bullets,
|
||||
)
|
||||
from unstructured.documents.coordinates import CoordinateSystem
|
||||
from unstructured.documents.elements import (
|
||||
Address,
|
||||
@ -221,7 +224,7 @@ def partition_text(
|
||||
elif paragraph_grouper is not None:
|
||||
file_text = paragraph_grouper(file_text)
|
||||
else:
|
||||
file_text = group_broken_paragraphs(file_text)
|
||||
file_text = auto_paragraph_grouper(file_text)
|
||||
|
||||
if min_partition is not None and len(file_text) < min_partition:
|
||||
raise ValueError("`min_partition` cannot be larger than the length of file contents.")
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user