add support for start_index in html links extraction (#2600)

add support for start_index in html links extraction (closes #2625)

Testing
```
from unstructured.partition.html import partition_html
from unstructured.staging.base import elements_to_json


html_text = """<html>
        <p>Hello there I am a <a href="/link">very important link!</a></p>
        <p>Here is a list of my favorite things</p>
        <ul>
            <li><a href="https://en.wikipedia.org/wiki/Parrot">Parrots</a></li>
            <li>Dogs</li>
        </ul>
        <a href="/loner">A lone link!</a>
    </html>"""

elements = partition_html(text=html_text)
print(elements_to_json(elements))
```

---------

Co-authored-by: Michael Niestroj <michael.niestroj@unblu.com>
Co-authored-by: christinestraub <christinemstraub@gmail.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
Co-authored-by: Ronny H <138828701+ron-unstructured@users.noreply.github.com>
This commit is contained in:
MiXiBo 2024-04-12 08:14:20 +02:00 committed by GitHub
parent 3e643c4cb3
commit 0506aff788
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
17 changed files with 330 additions and 12 deletions

View File

@ -2,6 +2,7 @@
### Enhancements
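* **Add support for `start_index` in `html` links extraction.** Links found while partitioning HTML now report the character offset at which the link text starts within its element, exposed as the new `link_start_indexes` metadata field alongside `link_texts` and `link_urls`. An element that is itself a link reports `-1`.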
* **Add `strategy` arg value to `_PptxPartitionerOptions`.** This makes this partitioning option available for sub-partitioners to come that may optionally use inference or other expensive operations to improve the partitioning.
* **Support pluggable sub-partitioner for PPTX Picture shapes.** Use a distinct sub-partitioner for partitioning PPTX Picture (image) shapes and allow the default picture sub-partitioner to be replaced at run-time by one of the user's choosing.

View File

@ -353,6 +353,49 @@ def test_get_emphasized_texts_from_tag(doc: str, root: str, expected: List[Dict[
assert emphasized_texts == expected
@pytest.mark.parametrize(
    ("doc", "root", "expected"),
    [
        # -- root tag is itself the link, so there is no offset to report: start_index is -1 --
        (
            "<a href='/loner'>A lone link!</a>",
            "a",
            [{"text": "A lone link!", "url": "/loner", "start_index": -1}],
        ),
        # -- no text precedes the only link; the link-less "Dogs" item produces no entry --
        (
            "<ul><li><a href='/wiki/Parrot'>Parrots</a></li><li>Dogs</li></ul>",
            "ul",
            [{"text": "Parrots", "url": "/wiki/Parrot", "start_index": 0}],
        ),
        # -- second link starts right after the 7 characters of "Parrots" --
        (
            "<ul><li><a href='/parrot'>Parrots</a></li><li><a href='/dog'>Dogs</a></li></ul>",
            "ul",
            [
                {"text": "Parrots", "url": "/parrot", "start_index": 0},
                {"text": "Dogs", "url": "/dog", "start_index": 7},
            ],
        ),
        # -- "Here is " (8) + "P tag" (5) + " tail text. " (12) precede the link --
        (
            "<div>Here is <p>P tag</p> tail text. <a href='/link'>link!</a></div>",
            "div",
            [{"text": "link!", "url": "/link", "start_index": 25}],
        ),
        # -- same as above without tail text: "Here is " (8) + "P tag" (5) --
        (
            "<div>Here is <p>P tag</p><a href='/link'>link!</a></div>",
            "div",
            [{"text": "link!", "url": "/link", "start_index": 13}],
        ),
    ],
)
def test_get_links_from_tag(doc: str, root: str, expected: List[Dict[str, object]]):
    """`_get_links_from_tag()` reports text, url and start_index for each link under `root`."""
    document_tree = etree.fromstring(doc, etree.HTMLParser())
    el = document_tree.find(f".//{root}")
    assert el is not None

    links = html._get_links_from_tag(el)

    assert links == expected
def test_parse_nothing():
doc = """<p></p>"""
document_tree = etree.fromstring(doc, etree.HTMLParser())

View File

@ -769,3 +769,39 @@ def test_partition_html_tag_tail_parsing():
element_text = "|".join([str(el).strip() for el in elements])
assert element_text == "Head|Nested|Tail"
def test_partition_html_links():
    """Each partitioned element reports text, url and start_index for every link it contains."""
    html_text = """<html>
<a href="/loner">A lone link!</a>
<p>Hello <a href="/link">link!</a></p>
<p>\n Hello <a href="/link">link!</a></p>
<p><a href="/wiki/parrots">Parrots</a> and <a href="/wiki/dogs">Dogs</a></p>
</html>"""

    # -- one list of expected links per element, in document order --
    expected_results = [
        # -- element is itself a link --
        [
            {"text": "A lone link!", "url": "/loner", "start_index": -1},
        ],
        # -- link follows the 6 characters of "Hello " --
        [
            {"text": "link!", "url": "/link", "start_index": 6},
        ],
        # -- leading whitespace before "Hello " is expected not to shift the offset --
        [
            {"text": "link!", "url": "/link", "start_index": 6},
        ],
        # -- "Dogs" follows "Parrots" (7) + " and " (5) --
        [
            {"text": "Parrots", "url": "/wiki/parrots", "start_index": 0},
            {"text": "Dogs", "url": "/wiki/dogs", "start_index": 12},
        ],
    ]

    elements = partition_html(text=html_text)

    # -- guard against silently passing when an element (and its links) is missing entirely --
    assert len(elements) == len(expected_results)

    for element, expected_links in zip(elements, expected_results):
        metadata = element.metadata
        actual_links = [
            {"text": text, "url": url, "start_index": start_index}
            for text, url, start_index in zip(
                metadata.link_texts, metadata.link_urls, metadata.link_start_indexes
            )
        ]
        # -- whole-list comparison also fails when a link is missing or an extra one appears --
        assert actual_links == expected_links

View File

@ -82,6 +82,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
-1
],
"link_texts": [
"Weather.gov"
],
@ -110,6 +113,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
-1
],
"link_texts": [
"News Around NOAA"
],
@ -138,6 +144,29 @@
"languages": [
"eng"
],
"link_start_indexes": [
0,
14,
25,
38,
42,
52,
59,
65,
68,
72,
83,
100,
112,
124,
137,
164,
190,
197,
204,
212,
216
],
"link_texts": [
"Weather Safety",
"Air Quality",
@ -206,6 +235,15 @@
"languages": [
"eng"
],
"link_start_indexes": [
0,
16,
41,
57,
79,
104,
128
],
"link_texts": [
"Safety Campaigns",
"Seasonal Safety Campaigns",
@ -246,6 +284,21 @@
"languages": [
"eng"
],
"link_start_indexes": [
0,
10,
31,
51,
76,
89,
94,
117,
133,
141,
149,
169,
177
],
"link_texts": [
"Ambassador",
"About WRN Ambassadors",
@ -298,6 +351,19 @@
"languages": [
"eng"
],
"link_start_indexes": [
0,
9,
27,
47,
61,
86,
104,
118,
127,
150,
165
],
"link_texts": [
"Education",
"NWS Education Home",
@ -346,6 +412,18 @@
"languages": [
"eng"
],
"link_start_indexes": [
0,
13,
26,
38,
55,
75,
85,
97,
125,
168
],
"link_texts": [
"Collaboration",
"Get Involved ",
@ -392,6 +470,13 @@
"languages": [
"eng"
],
"link_start_indexes": [
0,
14,
25,
33,
53
],
"link_texts": [
" News & Events",
"Latest News",
@ -428,6 +513,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
0
],
"link_texts": [
"International"
],
@ -456,6 +544,19 @@
"languages": [
"eng"
],
"link_start_indexes": [
0,
5,
15,
28,
36,
48,
69,
82,
89,
103,
120
],
"link_texts": [
"About",
"Contact Us",
@ -604,6 +705,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
167
],
"link_texts": [
"Spring Safety website"
],
@ -632,6 +736,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
303
],
"link_texts": [
"infographics"
],
@ -682,6 +789,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
-1
],
"link_texts": [
"US Dept of Commerce"
],
@ -710,6 +820,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
-1
],
"link_texts": [
"National Oceanic and Atmospheric Administration"
],
@ -738,6 +851,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
-1
],
"link_texts": [
"National Weather Service"
],
@ -832,6 +948,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
-1
],
"link_texts": [
"Comments? Questions? Please Contact Us."
],
@ -860,6 +979,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
-1
],
"link_texts": [
"Disclaimer"
],
@ -888,6 +1010,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
-1
],
"link_texts": [
"Information Quality"
],
@ -916,6 +1041,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
-1
],
"link_texts": [
"Help"
],
@ -944,6 +1072,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
-1
],
"link_texts": [
"Glossary"
],
@ -972,6 +1103,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
-1
],
"link_texts": [
"Privacy Policy"
],
@ -1000,6 +1134,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
-1
],
"link_texts": [
"Freedom of Information Act (FOIA)"
],
@ -1028,6 +1165,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
-1
],
"link_texts": [
"About Us"
],
@ -1056,6 +1196,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
-1
],
"link_texts": [
"Career Opportunities"
],

View File

@ -122,6 +122,11 @@
"languages": [
"eng"
],
"link_start_indexes": [
43,
49,
68
],
"link_texts": [
"OKRs",
"project plans",
@ -248,6 +253,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
186
],
"link_texts": [
null
],
@ -342,6 +350,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
0
],
"link_texts": [
null
],
@ -630,6 +641,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
4
],
"link_texts": [
"user profiles"
],
@ -686,6 +700,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
27
],
"link_texts": [
"blog posts"
],
@ -742,6 +759,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
53
],
"link_texts": [
"content report table"
],

View File

@ -60,6 +60,11 @@
"languages": [
"eng"
],
"link_start_indexes": [
32,
55,
79
],
"link_texts": [
"Template - Project plan",
"Template - Meeting notes",
@ -92,6 +97,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
10
],
"link_texts": [
"Get the most out of your team space"
],
@ -586,6 +594,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
0
],
"link_texts": [
null
],

View File

@ -60,6 +60,11 @@
"languages": [
"eng"
],
"link_start_indexes": [
32,
55,
79
],
"link_texts": [
"Template - Project plan",
"Template - Meeting notes",
@ -92,6 +97,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
10
],
"link_texts": [
"Get the most out of your team space"
],
@ -586,6 +594,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
0
],
"link_texts": [
null
],

View File

@ -454,6 +454,9 @@
"eng",
"fra"
],
"link_start_indexes": [
0
],
"link_texts": [
"This is the link for unstructured . io."
],

View File

@ -454,6 +454,9 @@
"eng",
"fra"
],
"link_start_indexes": [
0
],
"link_texts": [
"This is the link for unstructured . io."
],

View File

@ -36,6 +36,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
27
],
"link_texts": [
"Github Project Page"
],

View File

@ -320,6 +320,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
29
],
"link_texts": [
"\n Notion Editor 101\n "
],
@ -342,6 +345,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
-1
],
"link_texts": [
"\n Notion Editor 101\n "
],

View File

@ -80,6 +80,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
85
],
"link_texts": [
"\n text\n "
],
@ -102,6 +105,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
-1
],
"link_texts": [
"\n text\n "
],
@ -381,6 +387,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
-1
],
"link_texts": [
"\n https://www.notion.so/icons/airplane_brown.svg\n "
],
@ -403,6 +412,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
-1
],
"link_texts": [
"\n https://www.wikipedia.org/\n "
],
@ -425,6 +437,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
-1
],
"link_texts": [
"\n https://play-lh.googleusercontent.com/KwUBNPbMTk9jDXYS2AeX3illtVRTkrKVh5xR1Mg4WHd0CG2tV4mrh1z3kXi5z_warlk\n "
],
@ -463,6 +478,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
-1
],
"link_texts": [
"\n Analytics\n "
],
@ -501,6 +519,9 @@
"languages": [
"eng"
],
"link_start_indexes": [
-1
],
"link_texts": [
"\n Untitled\n "
],

View File

@ -1,3 +1,3 @@
metric average sample_sd population_sd count
cct-accuracy 0.809 0.24 0.233 17
cct-%missing 0.025 0.032 0.031 17
cct-accuracy 0.811 0.239 0.232 17
cct-%missing 0.024 0.032 0.031 17

1 metric average sample_sd population_sd count
2 cct-accuracy 0.809 0.811 0.24 0.239 0.233 0.232 17
3 cct-%missing 0.025 0.024 0.032 0.031 17

View File

@ -5,7 +5,7 @@ stanley-cups.xlsx xlsx Sharepoint 0.778 0.0
Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf pdf azure 0.981 0.005
IRS-form-1987.pdf pdf azure 0.794 0.135
spring-weather.html html azure 0.0 0.018
example-10k.html html local 0.727 0.037
example-10k.html html local 0.754 0.027
fake-html-cp1252.html html local 0.659 0.0
ideas-page.html html local 0.93 0.033
UDHR_first_article_all.txt txt local-single-file 0.995 0.0

1 filename doctype connector cct-accuracy cct-%missing
5 Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf pdf azure 0.981 0.005
6 IRS-form-1987.pdf pdf azure 0.794 0.135
7 spring-weather.html html azure 0.0 0.018
8 example-10k.html html local 0.727 0.754 0.037 0.027
9 fake-html-cp1252.html html local 0.659 0.0
10 ideas-page.html html local 0.93 0.033
11 UDHR_first_article_all.txt txt local-single-file 0.995 0.0

View File

@ -188,6 +188,7 @@ class ElementMetadata:
last_modified: Optional[str]
link_texts: Optional[list[str]]
link_urls: Optional[list[str]]
link_start_indexes: Optional[list[int]]
links: Optional[list[Link]]
# -- used in chunks only, allowing access to element(s) chunk was formed from when enabled --
orig_elements: Optional[list[Element]]
@ -235,6 +236,7 @@ class ElementMetadata:
last_modified: Optional[str] = None,
link_texts: Optional[list[str]] = None,
link_urls: Optional[list[str]] = None,
link_start_indexes: Optional[list[int]] = None,
links: Optional[list[Link]] = None,
orig_elements: Optional[list[Element]] = None,
page_name: Optional[str] = None,
@ -274,6 +276,7 @@ class ElementMetadata:
self.last_modified = last_modified
self.link_texts = link_texts
self.link_urls = link_urls
self.link_start_indexes = link_start_indexes
self.links = links
self.orig_elements = orig_elements
self.page_name = page_name
@ -485,6 +488,7 @@ class ConsolidationStrategy(enum.Enum):
"last_modified": cls.FIRST,
"link_texts": cls.LIST_CONCATENATE,
"link_urls": cls.LIST_CONCATENATE,
"link_start_indexes": cls.DROP,
"links": cls.DROP, # -- deprecated field --
"max_characters": cls.DROP, # -- unused, remove from ElementMetadata --
"orig_elements": cls.DROP, # -- not expected, added by chunking, not before --

View File

@ -14,7 +14,10 @@ else:
from lxml import etree
from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
from unstructured.cleaners.core import (
clean_bullets,
replace_unicode_quotes,
)
from unstructured.documents.base import Page
from unstructured.documents.elements import (
Address,
@ -296,14 +299,22 @@ class HTMLDocument(XMLDocument):
def _get_links_from_tag(tag_elem: etree._Element) -> List[Link]:
    """Hyperlinks within and below `tag_elem`.

    When `tag_elem` itself carries an `href`, exactly one link is returned: its text is whatever
    `_construct_text(tag_elem, False)` produces and its `start_index` is -1, since the link is the
    element rather than a span inside it. Descendants are not inspected in that case.

    Otherwise one link is returned for each descendant that has a non-empty `href`, in document
    order. Its text is the descendant's own `.text` (which may be `None`) and its `start_index` is
    a running character count of the text seen before it: the left-stripped leading text of
    `tag_elem`, plus the `.text` and `.tail` of every earlier descendant that is not
    whitespace-only.
    """
    links: List[Link] = []

    tag_elem_href = tag_elem.get("href")
    if tag_elem_href:
        tag_elem_text = _construct_text(tag_elem, False)
        links.append({"text": tag_elem_text, "url": tag_elem_href, "start_index": -1})
        return links

    # -- leading whitespace of the element's own text is not counted toward the offset --
    start_index = len(tag_elem.text.lstrip()) if tag_elem.text else 0

    for tag in tag_elem.iterdescendants():
        href = tag.get("href")
        if href:
            # -- record the offset before this tag's own text is added to the running count --
            links.append({"text": tag.text, "url": href, "start_index": start_index})

        # -- whitespace-only segments are skipped; other segments count at full length --
        # NOTE(review): segments are counted unstripped, so offsets presumably assume the
        # element text keeps the same inner whitespace -- confirm against the text cleaning.
        if tag.text and not tag.text.isspace():
            start_index += len(tag.text)
        if tag.tail and not tag.tail.isspace():
            start_index += len(tag.tail)

    return links

View File

@ -294,6 +294,7 @@ def add_element_metadata(
links = element.links if hasattr(element, "links") and len(element.links) > 0 else None
link_urls = [link.get("url") for link in links] if links else None
link_texts = [link.get("text") for link in links] if links else None
link_start_indexes = [link.get("start_index") for link in links] if links else None
emphasized_texts = (
element.emphasized_texts
if hasattr(element, "emphasized_texts") and len(element.emphasized_texts) > 0
@ -320,6 +321,7 @@ def add_element_metadata(
text_as_html=text_as_html,
link_urls=link_urls,
link_texts=link_texts,
link_start_indexes=link_start_indexes,
emphasized_text_contents=emphasized_text_contents,
emphasized_text_tags=emphasized_text_tags,
section=section,