mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-25 06:04:53 +00:00
add support for start_index in html links extraction (#2600)
add support for start_index in html links extraction (closes #2625) Testing ``` from unstructured.partition.html import partition_html from unstructured.staging.base import elements_to_json html_text = """<html> <p>Hello there I am a <a href="/link">very important link!</a></p> <p>Here is a list of my favorite things</p> <ul> <li><a href="https://en.wikipedia.org/wiki/Parrot">Parrots</a></li> <li>Dogs</li> </ul> <a href="/loner">A lone link!</a> </html>""" elements = partition_html(text=html_text) print(elements_to_json(elements)) ``` --------- Co-authored-by: Michael Niestroj <michael.niestroj@unblu.com> Co-authored-by: christinestraub <christinemstraub@gmail.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: christinestraub <christinestraub@users.noreply.github.com> Co-authored-by: Ronny H <138828701+ron-unstructured@users.noreply.github.com>
This commit is contained in:
parent
3e643c4cb3
commit
0506aff788
@ -2,6 +2,7 @@
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **Add support for `start_index` in `html` links extraction**
|
||||
* **Add `strategy` arg value to `_PptxPartitionerOptions`.** This makes this partitioning option available for sub-partitioners to come that may optionally use inference or other expensive operations to improve the partitioning.
|
||||
* **Support pluggable sub-partitioner for PPTX Picture shapes.** Use a distinct sub-partitioner for partitioning PPTX Picture (image) shapes and allow the default picture sub-partitioner to be replaced at run-time by one of the user's choosing.
|
||||
|
||||
|
||||
@ -353,6 +353,49 @@ def test_get_emphasized_texts_from_tag(doc: str, root: str, expected: List[Dict[
|
||||
assert emphasized_texts == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("doc", "root", "expected"),
|
||||
[
|
||||
(
|
||||
"<a href='/loner'>A lone link!</a>",
|
||||
"a",
|
||||
[{"text": "A lone link!", "url": "/loner", "start_index": -1}],
|
||||
),
|
||||
(
|
||||
"<ul><li><a href='/wiki/Parrot'>Parrots</a></li><li>Dogs</li></ul>",
|
||||
"ul",
|
||||
[{"text": "Parrots", "url": "/wiki/Parrot", "start_index": 0}],
|
||||
),
|
||||
(
|
||||
"<ul><li><a href='/parrot'>Parrots</a></li><li><a href='/dog'>Dogs</a></li></ul>",
|
||||
"ul",
|
||||
[
|
||||
{"text": "Parrots", "url": "/parrot", "start_index": 0},
|
||||
{"text": "Dogs", "url": "/dog", "start_index": 7},
|
||||
],
|
||||
),
|
||||
(
|
||||
"<div>Here is <p>P tag</p> tail text. <a href='/link'>link!</a></div>",
|
||||
"div",
|
||||
[{"text": "link!", "url": "/link", "start_index": 25}],
|
||||
),
|
||||
(
|
||||
"<div>Here is <p>P tag</p><a href='/link'>link!</a></div>",
|
||||
"div",
|
||||
[{"text": "link!", "url": "/link", "start_index": 13}],
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_get_links_from_tag(doc: str, root: str, expected: List[Dict[str, str]]):
|
||||
document_tree = etree.fromstring(doc, etree.HTMLParser())
|
||||
el = document_tree.find(f".//{root}")
|
||||
assert el is not None
|
||||
|
||||
links = html._get_links_from_tag(el)
|
||||
|
||||
assert links == expected
|
||||
|
||||
|
||||
def test_parse_nothing():
|
||||
doc = """<p></p>"""
|
||||
document_tree = etree.fromstring(doc, etree.HTMLParser())
|
||||
|
||||
@ -769,3 +769,39 @@ def test_partition_html_tag_tail_parsing():
|
||||
element_text = "|".join([str(el).strip() for el in elements])
|
||||
|
||||
assert element_text == "Head|Nested|Tail"
|
||||
|
||||
|
||||
def test_partition_html_links():
|
||||
html_text = """<html>
|
||||
<a href="/loner">A lone link!</a>
|
||||
<p>Hello <a href="/link">link!</a></p>
|
||||
<p>\n Hello <a href="/link">link!</a></p>
|
||||
<p><a href="/wiki/parrots">Parrots</a> and <a href="/wiki/dogs">Dogs</a></p>
|
||||
</html>"""
|
||||
|
||||
expected_results = [
|
||||
[
|
||||
{"text": "A lone link!", "url": "/loner", "start_index": -1},
|
||||
],
|
||||
[
|
||||
{"text": "link!", "url": "/link", "start_index": 6},
|
||||
],
|
||||
[
|
||||
{"text": "link!", "url": "/link", "start_index": 6},
|
||||
],
|
||||
[
|
||||
{"text": "Parrots", "url": "/wiki/parrots", "start_index": 0},
|
||||
{"text": "Dogs", "url": "/wiki/dogs", "start_index": 12},
|
||||
],
|
||||
]
|
||||
|
||||
elements = partition_html(text=html_text)
|
||||
|
||||
for el_idx, el in enumerate(elements):
|
||||
expected_result = expected_results[el_idx]
|
||||
for link_idx, (text, url, start_index) in enumerate(
|
||||
zip(el.metadata.link_texts, el.metadata.link_urls, el.metadata.link_start_indexes)
|
||||
):
|
||||
assert text == expected_result[link_idx]["text"]
|
||||
assert url == expected_result[link_idx]["url"]
|
||||
assert start_index == expected_result[link_idx]["start_index"]
|
||||
|
||||
@ -82,6 +82,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
-1
|
||||
],
|
||||
"link_texts": [
|
||||
"Weather.gov"
|
||||
],
|
||||
@ -110,6 +113,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
-1
|
||||
],
|
||||
"link_texts": [
|
||||
"News Around NOAA"
|
||||
],
|
||||
@ -138,6 +144,29 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
0,
|
||||
14,
|
||||
25,
|
||||
38,
|
||||
42,
|
||||
52,
|
||||
59,
|
||||
65,
|
||||
68,
|
||||
72,
|
||||
83,
|
||||
100,
|
||||
112,
|
||||
124,
|
||||
137,
|
||||
164,
|
||||
190,
|
||||
197,
|
||||
204,
|
||||
212,
|
||||
216
|
||||
],
|
||||
"link_texts": [
|
||||
"Weather Safety",
|
||||
"Air Quality",
|
||||
@ -206,6 +235,15 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
0,
|
||||
16,
|
||||
41,
|
||||
57,
|
||||
79,
|
||||
104,
|
||||
128
|
||||
],
|
||||
"link_texts": [
|
||||
"Safety Campaigns",
|
||||
"Seasonal Safety Campaigns",
|
||||
@ -246,6 +284,21 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
0,
|
||||
10,
|
||||
31,
|
||||
51,
|
||||
76,
|
||||
89,
|
||||
94,
|
||||
117,
|
||||
133,
|
||||
141,
|
||||
149,
|
||||
169,
|
||||
177
|
||||
],
|
||||
"link_texts": [
|
||||
"Ambassador",
|
||||
"About WRN Ambassadors",
|
||||
@ -298,6 +351,19 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
0,
|
||||
9,
|
||||
27,
|
||||
47,
|
||||
61,
|
||||
86,
|
||||
104,
|
||||
118,
|
||||
127,
|
||||
150,
|
||||
165
|
||||
],
|
||||
"link_texts": [
|
||||
"Education",
|
||||
"NWS Education Home",
|
||||
@ -346,6 +412,18 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
0,
|
||||
13,
|
||||
26,
|
||||
38,
|
||||
55,
|
||||
75,
|
||||
85,
|
||||
97,
|
||||
125,
|
||||
168
|
||||
],
|
||||
"link_texts": [
|
||||
"Collaboration",
|
||||
"Get Involved ",
|
||||
@ -392,6 +470,13 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
0,
|
||||
14,
|
||||
25,
|
||||
33,
|
||||
53
|
||||
],
|
||||
"link_texts": [
|
||||
" News & Events",
|
||||
"Latest News",
|
||||
@ -428,6 +513,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
0
|
||||
],
|
||||
"link_texts": [
|
||||
"International"
|
||||
],
|
||||
@ -456,6 +544,19 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
0,
|
||||
5,
|
||||
15,
|
||||
28,
|
||||
36,
|
||||
48,
|
||||
69,
|
||||
82,
|
||||
89,
|
||||
103,
|
||||
120
|
||||
],
|
||||
"link_texts": [
|
||||
"About",
|
||||
"Contact Us",
|
||||
@ -604,6 +705,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
167
|
||||
],
|
||||
"link_texts": [
|
||||
"Spring Safety website"
|
||||
],
|
||||
@ -632,6 +736,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
303
|
||||
],
|
||||
"link_texts": [
|
||||
"infographics"
|
||||
],
|
||||
@ -682,6 +789,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
-1
|
||||
],
|
||||
"link_texts": [
|
||||
"US Dept of Commerce"
|
||||
],
|
||||
@ -710,6 +820,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
-1
|
||||
],
|
||||
"link_texts": [
|
||||
"National Oceanic and Atmospheric Administration"
|
||||
],
|
||||
@ -738,6 +851,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
-1
|
||||
],
|
||||
"link_texts": [
|
||||
"National Weather Service"
|
||||
],
|
||||
@ -832,6 +948,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
-1
|
||||
],
|
||||
"link_texts": [
|
||||
"Comments? Questions? Please Contact Us."
|
||||
],
|
||||
@ -860,6 +979,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
-1
|
||||
],
|
||||
"link_texts": [
|
||||
"Disclaimer"
|
||||
],
|
||||
@ -888,6 +1010,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
-1
|
||||
],
|
||||
"link_texts": [
|
||||
"Information Quality"
|
||||
],
|
||||
@ -916,6 +1041,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
-1
|
||||
],
|
||||
"link_texts": [
|
||||
"Help"
|
||||
],
|
||||
@ -944,6 +1072,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
-1
|
||||
],
|
||||
"link_texts": [
|
||||
"Glossary"
|
||||
],
|
||||
@ -972,6 +1103,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
-1
|
||||
],
|
||||
"link_texts": [
|
||||
"Privacy Policy"
|
||||
],
|
||||
@ -1000,6 +1134,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
-1
|
||||
],
|
||||
"link_texts": [
|
||||
"Freedom of Information Act (FOIA)"
|
||||
],
|
||||
@ -1028,6 +1165,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
-1
|
||||
],
|
||||
"link_texts": [
|
||||
"About Us"
|
||||
],
|
||||
@ -1056,6 +1196,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
-1
|
||||
],
|
||||
"link_texts": [
|
||||
"Career Opportunities"
|
||||
],
|
||||
|
||||
@ -122,6 +122,11 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
43,
|
||||
49,
|
||||
68
|
||||
],
|
||||
"link_texts": [
|
||||
"OKRs",
|
||||
"project plans",
|
||||
@ -248,6 +253,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
186
|
||||
],
|
||||
"link_texts": [
|
||||
null
|
||||
],
|
||||
@ -342,6 +350,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
0
|
||||
],
|
||||
"link_texts": [
|
||||
null
|
||||
],
|
||||
@ -630,6 +641,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
4
|
||||
],
|
||||
"link_texts": [
|
||||
"user profiles"
|
||||
],
|
||||
@ -686,6 +700,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
27
|
||||
],
|
||||
"link_texts": [
|
||||
"blog posts"
|
||||
],
|
||||
@ -742,6 +759,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
53
|
||||
],
|
||||
"link_texts": [
|
||||
"content report table"
|
||||
],
|
||||
|
||||
@ -60,6 +60,11 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
32,
|
||||
55,
|
||||
79
|
||||
],
|
||||
"link_texts": [
|
||||
"Template - Project plan",
|
||||
"Template - Meeting notes",
|
||||
@ -92,6 +97,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
10
|
||||
],
|
||||
"link_texts": [
|
||||
"Get the most out of your team space"
|
||||
],
|
||||
@ -586,6 +594,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
0
|
||||
],
|
||||
"link_texts": [
|
||||
null
|
||||
],
|
||||
|
||||
@ -60,6 +60,11 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
32,
|
||||
55,
|
||||
79
|
||||
],
|
||||
"link_texts": [
|
||||
"Template - Project plan",
|
||||
"Template - Meeting notes",
|
||||
@ -92,6 +97,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
10
|
||||
],
|
||||
"link_texts": [
|
||||
"Get the most out of your team space"
|
||||
],
|
||||
@ -586,6 +594,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
0
|
||||
],
|
||||
"link_texts": [
|
||||
null
|
||||
],
|
||||
|
||||
@ -454,6 +454,9 @@
|
||||
"eng",
|
||||
"fra"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
0
|
||||
],
|
||||
"link_texts": [
|
||||
"This is the link for unstructured . io."
|
||||
],
|
||||
|
||||
@ -454,6 +454,9 @@
|
||||
"eng",
|
||||
"fra"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
0
|
||||
],
|
||||
"link_texts": [
|
||||
"This is the link for unstructured . io."
|
||||
],
|
||||
|
||||
@ -36,6 +36,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
27
|
||||
],
|
||||
"link_texts": [
|
||||
"Github Project Page"
|
||||
],
|
||||
|
||||
@ -320,6 +320,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
29
|
||||
],
|
||||
"link_texts": [
|
||||
"\n Notion Editor 101\n "
|
||||
],
|
||||
@ -342,6 +345,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
-1
|
||||
],
|
||||
"link_texts": [
|
||||
"\n Notion Editor 101\n "
|
||||
],
|
||||
|
||||
@ -80,6 +80,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
85
|
||||
],
|
||||
"link_texts": [
|
||||
"\n text\n "
|
||||
],
|
||||
@ -102,6 +105,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
-1
|
||||
],
|
||||
"link_texts": [
|
||||
"\n text\n "
|
||||
],
|
||||
@ -381,6 +387,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
-1
|
||||
],
|
||||
"link_texts": [
|
||||
"\n https://www.notion.so/icons/airplane_brown.svg\n "
|
||||
],
|
||||
@ -403,6 +412,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
-1
|
||||
],
|
||||
"link_texts": [
|
||||
"\n https://www.wikipedia.org/\n "
|
||||
],
|
||||
@ -425,6 +437,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
-1
|
||||
],
|
||||
"link_texts": [
|
||||
"\n https://play-lh.googleusercontent.com/KwUBNPbMTk9jDXYS2AeX3illtVRTkrKVh5xR1Mg4WHd0CG2tV4mrh1z3kXi5z_warlk\n "
|
||||
],
|
||||
@ -463,6 +478,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
-1
|
||||
],
|
||||
"link_texts": [
|
||||
"\n Analytics\n "
|
||||
],
|
||||
@ -501,6 +519,9 @@
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"link_start_indexes": [
|
||||
-1
|
||||
],
|
||||
"link_texts": [
|
||||
"\n Untitled\n "
|
||||
],
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
metric average sample_sd population_sd count
|
||||
cct-accuracy 0.809 0.24 0.233 17
|
||||
cct-%missing 0.025 0.032 0.031 17
|
||||
cct-accuracy 0.811 0.239 0.232 17
|
||||
cct-%missing 0.024 0.032 0.031 17
|
||||
|
||||
|
@ -5,7 +5,7 @@ stanley-cups.xlsx xlsx Sharepoint 0.778 0.0
|
||||
Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf pdf azure 0.981 0.005
|
||||
IRS-form-1987.pdf pdf azure 0.794 0.135
|
||||
spring-weather.html html azure 0.0 0.018
|
||||
example-10k.html html local 0.727 0.037
|
||||
example-10k.html html local 0.754 0.027
|
||||
fake-html-cp1252.html html local 0.659 0.0
|
||||
ideas-page.html html local 0.93 0.033
|
||||
UDHR_first_article_all.txt txt local-single-file 0.995 0.0
|
||||
|
||||
|
@ -188,6 +188,7 @@ class ElementMetadata:
|
||||
last_modified: Optional[str]
|
||||
link_texts: Optional[list[str]]
|
||||
link_urls: Optional[list[str]]
|
||||
link_start_indexes: Optional[list[int]]
|
||||
links: Optional[list[Link]]
|
||||
# -- used in chunks only, allowing access to element(s) chunk was formed from when enabled --
|
||||
orig_elements: Optional[list[Element]]
|
||||
@ -235,6 +236,7 @@ class ElementMetadata:
|
||||
last_modified: Optional[str] = None,
|
||||
link_texts: Optional[list[str]] = None,
|
||||
link_urls: Optional[list[str]] = None,
|
||||
link_start_indexes: Optional[list[int]] = None,
|
||||
links: Optional[list[Link]] = None,
|
||||
orig_elements: Optional[list[Element]] = None,
|
||||
page_name: Optional[str] = None,
|
||||
@ -274,6 +276,7 @@ class ElementMetadata:
|
||||
self.last_modified = last_modified
|
||||
self.link_texts = link_texts
|
||||
self.link_urls = link_urls
|
||||
self.link_start_indexes = link_start_indexes
|
||||
self.links = links
|
||||
self.orig_elements = orig_elements
|
||||
self.page_name = page_name
|
||||
@ -485,6 +488,7 @@ class ConsolidationStrategy(enum.Enum):
|
||||
"last_modified": cls.FIRST,
|
||||
"link_texts": cls.LIST_CONCATENATE,
|
||||
"link_urls": cls.LIST_CONCATENATE,
|
||||
"link_start_indexes": cls.DROP,
|
||||
"links": cls.DROP, # -- deprecated field --
|
||||
"max_characters": cls.DROP, # -- unused, remove from ElementMetadata --
|
||||
"orig_elements": cls.DROP, # -- not expected, added by chunking, not before --
|
||||
|
||||
@ -14,7 +14,10 @@ else:
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
|
||||
from unstructured.cleaners.core import (
|
||||
clean_bullets,
|
||||
replace_unicode_quotes,
|
||||
)
|
||||
from unstructured.documents.base import Page
|
||||
from unstructured.documents.elements import (
|
||||
Address,
|
||||
@ -296,14 +299,22 @@ class HTMLDocument(XMLDocument):
|
||||
def _get_links_from_tag(tag_elem: etree._Element) -> List[Link]:
|
||||
"""Hyperlinks within and below `tag_elem`."""
|
||||
links: List[Link] = []
|
||||
href = tag_elem.get("href")
|
||||
# TODO(klaijan) - add html href start_index
|
||||
if href:
|
||||
links.append({"text": tag_elem.text, "url": href, "start_index": -1})
|
||||
for tag in tag_elem.iterdescendants():
|
||||
href = tag.get("href")
|
||||
if href:
|
||||
links.append({"text": tag.text, "url": href, "start_index": -1})
|
||||
tag_elem_href = tag_elem.get("href")
|
||||
if tag_elem_href:
|
||||
tag_elem_text = _construct_text(tag_elem, False)
|
||||
links.append({"text": tag_elem_text, "url": tag_elem_href, "start_index": -1})
|
||||
else:
|
||||
start_index = len(tag_elem.text.lstrip()) if tag_elem.text else 0
|
||||
for tag in tag_elem.iterdescendants():
|
||||
href = tag.get("href")
|
||||
if href:
|
||||
links.append({"text": tag.text, "url": href, "start_index": start_index})
|
||||
|
||||
if tag.text and not (tag.text.isspace()):
|
||||
start_index = start_index + len(tag.text)
|
||||
if tag.tail and not (tag.tail.isspace()):
|
||||
start_index = start_index + len(tag.tail)
|
||||
|
||||
return links
|
||||
|
||||
|
||||
|
||||
@ -294,6 +294,7 @@ def add_element_metadata(
|
||||
links = element.links if hasattr(element, "links") and len(element.links) > 0 else None
|
||||
link_urls = [link.get("url") for link in links] if links else None
|
||||
link_texts = [link.get("text") for link in links] if links else None
|
||||
link_start_indexes = [link.get("start_index") for link in links] if links else None
|
||||
emphasized_texts = (
|
||||
element.emphasized_texts
|
||||
if hasattr(element, "emphasized_texts") and len(element.emphasized_texts) > 0
|
||||
@ -320,6 +321,7 @@ def add_element_metadata(
|
||||
text_as_html=text_as_html,
|
||||
link_urls=link_urls,
|
||||
link_texts=link_texts,
|
||||
link_start_indexes=link_start_indexes,
|
||||
emphasized_text_contents=emphasized_text_contents,
|
||||
emphasized_text_tags=emphasized_text_tags,
|
||||
section=section,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user