add support for start_index in html links extraction (#2600)

add support for start_index in html links extraction (closes #2625) Testing ``` from unstructured.partition.html import partition_html from unstructured.staging.base import elements_to_json html_text = """<html> <p>Hello there I am a <a href="/link">very important link!</a></p> <p>Here is a list of my favorite things</p> <ul> <li><a href="https://en.wikipedia.org/wiki/Parrot">Parrots</a></li> <li>Dogs</li> </ul> <a href="/loner">A lone link!</a> </html>""" elements = partition_html(text=html_text) print(elements_to_json(elements)) ``` --------- Co-authored-by: Michael Niestroj <michael.niestroj@unblu.com> Co-authored-by: christinestraub <christinemstraub@gmail.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: christinestraub <christinestraub@users.noreply.github.com> Co-authored-by: Ronny H <138828701+ron-unstructured@users.noreply.github.com>
2025-12-26 14:45:31 +00:00 · 2024-04-12 08:14:20 +02:00 · 2024-04-12 08:14:20 +02:00 · 0506aff788
commit 0506aff788
parent 3e643c4cb3
17 changed files with 330 additions and 12 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -2,6 +2,7 @@

 ### Enhancements

+* **Add support for `start_index` in `html` links extraction**
 * **Add `strategy` arg value to `_PptxPartitionerOptions`.** This makes this partitioning option available for sub-partitioners to come that may optionally use inference or other expensive operations to improve the partitioning.
 * **Support pluggable sub-partitioner for PPTX Picture shapes.** Use a distinct sub-partitioner for partitioning PPTX Picture (image) shapes and allow the default picture sub-partitioner to be replaced at run-time by one of the user's choosing.

--- a/test_unstructured/documents/test_html.py
+++ b/test_unstructured/documents/test_html.py
@ -353,6 +353,49 @@ def test_get_emphasized_texts_from_tag(doc: str, root: str, expected: List[Dict[
    assert emphasized_texts == expected


+@pytest.mark.parametrize(
+    ("doc", "root", "expected"),
+    [
+        (
+            "<a href='/loner'>A lone link!</a>",
+            "a",
+            [{"text": "A lone link!", "url": "/loner", "start_index": -1}],
+        ),
+        (
+            "<ul><li><a href='/wiki/Parrot'>Parrots</a></li><li>Dogs</li></ul>",
+            "ul",
+            [{"text": "Parrots", "url": "/wiki/Parrot", "start_index": 0}],
+        ),
+        (
+            "<ul><li><a href='/parrot'>Parrots</a></li><li><a href='/dog'>Dogs</a></li></ul>",
+            "ul",
+            [
+                {"text": "Parrots", "url": "/parrot", "start_index": 0},
+                {"text": "Dogs", "url": "/dog", "start_index": 7},
+            ],
+        ),
+        (
+            "<div>Here is <p>P tag</p> tail text. <a href='/link'>link!</a></div>",
+            "div",
+            [{"text": "link!", "url": "/link", "start_index": 25}],
+        ),
+        (
+            "<div>Here is <p>P tag</p><a href='/link'>link!</a></div>",
+            "div",
+            [{"text": "link!", "url": "/link", "start_index": 13}],
+        ),
+    ],
+)
+def test_get_links_from_tag(doc: str, root: str, expected: List[Dict[str, str]]):
+    document_tree = etree.fromstring(doc, etree.HTMLParser())
+    el = document_tree.find(f".//{root}")
+    assert el is not None
+
+    links = html._get_links_from_tag(el)
+
+    assert links == expected
+
+
 def test_parse_nothing():
    doc = """<p></p>"""
    document_tree = etree.fromstring(doc, etree.HTMLParser())
--- a/test_unstructured/partition/test_html_partition.py
+++ b/test_unstructured/partition/test_html_partition.py
@ -769,3 +769,39 @@ def test_partition_html_tag_tail_parsing():
    element_text = "|".join([str(el).strip() for el in elements])

    assert element_text == "Head|Nested|Tail"
+
+
+def test_partition_html_links():
+    html_text = """<html>
+        <a href="/loner">A lone link!</a>
+        <p>Hello <a href="/link">link!</a></p>
+        <p>\n    Hello <a href="/link">link!</a></p>
+        <p><a href="/wiki/parrots">Parrots</a> and <a href="/wiki/dogs">Dogs</a></p>
+    </html>"""
+
+    expected_results = [
+        [
+            {"text": "A lone link!", "url": "/loner", "start_index": -1},
+        ],
+        [
+            {"text": "link!", "url": "/link", "start_index": 6},
+        ],
+        [
+            {"text": "link!", "url": "/link", "start_index": 6},
+        ],
+        [
+            {"text": "Parrots", "url": "/wiki/parrots", "start_index": 0},
+            {"text": "Dogs", "url": "/wiki/dogs", "start_index": 12},
+        ],
+    ]
+
+    elements = partition_html(text=html_text)
+
+    for el_idx, el in enumerate(elements):
+        expected_result = expected_results[el_idx]
+        for link_idx, (text, url, start_index) in enumerate(
+            zip(el.metadata.link_texts, el.metadata.link_urls, el.metadata.link_start_indexes)
+        ):
+            assert text == expected_result[link_idx]["text"]
+            assert url == expected_result[link_idx]["url"]
+            assert start_index == expected_result[link_idx]["start_index"]
--- a/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json
+++ b/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json
@ -82,6 +82,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        -1
+      ],
      "link_texts": [
        "Weather.gov"
      ],
@ -110,6 +113,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        -1
+      ],
      "link_texts": [
        "News Around NOAA"
      ],
@ -138,6 +144,29 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        0,
+        14,
+        25,
+        38,
+        42,
+        52,
+        59,
+        65,
+        68,
+        72,
+        83,
+        100,
+        112,
+        124,
+        137,
+        164,
+        190,
+        197,
+        204,
+        212,
+        216
+      ],
      "link_texts": [
        "Weather Safety",
        "Air Quality",
@ -206,6 +235,15 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        0,
+        16,
+        41,
+        57,
+        79,
+        104,
+        128
+      ],
      "link_texts": [
        "Safety Campaigns",
        "Seasonal Safety Campaigns",
@ -246,6 +284,21 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        0,
+        10,
+        31,
+        51,
+        76,
+        89,
+        94,
+        117,
+        133,
+        141,
+        149,
+        169,
+        177
+      ],
      "link_texts": [
        "Ambassador",
        "About WRN Ambassadors",
@ -298,6 +351,19 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        0,
+        9,
+        27,
+        47,
+        61,
+        86,
+        104,
+        118,
+        127,
+        150,
+        165
+      ],
      "link_texts": [
        "Education",
        "NWS Education Home",
@ -346,6 +412,18 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        0,
+        13,
+        26,
+        38,
+        55,
+        75,
+        85,
+        97,
+        125,
+        168
+      ],
      "link_texts": [
        "Collaboration",
        "Get Involved ",
@ -392,6 +470,13 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        0,
+        14,
+        25,
+        33,
+        53
+      ],
      "link_texts": [
        " News & Events",
        "Latest News",
@ -428,6 +513,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        0
+      ],
      "link_texts": [
        "International"
      ],
@ -456,6 +544,19 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        0,
+        5,
+        15,
+        28,
+        36,
+        48,
+        69,
+        82,
+        89,
+        103,
+        120
+      ],
      "link_texts": [
        "About",
        "Contact Us",
@ -604,6 +705,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        167
+      ],
      "link_texts": [
        "Spring Safety website"
      ],
@ -632,6 +736,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        303
+      ],
      "link_texts": [
        "infographics"
      ],
@ -682,6 +789,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        -1
+      ],
      "link_texts": [
        "US Dept of Commerce"
      ],
@ -710,6 +820,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        -1
+      ],
      "link_texts": [
        "National Oceanic and Atmospheric Administration"
      ],
@ -738,6 +851,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        -1
+      ],
      "link_texts": [
        "National Weather Service"
      ],
@ -832,6 +948,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        -1
+      ],
      "link_texts": [
        "Comments? Questions? Please Contact Us."
      ],
@ -860,6 +979,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        -1
+      ],
      "link_texts": [
        "Disclaimer"
      ],
@ -888,6 +1010,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        -1
+      ],
      "link_texts": [
        "Information Quality"
      ],
@ -916,6 +1041,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        -1
+      ],
      "link_texts": [
        "Help"
      ],
@ -944,6 +1072,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        -1
+      ],
      "link_texts": [
        "Glossary"
      ],
@ -972,6 +1103,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        -1
+      ],
      "link_texts": [
        "Privacy Policy"
      ],
@ -1000,6 +1134,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        -1
+      ],
      "link_texts": [
        "Freedom of Information Act (FOIA)"
      ],
@ -1028,6 +1165,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        -1
+      ],
      "link_texts": [
        "About Us"
      ],
@ -1056,6 +1196,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        -1
+      ],
      "link_texts": [
        "Career Opportunities"
      ],
--- a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605956.json
+++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605956.json
@ -122,6 +122,11 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        43,
+        49,
+        68
+      ],
      "link_texts": [
        "OKRs",
        "project plans",
@ -248,6 +253,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        186
+      ],
      "link_texts": [
        null
      ],
@ -342,6 +350,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        0
+      ],
      "link_texts": [
        null
      ],
@ -630,6 +641,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        4
+      ],
      "link_texts": [
        "user profiles"
      ],
@ -686,6 +700,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        27
+      ],
      "link_texts": [
        "blog posts"
      ],
@ -742,6 +759,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        53
+      ],
      "link_texts": [
        "content report table"
      ],
--- a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/229477.json
+++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/229477.json
@ -60,6 +60,11 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        32,
+        55,
+        79
+      ],
      "link_texts": [
        "Template - Project plan",
        "Template - Meeting notes",
@ -92,6 +97,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        10
+      ],
      "link_texts": [
        "Get the most out of your team space"
      ],
@ -586,6 +594,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        0
+      ],
      "link_texts": [
        null
      ],
--- a/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605859.json
+++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605859.json
@ -60,6 +60,11 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        32,
+        55,
+        79
+      ],
      "link_texts": [
        "Template - Project plan",
        "Template - Meeting notes",
@ -92,6 +97,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        10
+      ],
      "link_texts": [
        "Get the most out of your team space"
      ],
@ -586,6 +594,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        0
+      ],
      "link_texts": [
        null
      ],
--- a/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605989.json
+++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605989.json
@ -454,6 +454,9 @@
        "eng",
        "fra"
      ],
+      "link_start_indexes": [
+        0
+      ],
      "link_texts": [
        "This is the link for unstructured . io."
      ],
--- a/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1802252.json
+++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1802252.json
@ -454,6 +454,9 @@
        "eng",
        "fra"
      ],
+      "link_start_indexes": [
+        0
+      ],
      "link_texts": [
        "This is the link for unstructured . io."
      ],
--- a/test_unstructured_ingest/expected-structured-output/github/test.html.json
+++ b/test_unstructured_ingest/expected-structured-output/github/test.html.json
@ -36,6 +36,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        27
+      ],
      "link_texts": [
        "Github Project Page"
      ],
--- a/test_unstructured_ingest/expected-structured-output/notion/b2a12157-721e-4207-b3b7-527762b782c2.json
+++ b/test_unstructured_ingest/expected-structured-output/notion/b2a12157-721e-4207-b3b7-527762b782c2.json
@ -320,6 +320,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        29
+      ],
      "link_texts": [
        "\n        Notion Editor 101\n      "
      ],
@ -342,6 +345,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        -1
+      ],
      "link_texts": [
        "\n        Notion Editor 101\n      "
      ],
--- a/test_unstructured_ingest/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json
+++ b/test_unstructured_ingest/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json
@ -80,6 +80,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        85
+      ],
      "link_texts": [
        "\n        text\n      "
      ],
@ -102,6 +105,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        -1
+      ],
      "link_texts": [
        "\n        text\n      "
      ],
@ -381,6 +387,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        -1
+      ],
      "link_texts": [
        "\n        https://www.notion.so/icons/airplane_brown.svg\n      "
      ],
@ -403,6 +412,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        -1
+      ],
      "link_texts": [
        "\n        https://www.wikipedia.org/\n      "
      ],
@ -425,6 +437,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        -1
+      ],
      "link_texts": [
        "\n        https://play-lh.googleusercontent.com/KwUBNPbMTk9jDXYS2AeX3illtVRTkrKVh5xR1Mg4WHd0CG2tV4mrh1z3kXi5z_warlk\n      "
      ],
@ -463,6 +478,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        -1
+      ],
      "link_texts": [
        "\n        Analytics\n      "
      ],
@ -501,6 +519,9 @@
      "languages": [
        "eng"
      ],
+      "link_start_indexes": [
+        -1
+      ],
      "link_texts": [
        "\n        Untitled\n      "
      ],
--- a/test_unstructured_ingest/metrics/text-extraction/aggregate-scores-cct.tsv
+++ b/test_unstructured_ingest/metrics/text-extraction/aggregate-scores-cct.tsv
@ -1,3 +1,3 @@
 metric	average	sample_sd	population_sd	count
-cct-accuracy	0.809	0.24	0.233	17
-cct-%missing	0.025	0.032	0.031	17
+cct-accuracy	0.811	0.239	0.232	17
+cct-%missing	0.024	0.032	0.031	17
--- a/test_unstructured_ingest/metrics/text-extraction/all-docs-cct.tsv
+++ b/test_unstructured_ingest/metrics/text-extraction/all-docs-cct.tsv
@ -5,7 +5,7 @@ stanley-cups.xlsx	xlsx	Sharepoint	0.778	0.0
 Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf	pdf	azure	0.981	0.005
 IRS-form-1987.pdf	pdf	azure	0.794	0.135
 spring-weather.html	html	azure	0.0	0.018
-example-10k.html	html	local	0.727	0.037
+example-10k.html	html	local	0.754	0.027
 fake-html-cp1252.html	html	local	0.659	0.0
 ideas-page.html	html	local	0.93	0.033
 UDHR_first_article_all.txt	txt	local-single-file	0.995	0.0
--- a/unstructured/documents/elements.py
+++ b/unstructured/documents/elements.py
@ -188,6 +188,7 @@ class ElementMetadata:
    last_modified: Optional[str]
    link_texts: Optional[list[str]]
    link_urls: Optional[list[str]]
+    link_start_indexes: Optional[list[int]]
    links: Optional[list[Link]]
    # -- used in chunks only, allowing access to element(s) chunk was formed from when enabled --
    orig_elements: Optional[list[Element]]
@ -235,6 +236,7 @@ class ElementMetadata:
        last_modified: Optional[str] = None,
        link_texts: Optional[list[str]] = None,
        link_urls: Optional[list[str]] = None,
+        link_start_indexes: Optional[list[int]] = None,
        links: Optional[list[Link]] = None,
        orig_elements: Optional[list[Element]] = None,
        page_name: Optional[str] = None,
@ -274,6 +276,7 @@ class ElementMetadata:
        self.last_modified = last_modified
        self.link_texts = link_texts
        self.link_urls = link_urls
+        self.link_start_indexes = link_start_indexes
        self.links = links
        self.orig_elements = orig_elements
        self.page_name = page_name
@ -485,6 +488,7 @@ class ConsolidationStrategy(enum.Enum):
            "last_modified": cls.FIRST,
            "link_texts": cls.LIST_CONCATENATE,
            "link_urls": cls.LIST_CONCATENATE,
+            "link_start_indexes": cls.DROP,
            "links": cls.DROP,  # -- deprecated field --
            "max_characters": cls.DROP,  # -- unused, remove from ElementMetadata --
            "orig_elements": cls.DROP,  # -- not expected, added by chunking, not before --
--- a/unstructured/documents/html.py
+++ b/unstructured/documents/html.py
@ -14,7 +14,10 @@ else:

 from lxml import etree

-from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
+from unstructured.cleaners.core import (
+    clean_bullets,
+    replace_unicode_quotes,
+)
 from unstructured.documents.base import Page
 from unstructured.documents.elements import (
    Address,
@ -296,14 +299,22 @@ class HTMLDocument(XMLDocument):
 def _get_links_from_tag(tag_elem: etree._Element) -> List[Link]:
    """Hyperlinks within and below `tag_elem`."""
    links: List[Link] = []
-    href = tag_elem.get("href")
-    # TODO(klaijan) - add html href start_index
-    if href:
-        links.append({"text": tag_elem.text, "url": href, "start_index": -1})
-    for tag in tag_elem.iterdescendants():
-        href = tag.get("href")
-        if href:
-            links.append({"text": tag.text, "url": href, "start_index": -1})
+    tag_elem_href = tag_elem.get("href")
+    if tag_elem_href:
+        tag_elem_text = _construct_text(tag_elem, False)
+        links.append({"text": tag_elem_text, "url": tag_elem_href, "start_index": -1})
+    else:
+        start_index = len(tag_elem.text.lstrip()) if tag_elem.text else 0
+        for tag in tag_elem.iterdescendants():
+            href = tag.get("href")
+            if href:
+                links.append({"text": tag.text, "url": href, "start_index": start_index})
+
+            if tag.text and not (tag.text.isspace()):
+                start_index = start_index + len(tag.text)
+            if tag.tail and not (tag.tail.isspace()):
+                start_index = start_index + len(tag.tail)
+
    return links


--- a/unstructured/partition/common.py
+++ b/unstructured/partition/common.py
@ -294,6 +294,7 @@ def add_element_metadata(
    links = element.links if hasattr(element, "links") and len(element.links) > 0 else None
    link_urls = [link.get("url") for link in links] if links else None
    link_texts = [link.get("text") for link in links] if links else None
+    link_start_indexes = [link.get("start_index") for link in links] if links else None
    emphasized_texts = (
        element.emphasized_texts
        if hasattr(element, "emphasized_texts") and len(element.emphasized_texts) > 0
@ -320,6 +321,7 @@ def add_element_metadata(
        text_as_html=text_as_html,
        link_urls=link_urls,
        link_texts=link_texts,
+        link_start_indexes=link_start_indexes,
        emphasized_text_contents=emphasized_text_contents,
        emphasized_text_tags=emphasized_text_tags,
        section=section,