From 3783b44d0be44fca1335000ee55fa17abf3ad8df Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Mon, 4 Mar 2024 12:33:42 -0600 Subject: [PATCH] fix documentation html links example (#2608) Closes #2577 Testing: ``` from unstructured.partition.html import partition_html cnn_lite_url = "https://lite.cnn.com/" elements = partition_html(url=cnn_lite_url) links = [] for element in elements: if element.metadata.link_urls: relative_link = element.metadata.link_urls[0][1:] if relative_link.startswith("2024"): links.append(f"{cnn_lite_url}{relative_link}") print(links) ``` --------- Co-authored-by: ron-unstructured Co-authored-by: Ronny H <138828701+ron-unstructured@users.noreply.github.com> --- CHANGELOG.md | 2 +- docs/requirements.txt | 2 +- docs/source/examples/chroma.rst | 6 +++--- requirements/build.txt | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 55d73a4c2..fd311a5c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ ### Features ### Fixes - + * **Fix SharePoint dates with inconsistent formatting** Adds logic to conditionally support dates returned by office365 that may vary in date formatting or may be a datetime rather than a string. * **Include warnings** about the potential risk of installing a version of `pandoc` which does not support RTF files + instructions that will help resolve that issue. * **Incorporate the `install-pandoc` Makefile recipe** into relevant stages of CI workflow, ensuring it is a version that supports RTF input files. diff --git a/docs/requirements.txt b/docs/requirements.txt index 2aab5838c..3063f1683 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=build.txt build.in # -alabaster==0.7.16 +alabaster==0.7.13 # via sphinx babel==2.14.0 # via sphinx diff --git a/docs/source/examples/chroma.rst b/docs/source/examples/chroma.rst index a6de49cc5..8797c0938 100644 --- a/docs/source/examples/chroma.rst +++ b/docs/source/examples/chroma.rst @@ -20,9 +20,9 @@ First, we gather links from the CNN Lite homepage using the `partition_html` fun links = [] for element in elements: - if element.metadata.links is not None: - relative_link = element.metadata.links[0]["url"][1:] - if relative_link.startswith("2023"): + if element.metadata.link_urls: + relative_link = element.metadata.link_urls[0][1:] + if relative_link.startswith("2024"): links.append(f"{cnn_lite_url}{relative_link}") Ingest Individual Articles with UnstructuredURLLoader diff --git a/requirements/build.txt b/requirements/build.txt index 2aab5838c..3063f1683 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=build.txt build.in # -alabaster==0.7.16 +alabaster==0.7.13 # via sphinx babel==2.14.0 # via sphinx