feat: include hyperlink addresses in DOCXToDocument output (#9109)

* add DOCXLinkFormat * handle page breaks * add sample docx files * make no link extraction the default * reno * docstring and comment
2026-01-06 03:57:19 +00:00 · 2025-03-25 14:33:18 +01:00 · 2025-03-25 14:33:18 +01:00 · e64db61973
commit e64db61973
parent f9cce8bf30
5 changed files with 143 additions and 19 deletions
--- a/haystack/components/converters/docx.py
+++ b/haystack/components/converters/docx.py
@ -22,7 +22,9 @@ with LazyImport("Run 'pip install python-docx'") as docx_import:
    import docx
    from docx.document import Document as DocxDocument
    from docx.table import Table
+    from docx.text.hyperlink import Hyperlink
    from docx.text.paragraph import Paragraph
+    from docx.text.run import Run
    from lxml.etree import _Comment


@ -89,6 +91,31 @@ class DOCXTableFormat(Enum):
        return table_format


+class DOCXLinkFormat(Enum):
+    """
+    The ways hyperlink information from a DOCX file can be represented in a Document.
+    """
+
+    MARKDOWN = "markdown"
+    PLAIN = "plain"
+    NONE = "none"
+
+    def __str__(self):
+        return self.value
+
+    @staticmethod
+    def from_str(string: str) -> "DOCXLinkFormat":
+        """
+        Look up the DOCXLinkFormat member whose value equals `string`, ignoring case.
+
+        :param string: Name of the link format, for example "markdown".
+        :returns: The matching DOCXLinkFormat member.
+        :raises ValueError: If `string` does not name a supported link format.
+        """
+        wanted = string.lower()
+        for candidate in DOCXLinkFormat:
+            if candidate.value == wanted:
+                return candidate
+        supported = [candidate.value for candidate in DOCXLinkFormat]
+        msg = f"Unknown link format '{string}'. Supported formats are: {supported}"
+        raise ValueError(msg)
+
+
@component
 class DOCXToDocument:
    """
@ -99,9 +126,9 @@ class DOCXToDocument:

    Usage example:
    ```python
-    from haystack.components.converters.docx import DOCXToDocument, DOCXTableFormat
+    from haystack.components.converters.docx import DOCXToDocument, DOCXTableFormat, DOCXLinkFormat

-    converter = DOCXToDocument(table_format=DOCXTableFormat.CSV)
+    converter = DOCXToDocument(table_format=DOCXTableFormat.CSV, link_format=DOCXLinkFormat.MARKDOWN)
    results = converter.run(sources=["sample.docx"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
@ -109,18 +136,28 @@ class DOCXToDocument:
    ```
    """

-    def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV, store_full_path: bool = False):
+    def __init__(
+        self,
+        table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV,
+        link_format: Union[str, DOCXLinkFormat] = DOCXLinkFormat.NONE,
+        store_full_path: bool = False,
+    ):
        """
        Create a DOCXToDocument component.

        :param table_format: The format for table output. Can be either DOCXTableFormat.MARKDOWN,
-            DOCXTableFormat.CSV, "markdown", or "csv". Defaults to DOCXTableFormat.CSV.
+            DOCXTableFormat.CSV, "markdown", or "csv".
+        :param link_format: The format for link output. Can be either:
+            DOCXLinkFormat.MARKDOWN or "markdown" to get [text](address),
+            DOCXLinkFormat.PLAIN or "plain" to get text (address),
+            DOCXLinkFormat.NONE or "none" to get text without links.
        :param store_full_path:
            If True, the full path of the file is stored in the metadata of the document.
            If False, only the file name is stored.
        """
        docx_import.check()
        self.table_format = DOCXTableFormat.from_str(table_format) if isinstance(table_format, str) else table_format
+        self.link_format = DOCXLinkFormat.from_str(link_format) if isinstance(link_format, str) else link_format
        self.store_full_path = store_full_path

    def to_dict(self) -> Dict[str, Any]:
@ -130,7 +167,12 @@ class DOCXToDocument:
        :returns:
            Dictionary with serialized data.
        """
-        return default_to_dict(self, table_format=str(self.table_format), store_full_path=self.store_full_path)
+        return default_to_dict(
+            self,
+            table_format=str(self.table_format),
+            link_format=str(self.link_format),
+            store_full_path=self.store_full_path,
+        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DOCXToDocument":
@ -144,6 +186,8 @@ class DOCXToDocument:
        """
        if "table_format" in data["init_parameters"]:
            data["init_parameters"]["table_format"] = DOCXTableFormat.from_str(data["init_parameters"]["table_format"])
+        if "link_format" in data["init_parameters"]:
+            data["init_parameters"]["link_format"] = DOCXLinkFormat.from_str(data["init_parameters"]["link_format"])
        return default_from_dict(cls, data)

    @component.output_types(documents=List[Document])
@ -218,7 +262,7 @@ class DOCXToDocument:
                if paragraph.contains_page_break:
                    para_text = self._process_paragraph_with_page_breaks(paragraph)
                else:
-                    para_text = paragraph.text
+                    para_text = self._process_links_in_paragraph(paragraph)
                elements.append(para_text)
            elif element.tag.endswith("tbl"):
                table = docx.table.Table(element, document)
@ -244,18 +288,42 @@ class DOCXToDocument:
            # Can only extract text from first paragraph page break, unfortunately
            if pb_index == 0:
                if page_break.preceding_paragraph_fragment:
-                    para_text += page_break.preceding_paragraph_fragment.text
+                    para_text += self._process_links_in_paragraph(page_break.preceding_paragraph_fragment)
                para_text += "\f"
                if page_break.following_paragraph_fragment:
                    # following_paragraph_fragment contains all text for remainder of paragraph.
                    # However, if the remainder of the paragraph spans multiple page breaks, it won't include
                    # those later page breaks so we have to add them at end of text in the `else` block below.
                    # This is not ideal, but this case should be very rare and this is likely good enough.
-                    para_text += page_break.following_paragraph_fragment.text
+                    para_text += self._process_links_in_paragraph(page_break.following_paragraph_fragment)
            else:
                para_text += "\f"
        return para_text

+    def _process_links_in_paragraph(self, paragraph: "Paragraph") -> str:
+        """
+        Processes links in a paragraph and formats them according to the specified link format.
+
+        The link target is read from `Hyperlink.url`, which keeps the URI fragment (`#section`) when one is
+        present. Hyperlinks without an address (for example internal jumps to a bookmark) are emitted as
+        their plain text, so that no empty target such as `[text]()` ends up in the output.
+
+        :param paragraph: The DOCX paragraph to process.
+        :returns: The paragraph text, with each addressed hyperlink rendered as `[text](url)` for
+            DOCXLinkFormat.MARKDOWN or `text (url)` for DOCXLinkFormat.PLAIN. With DOCXLinkFormat.NONE the
+            paragraph text is returned unchanged.
+        """
+        if self.link_format == DOCXLinkFormat.NONE:
+            return paragraph.text
+
+        parts: List[str] = []
+        # Iterate over all hyperlinks and other content in the paragraph
+        # https://python-docx.readthedocs.io/en/latest/api/text.html#docx.text.paragraph.Paragraph.iter_inner_content
+        for content in paragraph.iter_inner_content():
+            if isinstance(content, Run):
+                parts.append(content.text)
+            elif isinstance(content, Hyperlink):
+                # `url` is "" when the hyperlink has no address part; otherwise it is the address, with
+                # "#<fragment>" appended when a fragment is present.
+                url = content.url
+                if not url:
+                    # Nothing to link to: keep the visible text only.
+                    parts.append(content.text)
+                elif self.link_format == DOCXLinkFormat.MARKDOWN:
+                    parts.append(f"[{content.text}]({url})")
+                else:  # PLAIN format
+                    parts.append(f"{content.text} ({url})")
+
+        return "".join(parts)
+
    def _table_to_markdown(self, table: "Table") -> str:
        """
        Converts a DOCX table to a Markdown string.
--- a/releasenotes/notes/docx-links-e389808e1ec52d57.yaml
+++ b/releasenotes/notes/docx-links-e389808e1ec52d57.yaml
@ -0,0 +1,5 @@
+---
+features:
+  - |
+    The DOCXToDocument component now has an option to include extracted hyperlink addresses in the output Documents.
+    It accepts a `link_format` parameter that can be set to "markdown", "plain", or "none". With the default, "none", no hyperlink addresses are extracted, so the output is the same as before.
--- a/test/components/converters/test_docx_file_to_document.py
+++ b/test/components/converters/test_docx_file_to_document.py
@ -6,7 +6,7 @@ import csv
 from io import StringIO

 from haystack import Document, Pipeline
-from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument, DOCXTableFormat
+from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument, DOCXTableFormat, DOCXLinkFormat
 from haystack.dataclasses import ByteStream


@ -33,36 +33,36 @@ class TestDOCXToDocument:
        data = converter.to_dict()
        assert data == {
            "type": "haystack.components.converters.docx.DOCXToDocument",
-            "init_parameters": {"store_full_path": False, "table_format": "csv"},
+            "init_parameters": {"store_full_path": False, "table_format": "csv", "link_format": "none"},
        }

    def test_to_dict_custom_parameters(self):
-        converter = DOCXToDocument(table_format="markdown")
+        converter = DOCXToDocument(table_format="markdown", link_format="markdown")
        data = converter.to_dict()
        assert data == {
            "type": "haystack.components.converters.docx.DOCXToDocument",
-            "init_parameters": {"store_full_path": False, "table_format": "markdown"},
+            "init_parameters": {"store_full_path": False, "table_format": "markdown", "link_format": "markdown"},
        }

-        converter = DOCXToDocument(table_format="csv")
+        converter = DOCXToDocument(table_format="csv", link_format="plain")
        data = converter.to_dict()
        assert data == {
            "type": "haystack.components.converters.docx.DOCXToDocument",
-            "init_parameters": {"store_full_path": False, "table_format": "csv"},
+            "init_parameters": {"store_full_path": False, "table_format": "csv", "link_format": "plain"},
        }

-        converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN)
+        converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN, link_format=DOCXLinkFormat.MARKDOWN)
        data = converter.to_dict()
        assert data == {
            "type": "haystack.components.converters.docx.DOCXToDocument",
-            "init_parameters": {"store_full_path": False, "table_format": "markdown"},
+            "init_parameters": {"store_full_path": False, "table_format": "markdown", "link_format": "markdown"},
        }

-        converter = DOCXToDocument(table_format=DOCXTableFormat.CSV)
+        converter = DOCXToDocument(table_format=DOCXTableFormat.CSV, link_format=DOCXLinkFormat.PLAIN)
        data = converter.to_dict()
        assert data == {
            "type": "haystack.components.converters.docx.DOCXToDocument",
-            "init_parameters": {"store_full_path": False, "table_format": "csv"},
+            "init_parameters": {"store_full_path": False, "table_format": "csv", "link_format": "plain"},
        }

    def test_from_dict(self):
@ -76,10 +76,11 @@ class TestDOCXToDocument:
    def test_from_dict_custom_parameters(self):
        data = {
            "type": "haystack.components.converters.docx.DOCXToDocument",
-            "init_parameters": {"table_format": "markdown"},
+            "init_parameters": {"table_format": "markdown", "link_format": "markdown"},
        }
        converter = DOCXToDocument.from_dict(data)
        assert converter.table_format == DOCXTableFormat.MARKDOWN
+        assert converter.link_format == DOCXLinkFormat.MARKDOWN

    def test_from_dict_invalid_table_format(self):
        data = {
@ -397,3 +398,53 @@ class TestDOCXToDocument:
        # check it is JSON serializable
        json_str = json.dumps(doc.to_dict(flatten=False))
        assert json.loads(json_str) == doc.to_dict(flatten=False)
+
+    def test_link_format_initialization(self):
+        # The link format may be passed either as a string or as an enum member.
+        from_string = DOCXToDocument(link_format="markdown")
+        from_enum = DOCXToDocument(link_format=DOCXLinkFormat.PLAIN)
+
+        assert from_string.link_format == DOCXLinkFormat.MARKDOWN
+        assert from_enum.link_format == DOCXLinkFormat.PLAIN
+
+    def test_link_format_invalid(self):
+        # An unrecognised format string must be rejected when the component is built.
+        expected_message = "Unknown link format 'invalid_format'"
+        with pytest.raises(ValueError, match=expected_message):
+            DOCXToDocument(link_format="invalid_format")
+
+    @pytest.mark.parametrize("link_format", ["markdown", "plain"])
+    def test_link_extraction(self, test_files_path, link_format):
+        # The rendered hyperlink expected in the output for each supported format.
+        expected_by_format = {
+            "markdown": "[PDF](https://en.wikipedia.org/wiki/PDF)",
+            "plain": "PDF (https://en.wikipedia.org/wiki/PDF)",
+        }
+        source = test_files_path / "docx" / "sample_docx_with_single_link.docx"
+        converted = DOCXToDocument(link_format=link_format).run(sources=[source])
+        content = converted["documents"][0].content
+
+        assert expected_by_format[link_format] in content
+
+    @pytest.mark.parametrize("link_format", ["markdown", "plain"])
+    def test_link_extraction_page_break(self, test_files_path, link_format):
+        # Every listed link must show up in the converted text, rendered in the requested format.
+        expected_links = {
+            "markdown": [
+                "[PDF](https://en.wikipedia.org/wiki/PDF)",
+                "[of](https://en.wikipedia.org/wiki/OF)",
+                "[charge](https://en.wikipedia.org/wiki/Charge)",
+                "[disambiguation link](https://en.wikipedia.org/wiki/PDF_(disambiguation))",
+            ],
+            "plain": [
+                "PDF (https://en.wikipedia.org/wiki/PDF)",
+                "of (https://en.wikipedia.org/wiki/OF)",
+                "charge (https://en.wikipedia.org/wiki/Charge)",
+                "disambiguation link (https://en.wikipedia.org/wiki/PDF_(disambiguation))",
+            ],
+        }
+        source = test_files_path / "docx" / "sample_docx_with_links.docx"
+        converted = DOCXToDocument(link_format=link_format).run(sources=[source])
+        content = converted["documents"][0].content
+
+        for rendered_link in expected_links[link_format]:
+            assert rendered_link in content
+
+    def test_no_link_extraction(self, test_files_path):
+        # With the default settings neither link rendering may appear in the output.
+        unwanted_renderings = (
+            "[PDF](https://en.wikipedia.org/wiki/PDF)",
+            "PDF (https://en.wikipedia.org/wiki/PDF)",
+        )
+        source = test_files_path / "docx" / "sample_docx_with_single_link.docx"
+        converted = DOCXToDocument().run(sources=[source])
+        content = converted["documents"][0].content
+
+        for rendered_link in unwanted_renderings:
+            assert rendered_link not in content
--- a/test/test_files/docx/sample_docx_with_links.docx
+++ b/test/test_files/docx/sample_docx_with_links.docx
--- a/test/test_files/docx/sample_docx_with_single_link.docx
+++ b/test/test_files/docx/sample_docx_with_single_link.docx