feat: include hyperlink addresses in DOCXToDocument output (#9109)

* add DOCXLinkFormat

* handle page breaks

* add sample docx files

* make no link extraction the default

* reno

* docstring and comment
This commit is contained in:
Julian Risch 2025-03-25 14:33:18 +01:00 committed by GitHub
parent f9cce8bf30
commit e64db61973
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 143 additions and 19 deletions

View File

@ -22,7 +22,9 @@ with LazyImport("Run 'pip install python-docx'") as docx_import:
import docx
from docx.document import Document as DocxDocument
from docx.table import Table
from docx.text.hyperlink import Hyperlink
from docx.text.paragraph import Paragraph
from docx.text.run import Run
from lxml.etree import _Comment
@ -89,6 +91,31 @@ class DOCXTableFormat(Enum):
return table_format
class DOCXLinkFormat(Enum):
    """
    Output styles available for hyperlink information extracted from a DOCX file.

    Each member's value is the lowercase string that `str()` returns for it and that `from_str` accepts.
    """

    MARKDOWN = "markdown"
    PLAIN = "plain"
    NONE = "none"

    def __str__(self):
        return self.value

    @staticmethod
    def from_str(string: str) -> "DOCXLinkFormat":
        """
        Look up the DOCXLinkFormat member whose value equals `string`, ignoring case.

        :param string: Name of the link format, for example "markdown".
        :returns: The matching DOCXLinkFormat member.
        :raises ValueError: If no member has the given value.
        """
        wanted = string.lower()
        for candidate in DOCXLinkFormat:
            if candidate.value == wanted:
                return candidate
        # Nothing matched: report the original (non-lowercased) input and every accepted value.
        supported = [candidate.value for candidate in DOCXLinkFormat]
        msg = f"Unknown link format '{string}'. Supported formats are: {supported}"
        raise ValueError(msg)
@component
class DOCXToDocument:
"""
@ -99,9 +126,9 @@ class DOCXToDocument:
Usage example:
```python
from haystack.components.converters.docx import DOCXToDocument, DOCXTableFormat
from haystack.components.converters.docx import DOCXToDocument, DOCXTableFormat, DOCXLinkFormat
converter = DOCXToDocument(table_format=DOCXTableFormat.CSV)
converter = DOCXToDocument(table_format=DOCXTableFormat.CSV, link_format=DOCXLinkFormat.MARKDOWN)
results = converter.run(sources=["sample.docx"], meta={"date_added": datetime.now().isoformat()})
documents = results["documents"]
print(documents[0].content)
@ -109,18 +136,28 @@ class DOCXToDocument:
```
"""
def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV, store_full_path: bool = False):
def __init__(
self,
table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV,
link_format: Union[str, DOCXLinkFormat] = DOCXLinkFormat.NONE,
store_full_path: bool = False,
):
"""
Create a DOCXToDocument component.
:param table_format: The format for table output. Can be either DOCXTableFormat.MARKDOWN,
DOCXTableFormat.CSV, "markdown", or "csv". Defaults to DOCXTableFormat.CSV.
DOCXTableFormat.CSV, "markdown", or "csv".
:param link_format: The format for link output. Can be either:
DOCXLinkFormat.MARKDOWN or "markdown" to get [text](address),
DOCXLinkFormat.PLAIN or "plain" to get text (address),
DOCXLinkFormat.NONE or "none" to get text without links.
:param store_full_path:
If True, the full path of the file is stored in the metadata of the document.
If False, only the file name is stored.
"""
docx_import.check()
self.table_format = DOCXTableFormat.from_str(table_format) if isinstance(table_format, str) else table_format
self.link_format = DOCXLinkFormat.from_str(link_format) if isinstance(link_format, str) else link_format
self.store_full_path = store_full_path
def to_dict(self) -> Dict[str, Any]:
@ -130,7 +167,12 @@ class DOCXToDocument:
:returns:
Dictionary with serialized data.
"""
return default_to_dict(self, table_format=str(self.table_format), store_full_path=self.store_full_path)
return default_to_dict(
self,
table_format=str(self.table_format),
link_format=str(self.link_format),
store_full_path=self.store_full_path,
)
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "DOCXToDocument":
@ -144,6 +186,8 @@ class DOCXToDocument:
"""
if "table_format" in data["init_parameters"]:
data["init_parameters"]["table_format"] = DOCXTableFormat.from_str(data["init_parameters"]["table_format"])
if "link_format" in data["init_parameters"]:
data["init_parameters"]["link_format"] = DOCXLinkFormat.from_str(data["init_parameters"]["link_format"])
return default_from_dict(cls, data)
@component.output_types(documents=List[Document])
@ -218,7 +262,7 @@ class DOCXToDocument:
if paragraph.contains_page_break:
para_text = self._process_paragraph_with_page_breaks(paragraph)
else:
para_text = paragraph.text
para_text = self._process_links_in_paragraph(paragraph)
elements.append(para_text)
elif element.tag.endswith("tbl"):
table = docx.table.Table(element, document)
@ -244,18 +288,42 @@ class DOCXToDocument:
# Can only extract text from first paragraph page break, unfortunately
if pb_index == 0:
if page_break.preceding_paragraph_fragment:
para_text += page_break.preceding_paragraph_fragment.text
para_text += self._process_links_in_paragraph(page_break.preceding_paragraph_fragment)
para_text += "\f"
if page_break.following_paragraph_fragment:
# following_paragraph_fragment contains all text for remainder of paragraph.
# However, if the remainder of the paragraph spans multiple page breaks, it won't include
# those later page breaks so we have to add them at end of text in the `else` block below.
# This is not ideal, but this case should be very rare and this is likely good enough.
para_text += page_break.following_paragraph_fragment.text
para_text += self._process_links_in_paragraph(page_break.following_paragraph_fragment)
else:
para_text += "\f"
return para_text
def _process_links_in_paragraph(self, paragraph: "Paragraph") -> str:
    """
    Processes links in a paragraph and formats them according to the specified link format.

    :param paragraph: The DOCX paragraph to process.
    :returns: The paragraph text. With `DOCXLinkFormat.NONE` this is `paragraph.text` unchanged.
        Otherwise the text of all runs and hyperlinks is concatenated in document order, and each
        hyperlink that has an address is rendered as `[text](address)` (MARKDOWN) or
        `text (address)` (PLAIN). Hyperlinks without an address contribute only their text.
    """
    if self.link_format == DOCXLinkFormat.NONE:
        return paragraph.text

    parts = []
    # Iterate over all hyperlinks and other content in the paragraph
    # https://python-docx.readthedocs.io/en/latest/api/text.html#docx.text.paragraph.Paragraph.iter_inner_content
    for content in paragraph.iter_inner_content():
        if isinstance(content, Run):
            parts.append(content.text)
        elif isinstance(content, Hyperlink):
            if not content.address:
                # An empty address means there is nothing to put in the link target (python-docx
                # reports a blank address for internal "jump" links). Emit the bare text rather
                # than a dangling "[text]()" or "text ()".
                parts.append(content.text)
            elif self.link_format == DOCXLinkFormat.MARKDOWN:
                parts.append(f"[{content.text}]({content.address})")
            else:  # PLAIN format
                parts.append(f"{content.text} ({content.address})")
    return "".join(parts)
def _table_to_markdown(self, table: "Table") -> str:
"""
Converts a DOCX table to a Markdown string.

View File

@ -0,0 +1,5 @@
---
features:
- |
The DOCXToDocument component can now include hyperlink addresses in the content of the output Documents.
Set the new `link_format` parameter to "markdown" to get `[text](address)` or to "plain" to get `text (address)`. The default, "none", keeps the previous behavior: only the link text is extracted, without the address.

View File

@ -6,7 +6,7 @@ import csv
from io import StringIO
from haystack import Document, Pipeline
from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument, DOCXTableFormat
from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument, DOCXTableFormat, DOCXLinkFormat
from haystack.dataclasses import ByteStream
@ -33,36 +33,36 @@ class TestDOCXToDocument:
data = converter.to_dict()
assert data == {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"store_full_path": False, "table_format": "csv"},
"init_parameters": {"store_full_path": False, "table_format": "csv", "link_format": "none"},
}
def test_to_dict_custom_parameters(self):
converter = DOCXToDocument(table_format="markdown")
converter = DOCXToDocument(table_format="markdown", link_format="markdown")
data = converter.to_dict()
assert data == {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"store_full_path": False, "table_format": "markdown"},
"init_parameters": {"store_full_path": False, "table_format": "markdown", "link_format": "markdown"},
}
converter = DOCXToDocument(table_format="csv")
converter = DOCXToDocument(table_format="csv", link_format="plain")
data = converter.to_dict()
assert data == {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"store_full_path": False, "table_format": "csv"},
"init_parameters": {"store_full_path": False, "table_format": "csv", "link_format": "plain"},
}
converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN)
converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN, link_format=DOCXLinkFormat.MARKDOWN)
data = converter.to_dict()
assert data == {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"store_full_path": False, "table_format": "markdown"},
"init_parameters": {"store_full_path": False, "table_format": "markdown", "link_format": "markdown"},
}
converter = DOCXToDocument(table_format=DOCXTableFormat.CSV)
converter = DOCXToDocument(table_format=DOCXTableFormat.CSV, link_format=DOCXLinkFormat.PLAIN)
data = converter.to_dict()
assert data == {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"store_full_path": False, "table_format": "csv"},
"init_parameters": {"store_full_path": False, "table_format": "csv", "link_format": "plain"},
}
def test_from_dict(self):
@ -76,10 +76,11 @@ class TestDOCXToDocument:
def test_from_dict_custom_parameters(self):
data = {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"table_format": "markdown"},
"init_parameters": {"table_format": "markdown", "link_format": "markdown"},
}
converter = DOCXToDocument.from_dict(data)
assert converter.table_format == DOCXTableFormat.MARKDOWN
assert converter.link_format == DOCXLinkFormat.MARKDOWN
def test_from_dict_invalid_table_format(self):
data = {
@ -397,3 +398,53 @@ class TestDOCXToDocument:
# check it is JSON serializable
json_str = json.dumps(doc.to_dict(flatten=False))
assert json.loads(json_str) == doc.to_dict(flatten=False)
def test_link_format_initialization(self):
    """The link_format argument may be given as a string or as a DOCXLinkFormat member."""
    from_string = DOCXToDocument(link_format="markdown")
    assert from_string.link_format == DOCXLinkFormat.MARKDOWN

    from_enum = DOCXToDocument(link_format=DOCXLinkFormat.PLAIN)
    assert from_enum.link_format == DOCXLinkFormat.PLAIN
def test_link_format_invalid(self):
    """Constructing the converter with an unknown link_format string raises ValueError."""
    expect_unknown_format = pytest.raises(ValueError, match="Unknown link format 'invalid_format'")
    with expect_unknown_format:
        DOCXToDocument(link_format="invalid_format")
@pytest.mark.parametrize("link_format", ["markdown", "plain"])
def test_link_extraction(self, test_files_path, link_format):
    """The single-link sample file yields the link rendered in the requested format."""
    sample = test_files_path / "docx" / "sample_docx_with_single_link.docx"
    converter = DOCXToDocument(link_format=link_format)
    result = converter.run(sources=[sample])
    content = result["documents"][0].content

    if link_format == "markdown":
        expected = "[PDF](https://en.wikipedia.org/wiki/PDF)"
    else:  # plain format
        expected = "PDF (https://en.wikipedia.org/wiki/PDF)"
    assert expected in content
@pytest.mark.parametrize("link_format", ["markdown", "plain"])
def test_link_extraction_page_break(self, test_files_path, link_format):
    """Every link in the multi-link sample file appears in the content in the requested format."""
    sample = test_files_path / "docx" / "sample_docx_with_links.docx"
    converter = DOCXToDocument(link_format=link_format)
    result = converter.run(sources=[sample])
    content = result["documents"][0].content

    if link_format == "markdown":
        expected_snippets = [
            "[PDF](https://en.wikipedia.org/wiki/PDF)",
            "[of](https://en.wikipedia.org/wiki/OF)",
            "[charge](https://en.wikipedia.org/wiki/Charge)",
            "[disambiguation link](https://en.wikipedia.org/wiki/PDF_(disambiguation))",
        ]
    else:  # plain format
        expected_snippets = [
            "PDF (https://en.wikipedia.org/wiki/PDF)",
            "of (https://en.wikipedia.org/wiki/OF)",
            "charge (https://en.wikipedia.org/wiki/Charge)",
            "disambiguation link (https://en.wikipedia.org/wiki/PDF_(disambiguation))",
        ]

    # Checked in the listed order, so the first missing link is the one reported.
    for snippet in expected_snippets:
        assert snippet in content
def test_no_link_extraction(self, test_files_path):
    """A converter built with default arguments does not add link addresses to the content."""
    sample = test_files_path / "docx" / "sample_docx_with_single_link.docx"
    result = DOCXToDocument().run(sources=[sample])
    content = result["documents"][0].content

    # Neither the markdown nor the plain rendering of the link may be present.
    unwanted_snippets = (
        "[PDF](https://en.wikipedia.org/wiki/PDF)",
        "PDF (https://en.wikipedia.org/wiki/PDF)",
    )
    for snippet in unwanted_snippets:
        assert snippet not in content

Binary file not shown.

Binary file not shown.