mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-06 03:57:19 +00:00
feat: include hyperlink addresses in DOCXToDocument output (#9109)
* add DOCXLinkFormat * handle page breaks * add sample docx files * make no link extraction the default * reno * docstring and comment
This commit is contained in:
parent
f9cce8bf30
commit
e64db61973
@ -22,7 +22,9 @@ with LazyImport("Run 'pip install python-docx'") as docx_import:
|
||||
import docx
|
||||
from docx.document import Document as DocxDocument
|
||||
from docx.table import Table
|
||||
from docx.text.hyperlink import Hyperlink
|
||||
from docx.text.paragraph import Paragraph
|
||||
from docx.text.run import Run
|
||||
from lxml.etree import _Comment
|
||||
|
||||
|
||||
@ -89,6 +91,31 @@ class DOCXTableFormat(Enum):
|
||||
return table_format
|
||||
|
||||
|
||||
class DOCXLinkFormat(Enum):
    """
    Enumerates the ways hyperlink information from a DOCX file can be stored in a Document.

    Each member's value is the lowercase string accepted by `from_str`.
    """

    MARKDOWN = "markdown"
    PLAIN = "plain"
    NONE = "none"

    def __str__(self):
        # Render the member as its raw string value (e.g. "markdown").
        return self.value

    @staticmethod
    def from_str(string: str) -> "DOCXLinkFormat":
        """
        Look up the DOCXLinkFormat member whose value matches a string, ignoring case.

        :param string: The name of the link format, for example "markdown".
        :returns: The matching DOCXLinkFormat member.
        :raises ValueError: If no member has the given value.
        """
        wanted = string.lower()
        for candidate in DOCXLinkFormat:
            if candidate.value == wanted:
                return candidate

        supported = [candidate.value for candidate in DOCXLinkFormat]
        msg = f"Unknown link format '{string}'. Supported formats are: {supported}"
        raise ValueError(msg)
|
||||
|
||||
|
||||
@component
|
||||
class DOCXToDocument:
|
||||
"""
|
||||
@ -99,9 +126,9 @@ class DOCXToDocument:
|
||||
|
||||
Usage example:
|
||||
```python
|
||||
from haystack.components.converters.docx import DOCXToDocument, DOCXTableFormat
|
||||
from haystack.components.converters.docx import DOCXToDocument, DOCXTableFormat, DOCXLinkFormat
|
||||
|
||||
converter = DOCXToDocument(table_format=DOCXTableFormat.CSV)
|
||||
converter = DOCXToDocument(table_format=DOCXTableFormat.CSV, link_format=DOCXLinkFormat.MARKDOWN)
|
||||
results = converter.run(sources=["sample.docx"], meta={"date_added": datetime.now().isoformat()})
|
||||
documents = results["documents"]
|
||||
print(documents[0].content)
|
||||
@ -109,18 +136,28 @@ class DOCXToDocument:
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV, store_full_path: bool = False):
|
||||
def __init__(
|
||||
self,
|
||||
table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV,
|
||||
link_format: Union[str, DOCXLinkFormat] = DOCXLinkFormat.NONE,
|
||||
store_full_path: bool = False,
|
||||
):
|
||||
"""
|
||||
Create a DOCXToDocument component.
|
||||
|
||||
:param table_format: The format for table output. Can be either DOCXTableFormat.MARKDOWN,
|
||||
DOCXTableFormat.CSV, "markdown", or "csv". Defaults to DOCXTableFormat.CSV.
|
||||
DOCXTableFormat.CSV, "markdown", or "csv".
|
||||
:param link_format: The format for link output. Can be either:
|
||||
DOCXLinkFormat.MARKDOWN or "markdown" to get [text](address),
|
||||
DOCXLinkFormat.PLAIN or "plain" to get text (address),
|
||||
DOCXLinkFormat.NONE or "none" to get text without links.
|
||||
:param store_full_path:
|
||||
If True, the full path of the file is stored in the metadata of the document.
|
||||
If False, only the file name is stored.
|
||||
"""
|
||||
docx_import.check()
|
||||
self.table_format = DOCXTableFormat.from_str(table_format) if isinstance(table_format, str) else table_format
|
||||
self.link_format = DOCXLinkFormat.from_str(link_format) if isinstance(link_format, str) else link_format
|
||||
self.store_full_path = store_full_path
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
@ -130,7 +167,12 @@ class DOCXToDocument:
|
||||
:returns:
|
||||
Dictionary with serialized data.
|
||||
"""
|
||||
return default_to_dict(self, table_format=str(self.table_format), store_full_path=self.store_full_path)
|
||||
return default_to_dict(
|
||||
self,
|
||||
table_format=str(self.table_format),
|
||||
link_format=str(self.link_format),
|
||||
store_full_path=self.store_full_path,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: Dict[str, Any]) -> "DOCXToDocument":
|
||||
@ -144,6 +186,8 @@ class DOCXToDocument:
|
||||
"""
|
||||
if "table_format" in data["init_parameters"]:
|
||||
data["init_parameters"]["table_format"] = DOCXTableFormat.from_str(data["init_parameters"]["table_format"])
|
||||
if "link_format" in data["init_parameters"]:
|
||||
data["init_parameters"]["link_format"] = DOCXLinkFormat.from_str(data["init_parameters"]["link_format"])
|
||||
return default_from_dict(cls, data)
|
||||
|
||||
@component.output_types(documents=List[Document])
|
||||
@ -218,7 +262,7 @@ class DOCXToDocument:
|
||||
if paragraph.contains_page_break:
|
||||
para_text = self._process_paragraph_with_page_breaks(paragraph)
|
||||
else:
|
||||
para_text = paragraph.text
|
||||
para_text = self._process_links_in_paragraph(paragraph)
|
||||
elements.append(para_text)
|
||||
elif element.tag.endswith("tbl"):
|
||||
table = docx.table.Table(element, document)
|
||||
@ -244,18 +288,42 @@ class DOCXToDocument:
|
||||
# Can only extract text from first paragraph page break, unfortunately
|
||||
if pb_index == 0:
|
||||
if page_break.preceding_paragraph_fragment:
|
||||
para_text += page_break.preceding_paragraph_fragment.text
|
||||
para_text += self._process_links_in_paragraph(page_break.preceding_paragraph_fragment)
|
||||
para_text += "\f"
|
||||
if page_break.following_paragraph_fragment:
|
||||
# following_paragraph_fragment contains all text for remainder of paragraph.
|
||||
# However, if the remainder of the paragraph spans multiple page breaks, it won't include
|
||||
# those later page breaks so we have to add them at end of text in the `else` block below.
|
||||
# This is not ideal, but this case should be very rare and this is likely good enough.
|
||||
para_text += page_break.following_paragraph_fragment.text
|
||||
para_text += self._process_links_in_paragraph(page_break.following_paragraph_fragment)
|
||||
else:
|
||||
para_text += "\f"
|
||||
return para_text
|
||||
|
||||
def _process_links_in_paragraph(self, paragraph: "Paragraph") -> str:
    """
    Processes links in a paragraph and formats them according to the specified link format.

    With DOCXLinkFormat.NONE the paragraph text is returned unchanged. Otherwise the paragraph's runs and
    hyperlinks are walked in document order and each hyperlink is rendered as `[text](address)` (MARKDOWN)
    or `text (address)` (PLAIN). Hyperlinks without an address are rendered as their text only.

    :param paragraph: The DOCX paragraph to process.
    :returns: A string with links formatted according to the specified format.
    """
    if self.link_format == DOCXLinkFormat.NONE:
        return paragraph.text

    pieces = []
    # Iterate over all hyperlinks and other content in the paragraph
    # https://python-docx.readthedocs.io/en/latest/api/text.html#docx.text.paragraph.Paragraph.iter_inner_content
    for content in paragraph.iter_inner_content():
        if isinstance(content, Run):
            pieces.append(content.text)
        elif isinstance(content, Hyperlink):
            if not content.address:
                # python-docx reports a blank address for internal "jump" links (e.g. table-of-contents
                # entries). Keep only the text so the output does not contain "[text]()" or "text ()".
                pieces.append(content.text)
            elif self.link_format == DOCXLinkFormat.MARKDOWN:
                pieces.append(f"[{content.text}]({content.address})")
            else:  # PLAIN format
                pieces.append(f"{content.text} ({content.address})")

    return "".join(pieces)
|
||||
|
||||
def _table_to_markdown(self, table: "Table") -> str:
|
||||
"""
|
||||
Converts a DOCX table to a Markdown string.
|
||||
|
||||
5
releasenotes/notes/docx-links-e389808e1ec52d57.yaml
Normal file
5
releasenotes/notes/docx-links-e389808e1ec52d57.yaml
Normal file
@ -0,0 +1,5 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
The DOCXToDocument component now has an option to include extracted hyperlink addresses in the output Documents.
|
||||
It accepts a `link_format` parameter that can be set to "markdown", "plain", or "none". The default is "none", which extracts no hyperlink addresses and so keeps the previous behavior.
|
||||
@ -6,7 +6,7 @@ import csv
|
||||
from io import StringIO
|
||||
|
||||
from haystack import Document, Pipeline
|
||||
from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument, DOCXTableFormat
|
||||
from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument, DOCXTableFormat, DOCXLinkFormat
|
||||
from haystack.dataclasses import ByteStream
|
||||
|
||||
|
||||
@ -33,36 +33,36 @@ class TestDOCXToDocument:
|
||||
data = converter.to_dict()
|
||||
assert data == {
|
||||
"type": "haystack.components.converters.docx.DOCXToDocument",
|
||||
"init_parameters": {"store_full_path": False, "table_format": "csv"},
|
||||
"init_parameters": {"store_full_path": False, "table_format": "csv", "link_format": "none"},
|
||||
}
|
||||
|
||||
def test_to_dict_custom_parameters(self):
|
||||
converter = DOCXToDocument(table_format="markdown")
|
||||
converter = DOCXToDocument(table_format="markdown", link_format="markdown")
|
||||
data = converter.to_dict()
|
||||
assert data == {
|
||||
"type": "haystack.components.converters.docx.DOCXToDocument",
|
||||
"init_parameters": {"store_full_path": False, "table_format": "markdown"},
|
||||
"init_parameters": {"store_full_path": False, "table_format": "markdown", "link_format": "markdown"},
|
||||
}
|
||||
|
||||
converter = DOCXToDocument(table_format="csv")
|
||||
converter = DOCXToDocument(table_format="csv", link_format="plain")
|
||||
data = converter.to_dict()
|
||||
assert data == {
|
||||
"type": "haystack.components.converters.docx.DOCXToDocument",
|
||||
"init_parameters": {"store_full_path": False, "table_format": "csv"},
|
||||
"init_parameters": {"store_full_path": False, "table_format": "csv", "link_format": "plain"},
|
||||
}
|
||||
|
||||
converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN)
|
||||
converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN, link_format=DOCXLinkFormat.MARKDOWN)
|
||||
data = converter.to_dict()
|
||||
assert data == {
|
||||
"type": "haystack.components.converters.docx.DOCXToDocument",
|
||||
"init_parameters": {"store_full_path": False, "table_format": "markdown"},
|
||||
"init_parameters": {"store_full_path": False, "table_format": "markdown", "link_format": "markdown"},
|
||||
}
|
||||
|
||||
converter = DOCXToDocument(table_format=DOCXTableFormat.CSV)
|
||||
converter = DOCXToDocument(table_format=DOCXTableFormat.CSV, link_format=DOCXLinkFormat.PLAIN)
|
||||
data = converter.to_dict()
|
||||
assert data == {
|
||||
"type": "haystack.components.converters.docx.DOCXToDocument",
|
||||
"init_parameters": {"store_full_path": False, "table_format": "csv"},
|
||||
"init_parameters": {"store_full_path": False, "table_format": "csv", "link_format": "plain"},
|
||||
}
|
||||
|
||||
def test_from_dict(self):
|
||||
@ -76,10 +76,11 @@ class TestDOCXToDocument:
|
||||
def test_from_dict_custom_parameters(self):
|
||||
data = {
|
||||
"type": "haystack.components.converters.docx.DOCXToDocument",
|
||||
"init_parameters": {"table_format": "markdown"},
|
||||
"init_parameters": {"table_format": "markdown", "link_format": "markdown"},
|
||||
}
|
||||
converter = DOCXToDocument.from_dict(data)
|
||||
assert converter.table_format == DOCXTableFormat.MARKDOWN
|
||||
assert converter.link_format == DOCXLinkFormat.MARKDOWN
|
||||
|
||||
def test_from_dict_invalid_table_format(self):
|
||||
data = {
|
||||
@ -397,3 +398,53 @@ class TestDOCXToDocument:
|
||||
# check it is JSON serializable
|
||||
json_str = json.dumps(doc.to_dict(flatten=False))
|
||||
assert json.loads(json_str) == doc.to_dict(flatten=False)
|
||||
|
||||
def test_link_format_initialization(self):
    """The link format can be given either as a string or as a DOCXLinkFormat member."""
    built_from_string = DOCXToDocument(link_format="markdown")
    built_from_enum = DOCXToDocument(link_format=DOCXLinkFormat.PLAIN)

    assert built_from_string.link_format == DOCXLinkFormat.MARKDOWN
    assert built_from_enum.link_format == DOCXLinkFormat.PLAIN
|
||||
|
||||
def test_link_format_invalid(self):
    """Constructing the converter with an unsupported link format string raises ValueError."""
    expected_error = "Unknown link format 'invalid_format'"
    with pytest.raises(ValueError, match=expected_error):
        DOCXToDocument(link_format="invalid_format")
|
||||
|
||||
@pytest.mark.parametrize("link_format", ["markdown", "plain"])
def test_link_extraction(self, test_files_path, link_format):
    """A single hyperlink is rendered in the requested link format."""
    # Expected rendering of the link for each supported format.
    expected_by_format = {
        "markdown": "[PDF](https://en.wikipedia.org/wiki/PDF)",
        "plain": "PDF (https://en.wikipedia.org/wiki/PDF)",
    }
    source = test_files_path / "docx" / "sample_docx_with_single_link.docx"

    result = DOCXToDocument(link_format=link_format).run(sources=[source])

    assert expected_by_format[link_format] in result["documents"][0].content
|
||||
|
||||
@pytest.mark.parametrize("link_format", ["markdown", "plain"])
def test_link_extraction_page_break(self, test_files_path, link_format):
    """Every hyperlink in the multi-link sample document is rendered in the requested link format."""
    # Expected renderings of each link, keyed by link format.
    expected_by_format = {
        "markdown": [
            "[PDF](https://en.wikipedia.org/wiki/PDF)",
            "[of](https://en.wikipedia.org/wiki/OF)",
            "[charge](https://en.wikipedia.org/wiki/Charge)",
            "[disambiguation link](https://en.wikipedia.org/wiki/PDF_(disambiguation))",
        ],
        "plain": [
            "PDF (https://en.wikipedia.org/wiki/PDF)",
            "of (https://en.wikipedia.org/wiki/OF)",
            "charge (https://en.wikipedia.org/wiki/Charge)",
            "disambiguation link (https://en.wikipedia.org/wiki/PDF_(disambiguation))",
        ],
    }
    source = test_files_path / "docx" / "sample_docx_with_links.docx"

    result = DOCXToDocument(link_format=link_format).run(sources=[source])
    content = result["documents"][0].content

    for rendered_link in expected_by_format[link_format]:
        assert rendered_link in content
|
||||
|
||||
def test_no_link_extraction(self, test_files_path):
    """With the default settings, neither link rendering appears in the converted content."""
    source = test_files_path / "docx" / "sample_docx_with_single_link.docx"

    result = DOCXToDocument().run(sources=[source])
    content = result["documents"][0].content

    unwanted_renderings = ("[PDF](https://en.wikipedia.org/wiki/PDF)", "PDF (https://en.wikipedia.org/wiki/PDF)")
    for rendering in unwanted_renderings:
        assert rendering not in content
|
||||
|
||||
BIN
test/test_files/docx/sample_docx_with_links.docx
Normal file
BIN
test/test_files/docx/sample_docx_with_links.docx
Normal file
Binary file not shown.
BIN
test/test_files/docx/sample_docx_with_single_link.docx
Normal file
BIN
test/test_files/docx/sample_docx_with_single_link.docx
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user