diff --git a/haystack/components/converters/docx.py b/haystack/components/converters/docx.py index 3607ae454..8a8947bb2 100644 --- a/haystack/components/converters/docx.py +++ b/haystack/components/converters/docx.py @@ -22,7 +22,9 @@ with LazyImport("Run 'pip install python-docx'") as docx_import: import docx from docx.document import Document as DocxDocument from docx.table import Table + from docx.text.hyperlink import Hyperlink from docx.text.paragraph import Paragraph + from docx.text.run import Run from lxml.etree import _Comment @@ -89,6 +91,31 @@ class DOCXTableFormat(Enum): return table_format +class DOCXLinkFormat(Enum): + """ + Supported formats for storing DOCX link information in a Document. + """ + + MARKDOWN = "markdown" + PLAIN = "plain" + NONE = "none" + + def __str__(self): + return self.value + + @staticmethod + def from_str(string: str) -> "DOCXLinkFormat": + """ + Convert a string to a DOCXLinkFormat enum. + """ + enum_map = {e.value: e for e in DOCXLinkFormat} + link_format = enum_map.get(string.lower()) + if link_format is None: + msg = f"Unknown link format '{string}'. 
Supported formats are: {list(enum_map.keys())}" + raise ValueError(msg) + return link_format + + @component class DOCXToDocument: """ @@ -99,9 +126,9 @@ class DOCXToDocument: Usage example: ```python - from haystack.components.converters.docx import DOCXToDocument, DOCXTableFormat + from haystack.components.converters.docx import DOCXToDocument, DOCXTableFormat, DOCXLinkFormat - converter = DOCXToDocument(table_format=DOCXTableFormat.CSV) + converter = DOCXToDocument(table_format=DOCXTableFormat.CSV, link_format=DOCXLinkFormat.MARKDOWN) results = converter.run(sources=["sample.docx"], meta={"date_added": datetime.now().isoformat()}) documents = results["documents"] print(documents[0].content) @@ -109,18 +136,28 @@ class DOCXToDocument: ``` """ - def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV, store_full_path: bool = False): + def __init__( + self, + table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV, + link_format: Union[str, DOCXLinkFormat] = DOCXLinkFormat.NONE, + store_full_path: bool = False, + ): """ Create a DOCXToDocument component. :param table_format: The format for table output. Can be either DOCXTableFormat.MARKDOWN, - DOCXTableFormat.CSV, "markdown", or "csv". Defaults to DOCXTableFormat.CSV. + DOCXTableFormat.CSV, "markdown", or "csv". + :param link_format: The format for link output. Can be either: + DOCXLinkFormat.MARKDOWN or "markdown" to get [text](address), + DOCXLinkFormat.PLAIN or "plain" to get text (address), + DOCXLinkFormat.NONE or "none" to get text without links. :param store_full_path: If True, the full path of the file is stored in the metadata of the document. If False, only the file name is stored. 
""" docx_import.check() self.table_format = DOCXTableFormat.from_str(table_format) if isinstance(table_format, str) else table_format + self.link_format = DOCXLinkFormat.from_str(link_format) if isinstance(link_format, str) else link_format self.store_full_path = store_full_path def to_dict(self) -> Dict[str, Any]: @@ -130,7 +167,12 @@ class DOCXToDocument: :returns: Dictionary with serialized data. """ - return default_to_dict(self, table_format=str(self.table_format), store_full_path=self.store_full_path) + return default_to_dict( + self, + table_format=str(self.table_format), + link_format=str(self.link_format), + store_full_path=self.store_full_path, + ) @classmethod def from_dict(cls, data: Dict[str, Any]) -> "DOCXToDocument": @@ -144,6 +186,8 @@ class DOCXToDocument: """ if "table_format" in data["init_parameters"]: data["init_parameters"]["table_format"] = DOCXTableFormat.from_str(data["init_parameters"]["table_format"]) + if "link_format" in data["init_parameters"]: + data["init_parameters"]["link_format"] = DOCXLinkFormat.from_str(data["init_parameters"]["link_format"]) return default_from_dict(cls, data) @component.output_types(documents=List[Document]) @@ -218,7 +262,7 @@ class DOCXToDocument: if paragraph.contains_page_break: para_text = self._process_paragraph_with_page_breaks(paragraph) else: - para_text = paragraph.text + para_text = self._process_links_in_paragraph(paragraph) elements.append(para_text) elif element.tag.endswith("tbl"): table = docx.table.Table(element, document) @@ -244,18 +288,42 @@ class DOCXToDocument: # Can only extract text from first paragraph page break, unfortunately if pb_index == 0: if page_break.preceding_paragraph_fragment: - para_text += page_break.preceding_paragraph_fragment.text + para_text += self._process_links_in_paragraph(page_break.preceding_paragraph_fragment) para_text += "\f" if page_break.following_paragraph_fragment: # following_paragraph_fragment contains all text for remainder of paragraph. 
# However, if the remainder of the paragraph spans multiple page breaks, it won't include # those later page breaks so we have to add them at end of text in the `else` block below. # This is not ideal, but this case should be very rare and this is likely good enough. - para_text += page_break.following_paragraph_fragment.text + para_text += self._process_links_in_paragraph(page_break.following_paragraph_fragment) else: para_text += "\f" return para_text + def _process_links_in_paragraph(self, paragraph: "Paragraph") -> str: + """ + Processes links in a paragraph and formats them according to the specified link format. + + :param paragraph: The DOCX paragraph to process. + :returns: A string with links formatted according to the specified format. + """ + if self.link_format == DOCXLinkFormat.NONE: + return paragraph.text + text = "" + # Iterate over all hyperlinks and other content in the paragraph + # https://python-docx.readthedocs.io/en/latest/api/text.html#docx.text.paragraph.Paragraph.iter_inner_content + for content in paragraph.iter_inner_content(): + if isinstance(content, Run): + text += content.text + elif isinstance(content, Hyperlink): + if self.link_format == DOCXLinkFormat.MARKDOWN: + formatted_link = f"[{content.text}]({content.address})" + else: # PLAIN format + formatted_link = f"{content.text} ({content.address})" + text += formatted_link + + return text + def _table_to_markdown(self, table: "Table") -> str: """ Converts a DOCX table to a Markdown string. diff --git a/releasenotes/notes/docx-links-e389808e1ec52d57.yaml b/releasenotes/notes/docx-links-e389808e1ec52d57.yaml new file mode 100644 index 000000000..904b1529e --- /dev/null +++ b/releasenotes/notes/docx-links-e389808e1ec52d57.yaml @@ -0,0 +1,5 @@ +--- +features: + - | + The DOCXToDocument component now has an option to include extracted hyperlink addresses in the output Documents. + It accepts a `link_format` parameter that can be set to "markdown" or "plain". 
As before, no hyperlink addresses are extracted by default. diff --git a/test/components/converters/test_docx_file_to_document.py b/test/components/converters/test_docx_file_to_document.py index 2cc99d299..5130ee5ae 100644 --- a/test/components/converters/test_docx_file_to_document.py +++ b/test/components/converters/test_docx_file_to_document.py @@ -6,7 +6,7 @@ import csv from io import StringIO from haystack import Document, Pipeline -from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument, DOCXTableFormat +from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument, DOCXTableFormat, DOCXLinkFormat from haystack.dataclasses import ByteStream @@ -33,36 +33,36 @@ class TestDOCXToDocument: data = converter.to_dict() assert data == { "type": "haystack.components.converters.docx.DOCXToDocument", - "init_parameters": {"store_full_path": False, "table_format": "csv"}, + "init_parameters": {"store_full_path": False, "table_format": "csv", "link_format": "none"}, } def test_to_dict_custom_parameters(self): - converter = DOCXToDocument(table_format="markdown") + converter = DOCXToDocument(table_format="markdown", link_format="markdown") data = converter.to_dict() assert data == { "type": "haystack.components.converters.docx.DOCXToDocument", - "init_parameters": {"store_full_path": False, "table_format": "markdown"}, + "init_parameters": {"store_full_path": False, "table_format": "markdown", "link_format": "markdown"}, } - converter = DOCXToDocument(table_format="csv") + converter = DOCXToDocument(table_format="csv", link_format="plain") data = converter.to_dict() assert data == { "type": "haystack.components.converters.docx.DOCXToDocument", - "init_parameters": {"store_full_path": False, "table_format": "csv"}, + "init_parameters": {"store_full_path": False, "table_format": "csv", "link_format": "plain"}, } - converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN) + converter = 
DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN, link_format=DOCXLinkFormat.MARKDOWN) data = converter.to_dict() assert data == { "type": "haystack.components.converters.docx.DOCXToDocument", - "init_parameters": {"store_full_path": False, "table_format": "markdown"}, + "init_parameters": {"store_full_path": False, "table_format": "markdown", "link_format": "markdown"}, } - converter = DOCXToDocument(table_format=DOCXTableFormat.CSV) + converter = DOCXToDocument(table_format=DOCXTableFormat.CSV, link_format=DOCXLinkFormat.PLAIN) data = converter.to_dict() assert data == { "type": "haystack.components.converters.docx.DOCXToDocument", - "init_parameters": {"store_full_path": False, "table_format": "csv"}, + "init_parameters": {"store_full_path": False, "table_format": "csv", "link_format": "plain"}, } def test_from_dict(self): @@ -76,10 +76,11 @@ class TestDOCXToDocument: def test_from_dict_custom_parameters(self): data = { "type": "haystack.components.converters.docx.DOCXToDocument", - "init_parameters": {"table_format": "markdown"}, + "init_parameters": {"table_format": "markdown", "link_format": "markdown"}, } converter = DOCXToDocument.from_dict(data) assert converter.table_format == DOCXTableFormat.MARKDOWN + assert converter.link_format == DOCXLinkFormat.MARKDOWN def test_from_dict_invalid_table_format(self): data = { @@ -397,3 +398,53 @@ class TestDOCXToDocument: # check it is JSON serializable json_str = json.dumps(doc.to_dict(flatten=False)) assert json.loads(json_str) == doc.to_dict(flatten=False) + + def test_link_format_initialization(self): + converter = DOCXToDocument(link_format="markdown") + assert converter.link_format == DOCXLinkFormat.MARKDOWN + + converter = DOCXToDocument(link_format=DOCXLinkFormat.PLAIN) + assert converter.link_format == DOCXLinkFormat.PLAIN + + def test_link_format_invalid(self): + with pytest.raises(ValueError, match="Unknown link format 'invalid_format'"): + DOCXToDocument(link_format="invalid_format") + + 
@pytest.mark.parametrize("link_format", ["markdown", "plain"]) + def test_link_extraction(self, test_files_path, link_format): + docx_converter = DOCXToDocument(link_format=link_format) + paths = [test_files_path / "docx" / "sample_docx_with_single_link.docx"] + output = docx_converter.run(sources=paths) + content = output["documents"][0].content + + if link_format == "markdown": + assert "[PDF](https://en.wikipedia.org/wiki/PDF)" in content + else: # plain format + assert "PDF (https://en.wikipedia.org/wiki/PDF)" in content + + @pytest.mark.parametrize("link_format", ["markdown", "plain"]) + def test_link_extraction_page_break(self, test_files_path, link_format): + docx_converter = DOCXToDocument(link_format=link_format) + paths = [test_files_path / "docx" / "sample_docx_with_links.docx"] + output = docx_converter.run(sources=paths) + content = output["documents"][0].content + + if link_format == "markdown": + assert "[PDF](https://en.wikipedia.org/wiki/PDF)" in content + assert "[of](https://en.wikipedia.org/wiki/OF)" in content + assert "[charge](https://en.wikipedia.org/wiki/Charge)" in content + assert "[disambiguation link](https://en.wikipedia.org/wiki/PDF_(disambiguation))" in content + else: # plain format + assert "PDF (https://en.wikipedia.org/wiki/PDF)" in content + assert "of (https://en.wikipedia.org/wiki/OF)" in content + assert "charge (https://en.wikipedia.org/wiki/Charge)" in content + assert "disambiguation link (https://en.wikipedia.org/wiki/PDF_(disambiguation))" in content + + def test_no_link_extraction(self, test_files_path): + docx_converter = DOCXToDocument() + paths = [test_files_path / "docx" / "sample_docx_with_single_link.docx"] + output = docx_converter.run(sources=paths) + content = output["documents"][0].content + + assert "[PDF](https://en.wikipedia.org/wiki/PDF)" not in content + assert "PDF (https://en.wikipedia.org/wiki/PDF)" not in content diff --git a/test/test_files/docx/sample_docx_with_links.docx 
b/test/test_files/docx/sample_docx_with_links.docx new file mode 100644 index 000000000..dff090149 Binary files /dev/null and b/test/test_files/docx/sample_docx_with_links.docx differ diff --git a/test/test_files/docx/sample_docx_with_single_link.docx b/test/test_files/docx/sample_docx_with_single_link.docx new file mode 100644 index 000000000..97f37af81 Binary files /dev/null and b/test/test_files/docx/sample_docx_with_single_link.docx differ