mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-12 00:23:35 +00:00

Page breaks are reliably indicated by `w:lastRenderedPageBreak` elements present in the document XML. Page breaks are NOT reliably indicated by "hard" page-breaks inserted by the author and when present are redundant to a `w:lastRenderedPageBreak` element so cause over-counting if used. Use rendered page-breaks only.
875 lines
37 KiB
Python
875 lines
37 KiB
Python
# pyright: reportPrivateUsage=false
|
|
|
|
from __future__ import annotations
|
|
|
|
import html
|
|
import io
|
|
import itertools
|
|
import os
|
|
import tempfile
|
|
from tempfile import SpooledTemporaryFile
|
|
from typing import (
|
|
IO,
|
|
Any,
|
|
Dict,
|
|
Iterator,
|
|
List,
|
|
Optional,
|
|
Tuple,
|
|
Type,
|
|
Union,
|
|
cast,
|
|
)
|
|
|
|
# -- CT_* stands for "complex-type", an XML element type in docx parlance --
|
|
import docx
|
|
from docx.document import Document
|
|
from docx.enum.section import WD_SECTION_START
|
|
from docx.oxml.table import CT_Tbl
|
|
from docx.oxml.text.paragraph import CT_P
|
|
from docx.section import Section, _Footer, _Header
|
|
from docx.table import Table as DocxTable
|
|
from docx.table import _Cell, _Row
|
|
from docx.text.hyperlink import Hyperlink
|
|
from docx.text.paragraph import Paragraph
|
|
from docx.text.run import Run
|
|
from tabulate import tabulate
|
|
from typing_extensions import TypeAlias
|
|
|
|
from unstructured.chunking.title import add_chunking_strategy
|
|
from unstructured.cleaners.core import clean_bullets
|
|
from unstructured.documents.elements import (
|
|
Address,
|
|
Element,
|
|
ElementMetadata,
|
|
EmailAddress,
|
|
Footer,
|
|
Header,
|
|
Link,
|
|
ListItem,
|
|
NarrativeText,
|
|
PageBreak,
|
|
Table,
|
|
Text,
|
|
Title,
|
|
process_metadata,
|
|
)
|
|
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
|
from unstructured.partition.common import (
|
|
exactly_one,
|
|
get_last_modified_date,
|
|
get_last_modified_date_from_file,
|
|
)
|
|
from unstructured.partition.lang import apply_lang_metadata
|
|
from unstructured.partition.text_type import (
|
|
is_bulleted_text,
|
|
is_email_address,
|
|
is_possible_narrative_text,
|
|
is_possible_title,
|
|
is_us_city_state_zip,
|
|
)
|
|
from unstructured.utils import dependency_exists, lazyproperty, requires_dependencies
|
|
|
|
if dependency_exists("pypandoc"):
|
|
import pypandoc
|
|
|
|
DETECTION_ORIGIN: str = "docx"
|
|
BlockElement: TypeAlias = Union[CT_P, CT_Tbl]
|
|
BlockItem: TypeAlias = Union[Paragraph, DocxTable]
|
|
|
|
|
|
@requires_dependencies("pypandoc")
def convert_and_partition_docx(
    source_format: str,
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    include_metadata: bool = True,
    infer_table_structure: bool = True,
    metadata_filename: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    languages: Optional[List[str]] = ["auto"],
    detect_language_per_element: bool = False,
) -> List[Element]:
    """Converts a document to DOCX and then partitions it using partition_docx.

    Works with any file format supported by pandoc.

    Parameters
    ----------
    source_format
        The format of the source document, e.g. odt
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    include_metadata
        Determines whether or not metadata is included in the metadata attribute on the elements in
        the output.
    infer_table_structure
        If True, any Table elements that are extracted will also have a metadata field
        named "text_as_html" where the table's text content is rendered into an html string.
        I.e., rows and cells are preserved.
        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    metadata_filename
        Passed through unchanged to `partition_docx()`.
    metadata_last_modified
        Passed through unchanged to `partition_docx()`.
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
        in either language.
    Additional Parameters:
        detect_language_per_element
            Detect language per element instead of at the document level.

    Raises
    ------
    ValueError
        When `filename` is provided but no such file exists on the filesystem.
    """
    exactly_one(filename=filename, file=file)

    def validate_filename(filename: str) -> str:
        """Return path to a file confirmed to exist on the filesystem."""
        if not os.path.exists(filename):
            raise ValueError(f"The file {filename} does not exist.")
        return filename

    def copy_to_tempfile(file: IO[bytes]) -> str:
        """Return path to temporary copy of file to be converted."""
        # -- `delete=False` so the copy survives the `with` block and pandoc can open it by
        # -- path; the caller is responsible for removing it afterward.
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            tmp.write(file.read())
            return tmp.name

    def extract_docx_filename(file_path: str) -> str:
        """Return a filename like "foo.docx" from a path like "a/b/foo.odt" """
        # -- a/b/foo.odt -> foo.odt --
        filename = os.path.basename(file_path)
        # -- foo.odt -> foo --
        root_name, _ = os.path.splitext(filename)
        # -- foo -> foo.docx --
        return f"{root_name}.docx"

    # -- a temporary copy is only created when the source arrives as a file-like object --
    source_is_temp_copy = not filename
    file_path = validate_filename(filename) if filename else copy_to_tempfile(cast(IO[bytes], file))

    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            docx_path = os.path.join(tmpdir, extract_docx_filename(file_path))
            pypandoc.convert_file(  # pyright: ignore
                file_path,
                "docx",
                format=source_format,
                outputfile=docx_path,
            )
            # -- partition while still inside the `with` block; the converted file is deleted
            # -- along with `tmpdir` on exit.
            elements = partition_docx(
                filename=docx_path,
                metadata_filename=metadata_filename,
                include_metadata=include_metadata,
                infer_table_structure=infer_table_structure,
                metadata_last_modified=metadata_last_modified,
                languages=languages,
                detect_language_per_element=detect_language_per_element,
            )
    finally:
        # -- don't leak the temp copy of the source, whether or not conversion succeeded.
        # -- Never remove a caller-provided file.
        if source_is_temp_copy:
            try:
                os.remove(file_path)
            except OSError:
                # -- best-effort cleanup; a failure to delete must not mask the real result
                # -- or the original exception.
                pass

    return elements
|
|
|
|
|
|
@process_metadata()
@add_metadata_with_filetype(FileType.DOCX)
@add_chunking_strategy()
def partition_docx(
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    metadata_filename: Optional[str] = None,
    include_page_breaks: bool = True,
    include_metadata: bool = True,  # used by decorator
    infer_table_structure: bool = True,
    metadata_last_modified: Optional[str] = None,
    chunking_strategy: Optional[str] = None,  # used by decorator
    languages: Optional[List[str]] = ["auto"],
    detect_language_per_element: bool = False,
    **kwargs: Any,  # used by decorator
) -> List[Element]:
    """Partition a Microsoft Word document in .docx format into its document elements.

    Exactly one of `filename` and `file` must be provided.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    metadata_filename
        The filename to use for the metadata. Relevant because partition_doc converts the document
        to .docx before partition. We want the original source filename in the metadata.
    include_page_breaks
        When True, a `PageBreak` element is emitted for each page-break detected.
    infer_table_structure
        If True, any Table elements that are extracted will also have a metadata field
        named "text_as_html" where the table's text content is rendered into an html string.
        I.e., rows and cells are preserved.
        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    metadata_last_modified
        The last modified date for the document.
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
        in either language.
    Additional Parameters:
        detect_language_per_element
            Detect language per element instead of at the document level.

    `include_metadata`, `chunking_strategy` and `**kwargs` are not used in the body; per the inline
    notes they are consumed by the decorators.
    """
    # -- the caller must specify the source exactly one way --
    exactly_one(filename=filename, file=file)

    raw_elements = _DocxPartitioner.iter_document_elements(
        filename=filename,
        file=file,
        metadata_filename=metadata_filename,
        include_page_breaks=include_page_breaks,
        infer_table_structure=infer_table_structure,
        metadata_last_modified=metadata_last_modified,
    )
    # -- language metadata is applied over the element stream, which is then materialized --
    return list(
        apply_lang_metadata(
            elements=raw_elements,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
        )
    )
|
|
|
|
|
|
class _DocxPartitioner:
|
|
"""Provides `.partition()` for MS-Word 2007+ (.docx) files."""
|
|
|
|
def __init__(
|
|
self,
|
|
# -- NOTE(scanny): default values here are unnecessary for production use because
|
|
# -- `.iter_document_elements()` is the only interface method and always calls with all
|
|
# -- args. However, providing defaults eases unit-testing and decouples unit-tests from
|
|
# -- future changes to args.
|
|
filename: Optional[str] = None,
|
|
file: Optional[IO[bytes]] = None,
|
|
metadata_filename: Optional[str] = None,
|
|
include_page_breaks: bool = True,
|
|
infer_table_structure: bool = True,
|
|
metadata_last_modified: Optional[str] = None,
|
|
) -> None:
|
|
self._filename = filename
|
|
self._file = file
|
|
self._metadata_filename = metadata_filename
|
|
self._include_page_breaks = include_page_breaks
|
|
self._infer_table_structure = infer_table_structure
|
|
self._metadata_last_modified = metadata_last_modified
|
|
self._page_counter: int = 1
|
|
|
|
@classmethod
def iter_document_elements(
    cls,
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    metadata_filename: Optional[str] = None,
    include_page_breaks: bool = True,
    infer_table_structure: bool = True,
    metadata_last_modified: Optional[str] = None,
) -> Iterator[Element]:
    """Generate the document elements of an MS Word (.docx format) document.

    Constructs a partitioner from the arguments and returns the element iterator suited to the
    document, depending on whether or not it contains sections.
    """
    partitioner = cls(
        filename=filename,
        file=file,
        metadata_filename=metadata_filename,
        include_page_breaks=include_page_breaks,
        infer_table_structure=infer_table_structure,
        metadata_last_modified=metadata_last_modified,
    )

    # NOTE(scanny): It's possible for a Word document to have no sections. In particular, a
    # Microsoft Teams chat transcript exported to DOCX contains no sections. Such a
    # "section-less" document has to be iterated differently and has no headers or footers and
    # therefore no page-size or margins.
    if partitioner._document_contains_sections:
        return partitioner._iter_document_elements()
    return partitioner._iter_sectionless_document_elements()
|
|
|
|
def _iter_document_elements(self) -> Iterator[Element]:
    """Generate each document-element in (docx) `document` in document order.

    For each section: any section-start page-break, then its headers, then its body content
    (paragraphs and tables), then its footers.
    """
    # -- The result is a single Element stream composed from sub-iterators with `yield from`;
    # -- each `yield from` adds the elements found by that method to the stream. This is the
    # -- generator equivalent of accumulating into a list with `elements.extend(...)`, without
    # -- building and discarding all the intermediate lists, and it leaves the result open to
    # -- further iterator operations (filter, chain, map, etc.). The "sub" methods like
    # -- `._iter_paragraph_elements()` follow the same pattern.
    for idx, section in enumerate(self._document.sections):
        yield from self._iter_section_page_breaks(idx, section)
        yield from self._iter_section_headers(section)

        for item in section.iter_inner_content():
            # -- Paragraph is the more common block-item so it is checked first --
            if isinstance(item, Paragraph):
                yield from self._iter_paragraph_elements(item)
                # -- a paragraph can contain a page-break --
                yield from self._iter_maybe_paragraph_page_breaks(item)
                continue
            # -- only Paragraph and Table so far, but other block-item types may come later,
            # -- so test explicitly rather than assume.
            if isinstance(item, DocxTable):  # pyright: ignore[reportUnnecessaryIsInstance]
                yield from self._iter_table_element(item)

        yield from self._iter_section_footers(section)
|
|
|
|
def _iter_sectionless_document_elements(self) -> Iterator[Element]:
    """Generate each document-element in a docx `document` that has no sections.

    Without sections the block-items are read directly from the document. No headers or footers
    are generated because those live in a section.
    """
    for item in self._document.iter_inner_content():
        if isinstance(item, Paragraph):
            yield from self._iter_paragraph_elements(item)
            # -- a paragraph can contain a page-break --
            yield from self._iter_maybe_paragraph_page_breaks(item)
            continue
        # -- can only be a Paragraph or Table so far but more types may come later --
        if isinstance(item, DocxTable):  # pyright: ignore[reportUnnecessaryIsInstance]
            yield from self._iter_table_element(item)
|
|
|
|
def _convert_table_to_html(self, table: DocxTable, is_nested: bool = False) -> str:
    """HTML string version of `table`.

    Example:

        <table>
        <tbody>
        <tr><th>item  </th><th style="text-align: right;">  qty</th></tr>
        <tr><td>spam  </td><td style="text-align: right;">   42</td></tr>
        <tr><td>eggs  </td><td style="text-align: right;">  451</td></tr>
        <tr><td>bacon </td><td style="text-align: right;">    0</td></tr>
        </tbody>
        </table>

    `is_nested` is set on the recursive call made for a table found inside a cell. A nested
    table is rendered without a header row. Callers can leave it at its default value.
    """

    def cell_html(cell: _Cell) -> str:
        """HTML for the content of `cell`, one newline-separated fragment per block-item."""
        fragments: List[str] = []
        for item in cell.iter_inner_content():
            if isinstance(item, Paragraph):
                # -- all docx content is ultimately in a paragraph; a nested table contributes
                # -- structure only
                fragments.append(html.escape(item.text))
            elif isinstance(item, DocxTable):  # pyright: ignore[reportUnnecessaryIsInstance]
                fragments.append(self._convert_table_to_html(item, is_nested=True))
        return "\n".join(fragments)

    rows = [[cell_html(cell) for cell in row.cells] for row in table.rows]

    return tabulate(
        rows,
        headers=[] if is_nested else "firstrow",
        # -- tabulate isn't really designed for recursive tables so HTML-escaping is done above,
        # -- on paragraph text only. `unsafehtml` disables tabulate's own escaping of cell
        # -- contents, which would otherwise escape the nested-table markup.
        tablefmt="unsafehtml",
    )
|
|
|
|
def _convert_table_to_plain_text(self, table: DocxTable) -> str:
    """Plain-text version of `table`.

    Each row appears on its own line. Cells in a column are aligned using spaces as padding:

        item     qty
        spam      42
        eggs     451
        bacon      0

    A table nested in a cell is rendered recursively and contributes its own plain text to that
    cell.
    """

    def cell_text(cell: _Cell) -> str:
        """Text of `cell`, one newline-separated fragment per block-item."""
        fragments: List[str] = []
        for item in cell.iter_inner_content():
            if isinstance(item, Paragraph):
                fragments.append(item.text)
            elif isinstance(item, DocxTable):  # pyright: ignore[reportUnnecessaryIsInstance]
                fragments.append(self._convert_table_to_plain_text(item))
        return "\n".join(fragments)

    rows = [[cell_text(cell) for cell in row.cells] for row in table.rows]
    return tabulate(rows, tablefmt="plain")
|
|
|
|
@lazyproperty
def _document(self) -> Document:
    """The python-docx `Document` object loaded from file or filename.

    A filename takes precedence; the file-like object is used only when no filename was given.
    """
    if self._filename is not None:
        return docx.Document(self._filename)

    source = self._file
    assert source is not None

    # -- a SpooledTemporaryFile is rewound and copied into an in-memory buffer before loading.
    # -- NOTE(review): presumably because python-docx needs a file API that
    # -- SpooledTemporaryFile doesn't fully provide -- confirm.
    if isinstance(source, SpooledTemporaryFile):
        source.seek(0)
        source = io.BytesIO(source.read())

    return docx.Document(source)
|
|
|
|
@lazyproperty
def _document_contains_pagebreaks(self) -> bool:
    """True when at least one rendered page-break is found in the document.

    Only `w:lastRenderedPageBreak` elements reliably indicate a page-break. These are reliably
    inserted by Microsoft Word, but probably don't appear in documents converted into .docx
    format from for example .odt format.
    """
    # NOTE(scanny) - w:lastRenderedPageBreak (lrpb) is run (w:r) inner content. `w:r` can
    # appear in a paragraph (w:p). w:r can also appear in a hyperlink (w:hyperlink), which
    # is w:p inner-content and both of these can occur inside a table-cell as well as the
    # document body. The expression below is the union of those four locations.
    xpath = (
        "./w:body/w:p/w:r/w:lastRenderedPageBreak"
        " | ./w:body/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak"
        " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:r/w:lastRenderedPageBreak"
        " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak"
    )
    rendered_page_breaks = self._document.element.xpath(xpath)

    return True if rendered_page_breaks else False
|
|
|
|
@lazyproperty
def _document_contains_sections(self) -> bool:
    """True when the document has one or more sections.

    A document produced by Word always has at least one, but a document that results from
    conversion or export may not. In particular, a Microsoft Teams chat-transcript export has
    no sections.
    """
    sections = self._document.sections
    return True if sections else False
|
|
|
|
def _increment_page_number(self) -> Iterator[PageBreak]:
|
|
"""Increment page-number by 1 and generate a PageBreak element if enabled."""
|
|
self._page_counter += 1
|
|
if self._include_page_breaks:
|
|
yield PageBreak("", detection_origin=DETECTION_ORIGIN)
|
|
|
|
def _is_list_item(self, paragraph: Paragraph) -> bool:
    """True when `paragraph` can be identified as a list-item.

    A paragraph qualifies when its text looks bulleted or, failing that, when its XML contains
    a `<w:numPr>` (numbering-properties) element.
    """
    # -- the text check comes first so the paragraph XML is only serialized when needed --
    return True if is_bulleted_text(paragraph.text) else "<w:numPr>" in paragraph._p.xml
|
|
|
|
def _iter_paragraph_elements(self, paragraph: Paragraph) -> Iterator[Element]:
|
|
"""Generate zero-or-one document element for `paragraph`.
|
|
|
|
In Word, an empty paragraph is commonly used for inter-paragraph spacing. An empty paragraph
|
|
does not contribute to the document-element stream and will not cause an element to be
|
|
emitted.
|
|
"""
|
|
text = paragraph.text
|
|
|
|
# -- blank paragraphs are commonly used for spacing between paragraphs and
|
|
# -- do not contribute to the document-element stream.
|
|
if not text.strip():
|
|
return
|
|
|
|
metadata = self._paragraph_metadata(paragraph)
|
|
|
|
# -- a list gets some special treatment --
|
|
if self._is_list_item(paragraph):
|
|
clean_text = clean_bullets(text).strip()
|
|
if clean_text:
|
|
yield ListItem(
|
|
text=clean_text,
|
|
metadata=metadata,
|
|
detection_origin=DETECTION_ORIGIN,
|
|
)
|
|
return
|
|
|
|
# -- determine element-type from an explicit Word paragraph-style if possible --
|
|
TextSubCls = self._style_based_element_type(paragraph)
|
|
if TextSubCls:
|
|
yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
|
|
return
|
|
|
|
# -- try to recognize the element type by parsing its text --
|
|
TextSubCls = self._parse_paragraph_text_for_element_type(paragraph)
|
|
if TextSubCls:
|
|
yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
|
|
return
|
|
|
|
# -- if all that fails we give it the default `Text` element-type --
|
|
yield Text(text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
|
|
|
|
def _iter_maybe_paragraph_page_breaks(self, paragraph: Paragraph) -> Iterator[PageBreak]:
|
|
"""Generate a `PageBreak` document element for each page-break in `paragraph`.
|
|
|
|
Checks for both "hard" page breaks (page breaks explicitly inserted by the user)
|
|
and "soft" page breaks, which are sometimes inserted by the MS Word renderer.
|
|
Note that soft page breaks aren't always present. Whether or not pages are
|
|
tracked may depend on your Word renderer.
|
|
"""
|
|
|
|
def has_page_break_implementation_we_have_so_far() -> bool:
|
|
"""Needs to become more sophisticated."""
|
|
page_break_indicators = [
|
|
["lastRenderedPageBreak"], # "Soft" page break inserted by renderer
|
|
]
|
|
for indicators in page_break_indicators:
|
|
if all(indicator in paragraph._p.xml for indicator in indicators):
|
|
return True
|
|
return False
|
|
|
|
if not has_page_break_implementation_we_have_so_far():
|
|
return
|
|
|
|
yield from self._increment_page_number()
|
|
|
|
def _iter_paragraph_emphasis(self, paragraph: Paragraph) -> Iterator[Dict[str, str]]:
|
|
"""Generate e.g. {"text": "MUST", "tag": "b"} for each emphasis in `paragraph`."""
|
|
for run in paragraph.runs:
|
|
text = run.text.strip() if run.text else ""
|
|
if not text:
|
|
continue
|
|
if run.bold:
|
|
yield {"text": text, "tag": "b"}
|
|
if run.italic:
|
|
yield {"text": text, "tag": "i"}
|
|
|
|
def _iter_section_footers(self, section: Section) -> Iterator[Footer]:
    """Generate any `Footer` elements defined for this section.

    A Word document has up to three header and footer definition pairs for each document
    section: primary, first-page, and even-page. The first-page pair applies only to the first
    page of the section (perhaps a title page or chapter start). The even-page pair is used in
    book-bound documents that have both recto and verso pages, where it is applied to the verso
    (even-numbered) pages. A page to which neither specialized footer applies uses the primary
    footer.

    A footer that is linked to the previous section, or whose text is empty, produces no
    element.
    """

    def applicable_footers() -> Iterator[Tuple[_Footer, str]]:
        """Generate (footer, header_footer_type) for each footer in effect for `section`."""
        yield section.footer, "primary"
        if section.different_first_page_header_footer:
            yield section.first_page_footer, "first_page"
        if self._document.settings.odd_and_even_pages_header_footer:
            yield section.even_page_footer, "even_page"

    for footer, header_footer_type in applicable_footers():
        if footer.is_linked_to_previous:
            continue
        text = "\n".join(p.text for p in footer.paragraphs)
        if not text:
            continue
        yield Footer(
            text=text,
            detection_origin=DETECTION_ORIGIN,
            metadata=ElementMetadata(
                filename=self._metadata_filename,
                header_footer_type=header_footer_type,
                category_depth=0,
            ),
        )
|
|
|
|
def _iter_section_headers(self, section: Section) -> Iterator[Header]:
    """Generate `Header` elements for this section if it has them.

    The primary header is always considered. The first-page header is considered only when the
    section uses a different first page, and the even-page header only when the document
    settings enable distinct odd and even pages. A header that is linked to the previous
    section, or whose text is empty, produces no element.
    """

    def applicable_headers() -> Iterator[Tuple[_Header, str]]:
        """Generate (header, header_footer_type) for each header in effect for `section`."""
        yield section.header, "primary"
        if section.different_first_page_header_footer:
            yield section.first_page_header, "first_page"
        if self._document.settings.odd_and_even_pages_header_footer:
            yield section.even_page_header, "even_page"

    for header, header_footer_type in applicable_headers():
        if header.is_linked_to_previous:
            continue
        text = "\n".join(p.text for p in header.paragraphs)
        if not text:
            continue
        yield Header(
            text=text,
            detection_origin=DETECTION_ORIGIN,
            metadata=ElementMetadata(
                filename=self._metadata_filename,
                header_footer_type=header_footer_type,
                category_depth=0,  # -- headers are always at the root level --
            ),
        )
|
|
|
|
def _iter_section_page_breaks(self, section_idx: int, section: Section) -> Iterator[PageBreak]:
    """Generate zero-or-one `PageBreak` document elements for `section`.

    A docx section has a "start" type which can be "continuous" (no page-break), "nextPage",
    "evenPage", or "oddPage". For the next, even, and odd varieties, a
    `w:lastRenderedPageBreak` element signals one page break. The only case handled here is
    the one where a second page-break is required, for example going from one odd page to the
    next odd page, which takes two page-breaks in total.
    """
    # -- This method runs on entering a new section, before any paragraph in that section is
    # -- partitioned. The rendered page-break caused by the section-start appears in the first
    # -- paragraph of the section, so it is encountered _later_ in the process. Here we predict
    # -- when two page-breaks are needed and emit the extra one; the other is emitted when the
    # -- rendered page-break is reached.
    start_type = section.start_type
    on_odd_page = self._page_counter % 2 == 1

    if start_type == WD_SECTION_START.EVEN_PAGE:  # noqa
        # -- already on an even page: two breaks are needed to reach the next even page. No
        # -- special case for the first page of the document because page 1 is odd.
        needs_extra_break = not on_odd_page
    elif start_type == WD_SECTION_START.ODD_PAGE:
        # -- already on an odd page: two breaks are needed to reach the next odd page, except
        # -- that the first page of the document is an implicit "new" odd page.
        needs_extra_break = section_idx != 0 and on_odd_page
    else:
        # -- "continuous", "new-column", and "next-page" need no help from us --
        needs_extra_break = False

    if needs_extra_break:
        yield from self._increment_page_number()
|
|
|
|
def _iter_table_element(self, table: DocxTable) -> Iterator[Table]:
    """Generate zero-or-one Table element for a DOCX `w:tbl` XML element.

    The element text is the plain-text rendering of the table. The HTML rendering is included
    in the metadata only when table-structure inference is enabled.
    """
    # -- at present, exactly one Table element is always generated, but we might want to skip,
    # -- for example, an empty table.
    html_table = None
    if self._infer_table_structure:
        html_table = self._convert_table_to_html(table)
    text_table = self._convert_table_to_plain_text(table)
    emphasized_text_contents, emphasized_text_tags = self._table_emphasis(table)

    # -- empty emphasis lists are recorded as None rather than [] --
    metadata = ElementMetadata(
        text_as_html=html_table,
        filename=self._metadata_filename,
        page_number=self._page_number,
        last_modified=self._last_modified,
        emphasized_text_contents=emphasized_text_contents or None,
        emphasized_text_tags=emphasized_text_tags or None,
    )

    yield Table(text_table, detection_origin=DETECTION_ORIGIN, metadata=metadata)
|
|
|
|
def _iter_table_emphasis(self, table: DocxTable) -> Iterator[Dict[str, str]]:
|
|
"""Generate e.g. {"text": "word", "tag": "b"} for each emphasis in `table`."""
|
|
for row in table.rows:
|
|
for cell in row.cells:
|
|
for paragraph in cell.paragraphs:
|
|
yield from self._iter_paragraph_emphasis(paragraph)
|
|
|
|
@lazyproperty
def _last_modified(self) -> Optional[str]:
    """Last-modified date suitable for use in element metadata.

    An explicitly provided `metadata_last_modified` always wins. Otherwise the date is taken
    from the filesystem for a file-path source, or from the file-like object. `None` is
    returned for a file located in a temporary directory.
    """
    # -- if this file was converted from another format, any last-modified date for the file
    # -- will be today, so we get it from the conversion step in `._metadata_last_modified`.
    if self._metadata_last_modified:
        return self._metadata_last_modified

    file_path, file = self._filename, self._file

    # -- if the file is on the filesystem, get its date from there --
    if file_path is not None:
        # -- A file in a temp directory is taken to be a conversion artifact whose date is
        # -- meaningless. "/tmp" alone misses the platform temp directory on systems where
        # -- that is located elsewhere (or where TMPDIR is set), so check that location too.
        # -- "/tmp" is retained so existing behavior is unchanged.
        temp_dir_prefixes = ("/tmp", tempfile.gettempdir())
        if file_path.startswith(temp_dir_prefixes):
            return None
        return get_last_modified_date(file_path)

    # -- otherwise try getting it from the file-like object (unlikely since BytesIO and its
    # -- brethren have no such metadata).
    assert file is not None
    return get_last_modified_date_from_file(file)
|
|
|
|
@property
def _page_number(self) -> Optional[int]:
    """The current page number, or None if we can't really tell.

    No page number is reported when no page-breaks can be found in the document (which may be
    a common case), so in that case page numbers are left out of element metadata.

    In the DOCX format, determining page numbers is strictly a best-efforts attempt since actual
    page-breaks are determined at rendering time (e.g. printing) based on the font metrics of
    the target device. Explicit (hard) page-breaks are always recorded in the docx file but the
    rendered page-breaks are only added optionally.
    """
    if not self._document_contains_pagebreaks:
        return None
    return self._page_counter
|
|
|
|
def _paragraph_emphasis(self, paragraph: Paragraph) -> Tuple[List[str], List[str]]:
|
|
"""[contents, tags] pair describing emphasized text in `paragraph`."""
|
|
iter_p_emph, iter_p_emph_2 = itertools.tee(self._iter_paragraph_emphasis(paragraph))
|
|
return ([e["text"] for e in iter_p_emph], [e["tag"] for e in iter_p_emph_2])
|
|
|
|
def _paragraph_link_meta(self, paragraph: Paragraph) -> Tuple[List[str], List[str], List[Link]]:
|
|
"""Describes hyperlinks in `paragraph`, if any."""
|
|
if not paragraph.hyperlinks:
|
|
return [], [], []
|
|
|
|
def iter_paragraph_links() -> Iterator[Link]:
|
|
"""Generate `Link` typed-dict for each external link in `paragraph`.
|
|
|
|
Word uses hyperlinks for internal "jumps" within the document, as well as for web and
|
|
other external locations. Only generate the external ones.
|
|
"""
|
|
offset = 0
|
|
for item in paragraph.iter_inner_content():
|
|
if isinstance(item, Run):
|
|
offset += len(item.text)
|
|
elif isinstance(item, Hyperlink): # pyright: ignore[reportUnnecessaryIsInstance]
|
|
text = item.text
|
|
url = item.url
|
|
start_index = offset
|
|
offset += len(text)
|
|
# -- docx hyperlinks include "internal" links, like a table-of-contents
|
|
# -- (TOC) entry has a jump to the named heading in the document (e.g.
|
|
# -- '#_Toc147925734'. Such links have a fragment but not an address
|
|
# -- (URL). Treat those as regular text.
|
|
if not url:
|
|
continue
|
|
# -- all Word hyperlinks should contain text, otherwise they have no
|
|
# -- visual appearance on the document. Not expected, but technically possible
|
|
# -- so filter these out too.
|
|
if not text:
|
|
continue
|
|
yield Link(text=text, url=url, start_index=start_index)
|
|
|
|
links = list(iter_paragraph_links())
|
|
# -- link["text"] is allowed to be None by the declared type for `Link`, but never will be
|
|
# -- here because such a link is filtered out above. Use empty str to satisfy type-checker.
|
|
link_texts = [link["text"] or "" for link in links]
|
|
link_urls = [link["url"] for link in links]
|
|
return link_texts, link_urls, links
|
|
|
|
def _paragraph_metadata(self, paragraph: Paragraph) -> ElementMetadata:
    """ElementMetadata object describing `paragraph`.

    Includes category depth, emphasized text, and hyperlink details for the paragraph along
    with the document-level filename, last-modified date, and current page number. Empty
    emphasis and link lists are recorded as `None`.
    """
    category_depth = self._parse_category_depth_by_style(paragraph)
    emphasized_text_contents, emphasized_text_tags = self._paragraph_emphasis(paragraph)
    link_texts, link_urls, links = self._paragraph_link_meta(paragraph)
    element_metadata = ElementMetadata(
        category_depth=category_depth,
        emphasized_text_contents=emphasized_text_contents or None,
        emphasized_text_tags=emphasized_text_tags or None,
        filename=self._metadata_filename,
        last_modified=self._last_modified,
        link_texts=link_texts or None,
        link_urls=link_urls or None,
        links=links or None,
        page_number=self._page_number,
    )
    # -- use the module constant rather than repeating the literal, so the origin recorded here
    # -- can't drift from the one recorded on the elements themselves.
    element_metadata.detection_origin = DETECTION_ORIGIN
    return element_metadata
|
|
|
|
def _parse_paragraph_text_for_element_type(self, paragraph: Paragraph) -> Optional[Type[Text]]:
|
|
"""Attempt to differentiate the element-type by inspecting the raw text."""
|
|
text = paragraph.text.strip()
|
|
|
|
if len(text) < 2:
|
|
return None
|
|
if is_us_city_state_zip(text):
|
|
return Address
|
|
if is_email_address(text):
|
|
return EmailAddress
|
|
if is_possible_narrative_text(text):
|
|
return NarrativeText
|
|
if is_possible_title(text):
|
|
return Title
|
|
|
|
return None
|
|
|
|
def _style_based_element_type(self, paragraph: Paragraph) -> Optional[Type[Text]]:
    """Element-type for `paragraph` based on its paragraph-style.

    Returns `None` when the style doesn't tell us anything useful, which includes the default
    "Normal" style and any style name not listed below.
    """
    # NOTE(robinson) - documentation on built-in styles at the link below:
    # https://python-docx.readthedocs.io/en/latest/user/styles-understanding.html \
    # #paragraph-styles-in-default-template
    title_styles = (
        "Heading 1",
        "Heading 2",
        "Heading 3",
        "Heading 4",
        "Heading 5",
        "Heading 6",
        "Heading 7",
        "Heading 8",
        "Heading 9",
        "Subtitle",
        "TOCHeading",
        "Title",
    )
    list_item_styles = (
        "List",
        "List 2",
        "List 3",
        "List Bullet",
        "List Bullet 2",
        "List Bullet 3",
        "List Continue",
        "List Continue 2",
        "List Continue 3",
        "List Number",
        "List Number 2",
        "List Number 3",
        "List Paragraph",
    )
    text_styles = (
        "Caption",  # TODO(robinson) - add caption element type
        "Intense Quote",  # TODO(robinson) - add quote element type
        "Macro Text",
        "No Spacing",
        "Quote",  # TODO(robinson) - add quote element type
    )

    # -- paragraph.style can be None in rare cases, so can style.name. That's going
    # -- to mean default style which is equivalent to "Normal" for our purposes.
    style_name = (paragraph.style and paragraph.style.name) or "Normal"

    if style_name in title_styles:
        return Title
    if style_name in list_item_styles:
        return ListItem
    if style_name in text_styles:
        return Text

    # NOTE(robinson) - "Normal" and unknown style names fall through to here.
    return None
|
|
|
|
def _table_emphasis(self, table: DocxTable) -> Tuple[List[str], List[str]]:
|
|
"""[contents, tags] pair describing emphasized text in `table`."""
|
|
iter_tbl_emph, iter_tbl_emph_2 = itertools.tee(self._iter_table_emphasis(table))
|
|
return ([e["text"] for e in iter_tbl_emph], [e["tag"] for e in iter_tbl_emph_2])
|
|
|
|
def _parse_category_depth_by_style(self, paragraph: Paragraph) -> int:
|
|
"""Determine category depth from paragraph metadata"""
|
|
|
|
# Determine category depth from paragraph ilvl xpath
|
|
xpath = paragraph._element.xpath("./w:pPr/w:numPr/w:ilvl/@w:val")
|
|
if xpath:
|
|
return int(xpath[0])
|
|
|
|
# Determine category depth from style name
|
|
style_name = (paragraph.style and paragraph.style.name) or "Normal"
|
|
depth = self._parse_category_depth_by_style_name(style_name)
|
|
|
|
if depth > 0:
|
|
return depth
|
|
else:
|
|
# Check if category depth can be determined from style ilvl
|
|
return self._parse_category_depth_by_style_ilvl()
|
|
|
|
def _parse_category_depth_by_style_name(self, style_name: str) -> int:
|
|
"""Parse category-depth from the style-name of `paragraph`.
|
|
|
|
Category depth is 0-indexed and relative to the other element types in the document.
|
|
"""
|
|
|
|
def _extract_number(suffix: str) -> int:
|
|
return int(suffix.split()[-1]) - 1 if suffix.split()[-1].isdigit() else 0
|
|
|
|
# Heading styles
|
|
if style_name.startswith("Heading"):
|
|
return _extract_number(style_name)
|
|
|
|
if style_name == "Subtitle":
|
|
return 1
|
|
|
|
# List styles
|
|
list_prefixes = ["List", "List Bullet", "List Continue", "List Number"]
|
|
if any(style_name.startswith(prefix) for prefix in list_prefixes):
|
|
return _extract_number(style_name)
|
|
|
|
# Other styles
|
|
return 0
|
|
|
|
def _parse_category_depth_by_style_ilvl(self) -> int:
|
|
# TODO(newelh) Parsing category depth by style ilvl is not yet implemented
|
|
return 0
|