Steve Canny d06bcc41bb
fix(docx): improve page-break detection (#2036)
Page breaks are reliably indicated by `w:lastRenderedPageBreak` elements
present in the document XML. Page breaks are NOT reliably indicated by
"hard" page-breaks inserted by the author; when such hard breaks are present
they are redundant with a `w:lastRenderedPageBreak` element, so counting
them as well would over-count page breaks.

Use rendered page-breaks only.
2023-11-09 20:34:30 +00:00

875 lines
37 KiB
Python

# pyright: reportPrivateUsage=false
from __future__ import annotations
import html
import io
import itertools
import os
import tempfile
from tempfile import SpooledTemporaryFile
from typing import (
IO,
Any,
Dict,
Iterator,
List,
Optional,
Tuple,
Type,
Union,
cast,
)
# -- CT_* stands for "complex-type", an XML element type in docx parlance --
import docx
from docx.document import Document
from docx.enum.section import WD_SECTION_START
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.section import Section, _Footer, _Header
from docx.table import Table as DocxTable
from docx.table import _Cell, _Row
from docx.text.hyperlink import Hyperlink
from docx.text.paragraph import Paragraph
from docx.text.run import Run
from tabulate import tabulate
from typing_extensions import TypeAlias
from unstructured.chunking.title import add_chunking_strategy
from unstructured.cleaners.core import clean_bullets
from unstructured.documents.elements import (
Address,
Element,
ElementMetadata,
EmailAddress,
Footer,
Header,
Link,
ListItem,
NarrativeText,
PageBreak,
Table,
Text,
Title,
process_metadata,
)
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import (
exactly_one,
get_last_modified_date,
get_last_modified_date_from_file,
)
from unstructured.partition.lang import apply_lang_metadata
from unstructured.partition.text_type import (
is_bulleted_text,
is_email_address,
is_possible_narrative_text,
is_possible_title,
is_us_city_state_zip,
)
from unstructured.utils import dependency_exists, lazyproperty, requires_dependencies
if dependency_exists("pypandoc"):
import pypandoc
# -- identifies this partitioner as the origin of detected elements in element metadata --
DETECTION_ORIGIN: str = "docx"
# -- the block-level XML element types (CT_* = "complex type") that can appear in a body --
BlockElement: TypeAlias = Union[CT_P, CT_Tbl]
# -- the python-docx proxy objects that wrap those block-level XML elements --
BlockItem: TypeAlias = Union[Paragraph, DocxTable]
@requires_dependencies("pypandoc")
def convert_and_partition_docx(
    source_format: str,
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    include_metadata: bool = True,
    infer_table_structure: bool = True,
    metadata_filename: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    # NOTE(review): mutable default is shared across calls; safe only because it is
    # never mutated. Kept as-is for interface compatibility.
    languages: Optional[List[str]] = ["auto"],
    detect_language_per_element: bool = False,
) -> List[Element]:
    """Converts a document to DOCX and then partitions it using partition_docx.

    Works with any file format supported by pandoc.

    Parameters
    ----------
    source_format
        The format of the source document, e.g. odt
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    include_metadata
        Determines whether or not metadata is included in the metadata attribute on the elements
        in the output.
    infer_table_structure
        If True, any Table elements that are extracted will also have a metadata field
        named "text_as_html" where the table's text content is rendered into an html string.
        I.e., rows and cells are preserved.
        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
        in either language.
    Additional Parameters:
        detect_language_per_element
            Detect language per element instead of at the document level.
    """
    # -- exactly one of `filename` or `file` must be provided --
    exactly_one(filename=filename, file=file)

    def validate_filename(filename: str) -> str:
        """Return path to a file confirmed to exist on the filesystem."""
        if not os.path.exists(filename):
            # -- interpolate the offending path so the error message is actionable --
            raise ValueError(f"The file {filename} does not exist.")
        return filename

    def copy_to_tempfile(file: IO[bytes]) -> str:
        """Return path to temporary copy of file to be converted."""
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            tmp.write(file.read())
            return tmp.name

    def extract_docx_filename(file_path: str) -> str:
        """Return a filename like "foo.docx" from a path like "a/b/foo.odt" """
        # -- a/b/foo.odt -> foo.odt --
        filename = os.path.basename(file_path)
        # -- foo.odt -> foo --
        root_name, _ = os.path.splitext(filename)
        # -- foo -> foo.docx --
        return f"{root_name}.docx"

    file_path = validate_filename(filename) if filename else copy_to_tempfile(cast(IO[bytes], file))

    with tempfile.TemporaryDirectory() as tmpdir:
        docx_path = os.path.join(tmpdir, extract_docx_filename(file_path))
        pypandoc.convert_file(  # pyright: ignore
            file_path,
            "docx",
            format=source_format,
            outputfile=docx_path,
        )
        elements = partition_docx(
            filename=docx_path,
            metadata_filename=metadata_filename,
            include_metadata=include_metadata,
            infer_table_structure=infer_table_structure,
            metadata_last_modified=metadata_last_modified,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
        )

    return elements
@process_metadata()
@add_metadata_with_filetype(FileType.DOCX)
@add_chunking_strategy()
def partition_docx(
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    metadata_filename: Optional[str] = None,
    include_page_breaks: bool = True,
    include_metadata: bool = True,  # used by decorator
    infer_table_structure: bool = True,
    metadata_last_modified: Optional[str] = None,
    chunking_strategy: Optional[str] = None,  # used by decorator
    languages: Optional[List[str]] = ["auto"],
    detect_language_per_element: bool = False,
    **kwargs: Any,  # used by decorator
) -> List[Element]:
    """Partitions Microsoft Word Documents in .docx format into its document elements.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    infer_table_structure
        If True, any Table elements that are extracted will also have a metadata field
        named "text_as_html" where the table's text content is rendered into an html string.
        I.e., rows and cells are preserved.
        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    metadata_filename
        The filename to use for the metadata. Relevant because partition_doc converts the
        document to .docx before partition. We want the original source filename in the
        metadata.
    metadata_last_modified
        The last modified date for the document.
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
        in either language.
    Additional Parameters:
        detect_language_per_element
            Detect language per element instead of at the document level.
    """
    # -- exactly one of `filename` and `file` may be specified --
    exactly_one(filename=filename, file=file)

    # -- partition lazily, then layer language metadata onto the element stream --
    element_stream = _DocxPartitioner.iter_document_elements(
        filename,
        file,
        metadata_filename,
        include_page_breaks,
        infer_table_structure,
        metadata_last_modified,
    )
    return list(
        apply_lang_metadata(
            elements=element_stream,
            languages=languages,
            detect_language_per_element=detect_language_per_element,
        )
    )
class _DocxPartitioner:
    """Provides `.partition()` for MS-Word 2007+ (.docx) files."""

    def __init__(
        self,
        # -- NOTE(scanny): default values here are unnecessary for production use because
        # -- `.iter_document_elements()` is the only interface method and always calls with all
        # -- args. However, providing defaults eases unit-testing and decouples unit-tests from
        # -- future changes to args.
        filename: Optional[str] = None,
        file: Optional[IO[bytes]] = None,
        metadata_filename: Optional[str] = None,
        include_page_breaks: bool = True,
        infer_table_structure: bool = True,
        metadata_last_modified: Optional[str] = None,
    ) -> None:
        self._filename = filename
        self._file = file
        self._metadata_filename = metadata_filename
        self._include_page_breaks = include_page_breaks
        self._infer_table_structure = infer_table_structure
        self._metadata_last_modified = metadata_last_modified
        # -- page numbering starts at 1; advanced by `._increment_page_number()` as
        # -- page-breaks are detected during partitioning --
        self._page_counter: int = 1
@classmethod
def iter_document_elements(
cls,
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,
metadata_filename: Optional[str] = None,
include_page_breaks: bool = True,
infer_table_structure: bool = True,
metadata_last_modified: Optional[str] = None,
) -> Iterator[Element]:
"""Partition MS Word documents (.docx format) into its document elements."""
self = cls(
filename=filename,
file=file,
metadata_filename=metadata_filename,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
metadata_last_modified=metadata_last_modified,
)
# NOTE(scanny): It's possible for a Word document to have no sections. In particular, a
# Microsoft Teams chat transcript exported to DOCX contains no sections. Such a
# "section-less" document has to be interated differently and has no headers or footers and
# therefore no page-size or margins.
return (
self._iter_document_elements()
if self._document_contains_sections
else self._iter_sectionless_document_elements()
)
    def _iter_document_elements(self) -> Iterator[Element]:
        """Generate each document-element in (docx) `document` in document order."""
        # -- This implementation composes a collection of iterators into a "combined" iterator
        # -- return value using `yield from`. You can think of the return value as an Element
        # -- stream and each `yield from` as "add elements found by this function to the stream".
        # -- This is functionally analogous to declaring `elements: List[Element] = []` at the top
        # -- and using `elements.extend()` for the results of each of the function calls, but is
        # -- more performant, uses less memory (avoids producing and then garbage-collecting all
        # -- those small lists), is more flexible for later iterator operations like filter,
        # -- chain, map, etc. and is perhaps more elegant and simpler to read once you have the
        # -- concept of what it's doing. You can see the same pattern repeating in the "sub"
        # -- functions like `._iter_paragraph_elements()` where the "just return when done"
        # -- characteristic of a generator avoids repeated code to form interim results into lists.
        for section_idx, section in enumerate(self._document.sections):
            # -- a section start can imply an extra page-break; emit it before the content --
            yield from self._iter_section_page_breaks(section_idx, section)
            yield from self._iter_section_headers(section)
            for block_item in section.iter_inner_content():
                # -- a block-item can be a Paragraph or a Table, maybe others later so elif here.
                # -- Paragraph is more common so check that first.
                if isinstance(block_item, Paragraph):
                    yield from self._iter_paragraph_elements(block_item)
                    # -- a paragraph can contain a page-break --
                    yield from self._iter_maybe_paragraph_page_breaks(block_item)
                elif isinstance(  # pyright: ignore[reportUnnecessaryIsInstance]
                    block_item, DocxTable
                ):
                    yield from self._iter_table_element(block_item)
            yield from self._iter_section_footers(section)
def _iter_sectionless_document_elements(self) -> Iterator[Element]:
"""Generate each document-element in a docx `document` that has no sections.
A "section-less" DOCX must be iterated differently. Also it will have no headers or footers
(because those live in a section).
"""
for block_item in self._document.iter_inner_content():
if isinstance(block_item, Paragraph):
yield from self._iter_paragraph_elements(block_item)
# -- a paragraph can contain a page-break --
yield from self._iter_maybe_paragraph_page_breaks(block_item)
# -- can only be a Paragraph or Table so far but more types may come later --
elif isinstance(block_item, DocxTable): # pyright: ignore[reportUnnecessaryIsInstance]
yield from self._iter_table_element(block_item)
def _convert_table_to_html(self, table: DocxTable, is_nested: bool = False) -> str:
"""HTML string version of `table`.
Example:
<table>
<tbody>
<tr><th>item </th><th style="text-align: right;"> qty</th></tr>
<tr><td>spam </td><td style="text-align: right;"> 42</td></tr>
<tr><td>eggs </td><td style="text-align: right;"> 451</td></tr>
<tr><td>bacon </td><td style="text-align: right;"> 0</td></tr>
</tbody>
</table>
`is_nested` is used for recursive calls when a nested table is encountered. Certain
behaviors are different in that case, but the caller can safely ignore that parameter and
allow it to take its default value.
"""
def iter_cell_block_items(cell: _Cell) -> Iterator[str]:
for block_item in cell.iter_inner_content():
if isinstance(block_item, Paragraph):
# -- all docx content is ultimately in a paragraph; a nested table contributes
# -- structure only
yield f"{html.escape(block_item.text)}"
elif isinstance( # pyright: ignore[reportUnnecessaryIsInstance]
block_item, DocxTable
):
yield self._convert_table_to_html(block_item, is_nested=True)
def iter_cells(row: _Row) -> Iterator[str]:
return ("\n".join(iter_cell_block_items(cell)) for cell in row.cells)
return tabulate(
[list(iter_cells(row)) for row in table.rows],
headers=[] if is_nested else "firstrow",
# -- tabulate isn't really designed for recursive tables so we have to do any
# -- HTML-escaping for ourselves. `unsafehtml` disables tabulate html-escaping of cell
# -- contents.
tablefmt="unsafehtml",
)
def _convert_table_to_plain_text(self, table: DocxTable) -> str:
"""Plain-text version of `table`.
Each row appears on its own line. Cells in a column are aligned using spaces as padding:
item qty
spam 42
eggs 451
bacon 0
"""
def iter_cell_block_items(cell: _Cell) -> Iterator[str]:
for block_item in cell.iter_inner_content():
if isinstance(block_item, Paragraph):
yield block_item.text
elif isinstance( # pyright: ignore[reportUnnecessaryIsInstance]
block_item, DocxTable
):
yield self._convert_table_to_plain_text(block_item)
def iter_cells(row: _Row) -> Iterator[str]:
return ("\n".join(iter_cell_block_items(cell)) for cell in row.cells)
return tabulate([list(iter_cells(row)) for row in table.rows], tablefmt="plain")
@lazyproperty
def _document(self) -> Document:
"""The python-docx `Document` object loaded from file or filename."""
filename, file = self._filename, self._file
if filename is not None:
return docx.Document(filename)
assert file is not None
if isinstance(file, SpooledTemporaryFile):
file.seek(0)
file = io.BytesIO(file.read())
return docx.Document(file)
    @lazyproperty
    def _document_contains_pagebreaks(self) -> bool:
        """True when there is at least one page-break detected in the document.

        Only `w:lastRenderedPageBreak` elements reliably indicate a page-break. These are
        reliably inserted by Microsoft Word, but probably don't appear in documents converted
        into .docx format from, for example, .odt format.
        """
        xpath = (
            # NOTE(scanny) - w:lastRenderedPageBreak (lrpb) is run (w:r) inner content. `w:r` can
            # appear in a paragraph (w:p). w:r can also appear in a hyperlink (w:hyperlink), which
            # is w:p inner-content and both of these can occur inside a table-cell as well as the
            # document body
            "./w:body/w:p/w:r/w:lastRenderedPageBreak"
            " | ./w:body/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak"
            " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:r/w:lastRenderedPageBreak"
            " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak"
        )
        # -- any single match is enough; we only need a boolean --
        return bool(self._document.element.xpath(xpath))
@lazyproperty
def _document_contains_sections(self) -> bool:
"""True when there is at least one section in the document.
This is always true for a document produced by Word, but may not always be the case when the
document results from conversion or export. In particular, a Microsoft Teams chat-transcript
export will have no sections.
"""
return bool(self._document.sections)
def _increment_page_number(self) -> Iterator[PageBreak]:
"""Increment page-number by 1 and generate a PageBreak element if enabled."""
self._page_counter += 1
if self._include_page_breaks:
yield PageBreak("", detection_origin=DETECTION_ORIGIN)
def _is_list_item(self, paragraph: Paragraph) -> bool:
"""True when `paragraph` can be identified as a list-item."""
if is_bulleted_text(paragraph.text):
return True
return "<w:numPr>" in paragraph._p.xml
def _iter_paragraph_elements(self, paragraph: Paragraph) -> Iterator[Element]:
"""Generate zero-or-one document element for `paragraph`.
In Word, an empty paragraph is commonly used for inter-paragraph spacing. An empty paragraph
does not contribute to the document-element stream and will not cause an element to be
emitted.
"""
text = paragraph.text
# -- blank paragraphs are commonly used for spacing between paragraphs and
# -- do not contribute to the document-element stream.
if not text.strip():
return
metadata = self._paragraph_metadata(paragraph)
# -- a list gets some special treatment --
if self._is_list_item(paragraph):
clean_text = clean_bullets(text).strip()
if clean_text:
yield ListItem(
text=clean_text,
metadata=metadata,
detection_origin=DETECTION_ORIGIN,
)
return
# -- determine element-type from an explicit Word paragraph-style if possible --
TextSubCls = self._style_based_element_type(paragraph)
if TextSubCls:
yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
return
# -- try to recognize the element type by parsing its text --
TextSubCls = self._parse_paragraph_text_for_element_type(paragraph)
if TextSubCls:
yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
return
# -- if all that fails we give it the default `Text` element-type --
yield Text(text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
def _iter_maybe_paragraph_page_breaks(self, paragraph: Paragraph) -> Iterator[PageBreak]:
"""Generate a `PageBreak` document element for each page-break in `paragraph`.
Checks for both "hard" page breaks (page breaks explicitly inserted by the user)
and "soft" page breaks, which are sometimes inserted by the MS Word renderer.
Note that soft page breaks aren't always present. Whether or not pages are
tracked may depend on your Word renderer.
"""
def has_page_break_implementation_we_have_so_far() -> bool:
"""Needs to become more sophisticated."""
page_break_indicators = [
["lastRenderedPageBreak"], # "Soft" page break inserted by renderer
]
for indicators in page_break_indicators:
if all(indicator in paragraph._p.xml for indicator in indicators):
return True
return False
if not has_page_break_implementation_we_have_so_far():
return
yield from self._increment_page_number()
def _iter_paragraph_emphasis(self, paragraph: Paragraph) -> Iterator[Dict[str, str]]:
"""Generate e.g. {"text": "MUST", "tag": "b"} for each emphasis in `paragraph`."""
for run in paragraph.runs:
text = run.text.strip() if run.text else ""
if not text:
continue
if run.bold:
yield {"text": text, "tag": "b"}
if run.italic:
yield {"text": text, "tag": "i"}
    def _iter_section_footers(self, section: Section) -> Iterator[Footer]:
        """Generate any `Footer` elements defined for this section.

        A Word document has up to three header and footer definition pairs for each document
        section, a primary, first-page, and even-page header and footer. The first-page pair
        applies only to the first page of the section (perhaps a title page or chapter start).
        The even-page pair is used in book-bound documents where there are both recto and
        verso pages (it is applied to verso (even-numbered) pages). A page where neither more
        specialized footer applies uses the primary footer.
        """

        def iter_footer(footer: _Footer, header_footer_type: str) -> Iterator[Footer]:
            """Generate zero-or-one Footer elements for `footer`."""
            # -- a linked footer takes its content from the prior section; emitting it here
            # -- would duplicate that content --
            if footer.is_linked_to_previous:
                return
            text = "\n".join([p.text for p in footer.paragraphs])
            if not text:
                return
            yield Footer(
                text=text,
                detection_origin=DETECTION_ORIGIN,
                metadata=ElementMetadata(
                    filename=self._metadata_filename,
                    header_footer_type=header_footer_type,
                    category_depth=0,  # -- footers are always at the root level --
                ),
            )

        yield from iter_footer(section.footer, "primary")
        if section.different_first_page_header_footer:
            yield from iter_footer(section.first_page_footer, "first_page")
        if self._document.settings.odd_and_even_pages_header_footer:
            yield from iter_footer(section.even_page_footer, "even_page")
    def _iter_section_headers(self, section: Section) -> Iterator[Header]:
        """Generate `Header` elements for this section if it has them.

        See `._iter_section_footers()` docstring for more on docx headers and footers.
        """

        def iter_header(header: _Header, header_footer_type: str) -> Iterator[Header]:
            """Generate zero-or-one Header elements for `header`."""
            # -- a linked header takes its content from the prior section; emitting it here
            # -- would duplicate that content --
            if header.is_linked_to_previous:
                return
            text = "\n".join([p.text for p in header.paragraphs])
            if not text:
                return
            yield Header(
                text=text,
                detection_origin=DETECTION_ORIGIN,
                metadata=ElementMetadata(
                    filename=self._metadata_filename,
                    header_footer_type=header_footer_type,
                    category_depth=0,  # -- headers are always at the root level --
                ),
            )

        yield from iter_header(section.header, "primary")
        if section.different_first_page_header_footer:
            yield from iter_header(section.first_page_header, "first_page")
        if self._document.settings.odd_and_even_pages_header_footer:
            yield from iter_header(section.even_page_header, "even_page")
    def _iter_section_page_breaks(self, section_idx: int, section: Section) -> Iterator[PageBreak]:
        """Generate zero-or-one `PageBreak` document elements for `section`.

        A docx section has a "start" type which can be "continuous" (no page-break),
        "nextPage", "evenPage", or "oddPage". For the next, even, and odd varieties, a
        `w:lastRenderedPageBreak` element signals one page break. Here we only need to handle
        the case where we need to add another, for example to go from one odd page to another
        odd page and we need a total of two page-breaks.
        """

        def page_is_odd() -> bool:
            # -- `_page_counter` is the number of the page currently being partitioned --
            return self._page_counter % 2 == 1

        start_type = section.start_type

        # -- This method is called upon entering a new section, which happens before any
        # -- paragraphs in that section are partitioned. A rendered page-break due to a
        # -- section-start occurs in the first paragraph of the section and so occurs _later_
        # -- in the process. Here we predict when two page breaks will be needed and emit one
        # -- of them. The second will be emitted by the rendered page-break to follow.
        if start_type == WD_SECTION_START.EVEN_PAGE:  # noqa
            # -- on an even page we need two total, add one to supplement the rendered page
            # -- break to follow. There is no "first-document-page" special case because 1 is
            # -- odd.
            if not page_is_odd():
                yield from self._increment_page_number()

        elif start_type == WD_SECTION_START.ODD_PAGE:
            # -- the first page of the document is an implicit "new" odd-page, so no
            # -- page-break --
            if section_idx == 0:
                return
            if page_is_odd():
                yield from self._increment_page_number()

        # -- otherwise, start-type is one of "continuous", "new-column", or "next-page", none
        # -- of which need our help to get the page-breaks right.
        return
def _iter_table_element(self, table: DocxTable) -> Iterator[Table]:
"""Generate zero-or-one Table element for a DOCX `w:tbl` XML element."""
# -- at present, we always generate exactly one Table element, but we might want
# -- to skip, for example, an empty table.
html_table = self._convert_table_to_html(table) if self._infer_table_structure else None
text_table = self._convert_table_to_plain_text(table)
emphasized_text_contents, emphasized_text_tags = self._table_emphasis(table)
yield Table(
text_table,
detection_origin=DETECTION_ORIGIN,
metadata=ElementMetadata(
text_as_html=html_table,
filename=self._metadata_filename,
page_number=self._page_number,
last_modified=self._last_modified,
emphasized_text_contents=emphasized_text_contents or None,
emphasized_text_tags=emphasized_text_tags or None,
),
)
def _iter_table_emphasis(self, table: DocxTable) -> Iterator[Dict[str, str]]:
"""Generate e.g. {"text": "word", "tag": "b"} for each emphasis in `table`."""
for row in table.rows:
for cell in row.cells:
for paragraph in cell.paragraphs:
yield from self._iter_paragraph_emphasis(paragraph)
    @lazyproperty
    def _last_modified(self) -> Optional[str]:
        """Last-modified date suitable for use in element metadata."""
        # -- if this file was converted from another format, any last-modified date for the
        # -- file will be today, so we get it from the conversion step in
        # -- `._metadata_last_modified`.
        if self._metadata_last_modified:
            return self._metadata_last_modified
        file_path, file = self._filename, self._file
        # -- if the file is on the filesystem, get its date from there --
        if file_path is not None:
            # NOTE(review): a "/tmp" path-prefix appears to be treated as a temporary
            # conversion artifact whose date would be meaningless; this check is
            # POSIX-specific -- confirm intent.
            return None if file_path.startswith("/tmp") else get_last_modified_date(file_path)
        # -- otherwise try getting it from the file-like object (unlikely since BytesIO and
        # -- its brethren have no such metadata).
        assert file is not None
        return get_last_modified_date_from_file(file)
@property
def _page_number(self) -> Optional[int]:
"""The current page number, or None if we can't really tell.
Page numbers are not added to element metadata if we can't find any page-breaks in the
document (which may be a common case).
In the DOCX format, determining page numbers is strictly a best-efforts attempt since actual
page-breaks are determined at rendering time (e.g. printing) based on the fontmetrics of the
target device. Explicit (hard) page-breaks are always recorded in the docx file but the
rendered page-breaks are only added optionally.
"""
return self._page_counter if self._document_contains_pagebreaks else None
def _paragraph_emphasis(self, paragraph: Paragraph) -> Tuple[List[str], List[str]]:
"""[contents, tags] pair describing emphasized text in `paragraph`."""
iter_p_emph, iter_p_emph_2 = itertools.tee(self._iter_paragraph_emphasis(paragraph))
return ([e["text"] for e in iter_p_emph], [e["tag"] for e in iter_p_emph_2])
    def _paragraph_link_meta(self, paragraph: Paragraph) -> Tuple[List[str], List[str], List[Link]]:
        """Describes hyperlinks in `paragraph`, if any.

        Returns a ([link_texts], [link_urls], [links]) triple; all three lists are empty when
        the paragraph contains no external hyperlinks.
        """
        if not paragraph.hyperlinks:
            return [], [], []

        def iter_paragraph_links() -> Iterator[Link]:
            """Generate `Link` typed-dict for each external link in `paragraph`.

            Word uses hyperlinks for internal "jumps" within the document, as well as for web
            and other external locations. Only generate the external ones.
            """
            # -- `offset` tracks the character position within the paragraph text, advancing
            # -- across runs and hyperlinks in document order, so each link's `start_index`
            # -- locates it within the paragraph text --
            offset = 0
            for item in paragraph.iter_inner_content():
                if isinstance(item, Run):
                    offset += len(item.text)
                elif isinstance(item, Hyperlink):  # pyright: ignore[reportUnnecessaryIsInstance]
                    text = item.text
                    url = item.url
                    start_index = offset
                    offset += len(text)
                    # -- docx hyperlinks include "internal" links, like a table-of-contents
                    # -- (TOC) entry has a jump to the named heading in the document (e.g.
                    # -- '#_Toc147925734'. Such links have a fragment but not an address
                    # -- (URL). Treat those as regular text.
                    if not url:
                        continue
                    # -- all Word hyperlinks should contain text, otherwise they have no
                    # -- visual appearance on the document. Not expected, but technically
                    # -- possible so filter these out too.
                    if not text:
                        continue
                    yield Link(text=text, url=url, start_index=start_index)

        links = list(iter_paragraph_links())
        # -- link["text"] is allowed to be None by the declared type for `Link`, but never
        # -- will be here because such a link is filtered out above. Use empty str to satisfy
        # -- type-checker.
        link_texts = [link["text"] or "" for link in links]
        link_urls = [link["url"] for link in links]
        return link_texts, link_urls, links
def _paragraph_metadata(self, paragraph: Paragraph) -> ElementMetadata:
"""ElementMetadata object describing `paragraph`."""
category_depth = self._parse_category_depth_by_style(paragraph)
emphasized_text_contents, emphasized_text_tags = self._paragraph_emphasis(paragraph)
link_texts, link_urls, links = self._paragraph_link_meta(paragraph)
element_metadata = ElementMetadata(
category_depth=category_depth,
emphasized_text_contents=emphasized_text_contents or None,
emphasized_text_tags=emphasized_text_tags or None,
filename=self._metadata_filename,
last_modified=self._last_modified,
link_texts=link_texts or None,
link_urls=link_urls or None,
links=links or None,
page_number=self._page_number,
)
element_metadata.detection_origin = "docx"
return element_metadata
def _parse_paragraph_text_for_element_type(self, paragraph: Paragraph) -> Optional[Type[Text]]:
"""Attempt to differentiate the element-type by inspecting the raw text."""
text = paragraph.text.strip()
if len(text) < 2:
return None
if is_us_city_state_zip(text):
return Address
if is_email_address(text):
return EmailAddress
if is_possible_narrative_text(text):
return NarrativeText
if is_possible_title(text):
return Title
return None
    def _style_based_element_type(self, paragraph: Paragraph) -> Optional[Type[Text]]:
        """Element-type for `paragraph` based on its paragraph-style.

        Returns `None` when the style doesn't tell us anything useful, including when it
        is the default "Normal" style.
        """
        # NOTE(robinson) - documentation on built-in styles at the link below:
        # https://python-docx.readthedocs.io/en/latest/user/styles-understanding.html \
        # #paragraph-styles-in-default-template
        STYLE_TO_ELEMENT_MAPPING = {
            "Caption": Text,  # TODO(robinson) - add caption element type
            "Heading 1": Title,
            "Heading 2": Title,
            "Heading 3": Title,
            "Heading 4": Title,
            "Heading 5": Title,
            "Heading 6": Title,
            "Heading 7": Title,
            "Heading 8": Title,
            "Heading 9": Title,
            "Intense Quote": Text,  # TODO(robinson) - add quote element type
            "List": ListItem,
            "List 2": ListItem,
            "List 3": ListItem,
            "List Bullet": ListItem,
            "List Bullet 2": ListItem,
            "List Bullet 3": ListItem,
            "List Continue": ListItem,
            "List Continue 2": ListItem,
            "List Continue 3": ListItem,
            "List Number": ListItem,
            "List Number 2": ListItem,
            "List Number 3": ListItem,
            "List Paragraph": ListItem,
            "Macro Text": Text,
            "No Spacing": Text,
            "Quote": Text,  # TODO(robinson) - add quote element type
            "Subtitle": Title,
            "TOCHeading": Title,
            "Title": Title,
        }
        # -- paragraph.style can be None in rare cases, so can style.name. That's going
        # -- to mean default style which is equivalent to "Normal" for our purposes.
        style_name = (paragraph.style and paragraph.style.name) or "Normal"
        # NOTE(robinson) - The "Normal" style name will return None since it's not
        # in the mapping. Unknown style names will also return None.
        return STYLE_TO_ELEMENT_MAPPING.get(style_name)
def _table_emphasis(self, table: DocxTable) -> Tuple[List[str], List[str]]:
"""[contents, tags] pair describing emphasized text in `table`."""
iter_tbl_emph, iter_tbl_emph_2 = itertools.tee(self._iter_table_emphasis(table))
return ([e["text"] for e in iter_tbl_emph], [e["tag"] for e in iter_tbl_emph_2])
def _parse_category_depth_by_style(self, paragraph: Paragraph) -> int:
"""Determine category depth from paragraph metadata"""
# Determine category depth from paragraph ilvl xpath
xpath = paragraph._element.xpath("./w:pPr/w:numPr/w:ilvl/@w:val")
if xpath:
return int(xpath[0])
# Determine category depth from style name
style_name = (paragraph.style and paragraph.style.name) or "Normal"
depth = self._parse_category_depth_by_style_name(style_name)
if depth > 0:
return depth
else:
# Check if category depth can be determined from style ilvl
return self._parse_category_depth_by_style_ilvl()
def _parse_category_depth_by_style_name(self, style_name: str) -> int:
"""Parse category-depth from the style-name of `paragraph`.
Category depth is 0-indexed and relative to the other element types in the document.
"""
def _extract_number(suffix: str) -> int:
return int(suffix.split()[-1]) - 1 if suffix.split()[-1].isdigit() else 0
# Heading styles
if style_name.startswith("Heading"):
return _extract_number(style_name)
if style_name == "Subtitle":
return 1
# List styles
list_prefixes = ["List", "List Bullet", "List Continue", "List Number"]
if any(style_name.startswith(prefix) for prefix in list_prefixes):
return _extract_number(style_name)
# Other styles
return 0
    def _parse_category_depth_by_style_ilvl(self) -> int:
        """Category depth as determined from the style's indentation level (ilvl).

        Currently a stub that always returns 0.
        """
        # TODO(newelh) Parsing category depth by style ilvl is not yet implemented
        return 0