import os import pathlib import pytest from lxml import etree from unstructured.documents import html from unstructured.documents.base import Page from unstructured.documents.elements import ( Address, ListItem, NarrativeText, Table, Text, Title, ) from unstructured.documents.html import ( HEADING_TAGS, LIST_ITEM_TAGS, SECTION_TAGS, TABLE_TAGS, TEXT_TAGS, HTMLDocument, HTMLNarrativeText, HTMLTitle, TagsMixin, ) DIRECTORY = pathlib.Path(__file__).parent.resolve() TAGS = ( "