import os import pytest from lxml import etree from unstructured.documents import html from unstructured.documents.base import Page from unstructured.documents.elements import ( Address, ListItem, NarrativeText, Text, Title, ) from unstructured.documents.html import ( HEADING_TAGS, LIST_ITEM_TAGS, TABLE_TAGS, TEXT_TAGS, HTMLDocument, HTMLNarrativeText, HTMLTitle, TagsMixin, ) TAGS = ( "