2024-11-15 15:30:48 +01:00
|
|
|
from collections import defaultdict
|
2024-11-20 14:01:28 +01:00
|
|
|
from typing import Type
|
2024-11-15 15:30:48 +01:00
|
|
|
|
|
|
|
from unstructured.documents import elements, ontology
|
|
|
|
from unstructured.documents.mappings import (
|
|
|
|
ALL_ONTOLOGY_ELEMENT_TYPES,
|
|
|
|
HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP,
|
|
|
|
ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE,
|
|
|
|
get_all_subclasses,
|
|
|
|
)
|
|
|
|
from unstructured.documents.ontology import OntologyElement
|
|
|
|
|
|
|
|
|
2024-11-20 14:01:28 +01:00
|
|
|
def test_if_all_html_tags_have_default_ontology_type():
    """Check that each HTML tag allowed by an ontology class has a valid default type.

    Every class in ALL_ONTOLOGY_ELEMENT_TYPES is instantiated and grouped under
    each tag listed in its ``allowed_tags``. Each collected tag must then be a
    key of HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP, and the mapped default must be
    either one of the classes that allow the tag or ``ontology.UncategorizedText``.
    """
    classes_by_tag: dict[str, list[Type[ontology.OntologyElement]]] = defaultdict(list)
    for candidate_class in ALL_ONTOLOGY_ELEMENT_TYPES:
        for allowed_tag in candidate_class().allowed_tags:
            classes_by_tag[allowed_tag].append(candidate_class)

    for tag, candidate_classes in classes_by_tag.items():
        assert tag in HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP

        default_type = HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP[tag]
        # In some cases it is better to use unknown type than assign incorrect type
        acceptable_types = [*candidate_classes, ontology.UncategorizedText]
        assert default_type in acceptable_types
|
2024-11-15 15:30:48 +01:00
|
|
|
|
|
|
|
|
|
|
|
def test_all_expected_ontology_types_are_subclasses_of_OntologyElement():
    """Check that every value of HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP subclasses OntologyElement."""
    default_types = HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP.values()
    for default_type in default_types:
        assert issubclass(default_type, OntologyElement)
|
|
|
|
|
|
|
|
|
|
|
|
def test_ontology_to_unstructured_mapping_has_valid_types():
    """Check both sides of ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE.

    Each value must be a subclass of ``elements.Element`` and each key must be
    a subclass of ``ontology.OntologyElement``.
    """
    class_pairs = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE.items()
    for ontology_cls, unstructured_cls in class_pairs:
        assert issubclass(unstructured_cls, elements.Element)
        assert issubclass(ontology_cls, ontology.OntologyElement)
|
|
|
|
|
|
|
|
|
|
|
|
def test_all_ontology_elements_are_defined_in_mapping_to_unstructured():
    """Check that the ontology-to-unstructured mapping covers every ontology class.

    Each class returned by ``get_all_subclasses(ontology.OntologyElement)`` must
    be a key of ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE.
    """
    mapping = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE
    for ontology_subclass in get_all_subclasses(ontology.OntologyElement):
        assert ontology_subclass in mapping
|