mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-11 16:07:37 +00:00
Fix extracting value from field (#3774)
This commit is contained in:
parent
66d1e5a5cb
commit
c2d17b1ca4
@@ -1,4 +1,4 @@
|
|||||||
## 0.16.5-dev0
|
## 0.16.5-dev1
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
|
|||||||
@@ -201,6 +201,10 @@ def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_p
|
|||||||
|
|
||||||
for i in range(len(expected_json_elements)):
|
for i in range(len(expected_json_elements)):
|
||||||
assert expected_json_elements[i] == predicted_elements[i]
|
assert expected_json_elements[i] == predicted_elements[i]
|
||||||
|
assert (
|
||||||
|
expected_json_elements[i].metadata.text_as_html
|
||||||
|
== predicted_elements[i].metadata.text_as_html
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_inline_elements_are_squeezed():
|
def test_inline_elements_are_squeezed():
|
||||||
|
|||||||
@@ -607,6 +607,21 @@ def test_text_in_form_field_value():
|
|||||||
assert form_field_value.to_text() == "Random Input Value"
|
assert form_field_value.to_text() == "Random Input Value"
|
||||||
|
|
||||||
|
|
||||||
|
def test_text_in_form_field_value_with_null_value():
|
||||||
|
# language=HTML
|
||||||
|
input_html = """
|
||||||
|
<div class="Page">
|
||||||
|
<input class="FormFieldValue" value=""/>
|
||||||
|
</div>
|
||||||
|
"""
|
||||||
|
page = parse_html_to_ontology(input_html)
|
||||||
|
|
||||||
|
assert len(page.children) == 1
|
||||||
|
form_field_value = page.children[0]
|
||||||
|
assert form_field_value.text == ""
|
||||||
|
assert form_field_value.to_text() == ""
|
||||||
|
|
||||||
|
|
||||||
def test_to_text_when_form_field():
|
def test_to_text_when_form_field():
|
||||||
ontology = Page(
|
ontology = Page(
|
||||||
children=[
|
children=[
|
||||||
|
|||||||
@@ -1 +1 @@
|
|||||||
__version__ = "0.16.5-dev0" # pragma: no cover
|
__version__ = "0.16.5-dev1" # pragma: no cover
|
||||||
|
|||||||
@@ -93,7 +93,7 @@ class OntologyElement(BaseModel):
|
|||||||
if self.children and add_children:
|
if self.children and add_children:
|
||||||
children_text = " ".join(child.to_text().strip() for child in self.children)
|
children_text = " ".join(child.to_text().strip() for child in self.children)
|
||||||
return children_text
|
return children_text
|
||||||
return BeautifulSoup(self.to_html()).get_text().strip()
|
return BeautifulSoup(self.to_html(), "html.parser").get_text().strip()
|
||||||
|
|
||||||
def _construct_attribute_string(self, attributes: dict) -> str:
|
def _construct_attribute_string(self, attributes: dict) -> str:
|
||||||
return " ".join(
|
return " ".join(
|
||||||
@@ -466,8 +466,11 @@ class FormFieldValue(OntologyElement):
|
|||||||
allowed_tags: List[str] = Field(["input"], frozen=True)
|
allowed_tags: List[str] = Field(["input"], frozen=True)
|
||||||
|
|
||||||
def to_text(self, add_children=True) -> str:
|
def to_text(self, add_children=True) -> str:
|
||||||
text = super().to_text() + self.additional_attributes.get("value", "")
|
text = super().to_text()
|
||||||
return text.strip()
|
value = self.additional_attributes.get("value", "")
|
||||||
|
if not value:
|
||||||
|
return text
|
||||||
|
return f"{text} {value}".strip()
|
||||||
|
|
||||||
|
|
||||||
class Checkbox(OntologyElement):
|
class Checkbox(OntologyElement):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user