Fix extracting value from field (#3774)

This commit is contained in:
Pluto 2024-11-07 19:21:39 +01:00 committed by GitHub
parent 66d1e5a5cb
commit c2d17b1ca4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 27 additions and 5 deletions

View File

@@ -1,4 +1,4 @@
## 0.16.5-dev0
## 0.16.5-dev1
### Enhancements

View File

@@ -201,6 +201,10 @@ def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_p
for i in range(len(expected_json_elements)):
assert expected_json_elements[i] == predicted_elements[i]
assert (
expected_json_elements[i].metadata.text_as_html
== predicted_elements[i].metadata.text_as_html
)
def test_inline_elements_are_squeezed():

View File

@@ -607,6 +607,21 @@ def test_text_in_form_field_value():
assert form_field_value.to_text() == "Random Input Value"
def test_text_in_form_field_value_with_null_value():
    """Check that a FormFieldValue input with an empty ``value`` yields empty text.

    Parses a Page holding a single ``<input class="FormFieldValue" value=""/>``
    and verifies that the one resulting child reports an empty string both via
    its ``text`` attribute and via ``to_text()``.
    """
    # language=HTML
    markup = """
<div class="Page">
<input class="FormFieldValue" value=""/>
</div>
"""
    parsed_page = parse_html_to_ontology(markup)

    # The page must contain exactly the one input element.
    assert len(parsed_page.children) == 1

    empty_field = parsed_page.children[0]
    assert empty_field.text == ""
    assert empty_field.to_text() == ""
def test_to_text_when_form_field():
ontology = Page(
children=[

View File

@@ -1 +1 @@
__version__ = "0.16.5-dev0" # pragma: no cover
__version__ = "0.16.5-dev1" # pragma: no cover

View File

@@ -93,7 +93,7 @@ class OntologyElement(BaseModel):
if self.children and add_children:
children_text = " ".join(child.to_text().strip() for child in self.children)
return children_text
return BeautifulSoup(self.to_html()).get_text().strip()
return BeautifulSoup(self.to_html(), "html.parser").get_text().strip()
def _construct_attribute_string(self, attributes: dict) -> str:
return " ".join(
@@ -466,8 +466,11 @@ class FormFieldValue(OntologyElement):
allowed_tags: List[str] = Field(["input"], frozen=True)
def to_text(self, add_children=True) -> str:
text = super().to_text() + self.additional_attributes.get("value", "")
return text.strip()
text = super().to_text()
value = self.additional_attributes.get("value", "")
if not value:
return text
return f"{text} {value}".strip()
class Checkbox(OntologyElement):