mirror of
https://github.com/docling-project/docling.git
synced 2025-06-27 05:20:05 +00:00
fix: guess HTML content starting with script tag (#1673)
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent
3942923125
commit
984cb137f6
@@ -412,7 +412,11 @@ class _DocumentConversionInput(BaseModel):
|
||||
else:
|
||||
return "application/xml"
|
||||
|
||||
if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
|
||||
if re.match(
|
||||
r"(<script.*?>.*?</script>\s*)?(<!doctype\s+html|<html|<head|<body)",
|
||||
content_str,
|
||||
re.DOTALL,
|
||||
):
|
||||
return "text/html"
|
||||
|
||||
p = re.compile(
|
||||
|
@@ -132,6 +132,13 @@ def test_guess_format(tmp_path):
|
||||
doc_path = Path("./tests/data/html/wiki_duck.html")
|
||||
assert dci._guess_format(doc_path) == InputFormat.HTML
|
||||
|
||||
html_str = ( # HTML starting with a script
|
||||
"<script>\nconsole.log('foo');\n</script>"
|
||||
'<!doctype html>\n<html lang="en-us class="no-js"></html>'
|
||||
)
|
||||
stream = DocumentStream(name="lorem_ipsum", stream=BytesIO(f"{html_str}".encode()))
|
||||
assert dci._guess_format(stream) == InputFormat.HTML
|
||||
|
||||
# Valid MD
|
||||
buf = BytesIO(Path("./tests/data/md/wiki.md").open("rb").read())
|
||||
stream = DocumentStream(name="wiki.md", stream=buf)
|
||||
|
Loading…
x
Reference in New Issue
Block a user