fix: guess HTML content starting with script tag (#1673)

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis 2025-06-02 08:43:24 +02:00 committed by GitHub
parent 3942923125
commit 984cb137f6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 12 additions and 1 deletion

View File

@ -412,7 +412,11 @@ class _DocumentConversionInput(BaseModel):
else:
return "application/xml"
if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
if re.match(
r"(<script.*?>.*?</script>\s*)?(<!doctype\s+html|<html|<head|<body)",
content_str,
re.DOTALL,
):
return "text/html"
p = re.compile(

View File

@ -132,6 +132,13 @@ def test_guess_format(tmp_path):
doc_path = Path("./tests/data/html/wiki_duck.html")
assert dci._guess_format(doc_path) == InputFormat.HTML
html_str = ( # HTML starting with a script
"<script>\nconsole.log('foo');\n</script>"
'<!doctype html>\n<html lang="en-us class="no-js"></html>'
)
stream = DocumentStream(name="lorem_ipsum", stream=BytesIO(f"{html_str}".encode()))
assert dci._guess_format(stream) == InputFormat.HTML
# Valid MD
buf = BytesIO(Path("./tests/data/md/wiki.md").open("rb").read())
stream = DocumentStream(name="wiki.md", stream=buf)