Fix bytes-mode input for the markdown converter in the Playwright controller (#6044)

Fix error:

TypeError: Input stream must be opened in bytes mode, not in text mode.

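The Markdown converter's convert_stream takes a binary stream, so the page HTML is now UTF-8 encoded and passed as io.BytesIO instead of io.StringIO.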
This commit is contained in:
Hussein Mozannar 2025-03-20 14:53:53 -04:00 committed by GitHub
parent 46add11ec7
commit fef953e062
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -565,8 +565,11 @@ class PlaywrightController:
assert page is not None
if self._markdown_converter is None and markitdown is not None:
self._markdown_converter = markitdown.MarkItDown()
assert self._markdown_converter is not None
html = await page.evaluate("document.documentElement.outerHTML;")
res = self._markdown_converter.convert_stream(io.StringIO(html), file_extension=".html", url=page.url) # type: ignore
res = self._markdown_converter.convert_stream(
io.BytesIO(html.encode("utf-8")), file_extension=".html", url=page.url
)
assert hasattr(res, "text_content") and isinstance(res.text_content, str)
return res.text_content
else: