Fix: support htm filetype in partition as a variation of html (#586)

* quick fix to add htm filetype

* changelog and version
This commit is contained in:
Yuming Long 2023-05-15 16:35:53 -04:00 committed by GitHub
parent 5b6f11bb88
commit 33cc3f8637
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 5 additions and 3 deletions

View File

@ -1,4 +1,4 @@
## 0.6.7-dev2
## 0.6.7-dev3
### Enhancements
@ -10,6 +10,7 @@
### Fixes
* Supports the `.htm` file extension in partition, treating it as a variation of the html filetype.
* Makes `pytesseract` a function level import in `partition_pdf` so you can use the `"fast"`
or `"hi_res"` strategies if `pytesseract` is not installed. Also adds the
`required_dependencies` decorator for the `"hi_res"` and `"ocr_only"` strategies.

View File

@ -1 +1 @@
__version__ = "0.6.7-dev2" # pragma: no cover
__version__ = "0.6.7-dev3" # pragma: no cover

View File

@ -166,6 +166,7 @@ EXT_TO_FILETYPE = {
".text": FileType.TXT,
".eml": FileType.EML,
".xml": FileType.XML,
".htm": FileType.HTML,
".html": FileType.HTML,
".md": FileType.MD,
".xlsx": FileType.XLSX,
@ -266,7 +267,7 @@ def detect_filetype(
return FileType.RTF
elif mime_type.endswith("xml"):
if extension and extension == ".html":
if extension and (extension == ".html" or extension == ".htm"):
return FileType.HTML
else:
return FileType.XML