mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-26 22:55:07 +00:00
Fix: support htm filetype in partition as a variation of html (#586)
* quick fix to add htm filetype * changelog and version
This commit is contained in:
parent
5b6f11bb88
commit
33cc3f8637
@ -1,4 +1,4 @@
|
||||
## 0.6.7-dev2
|
||||
## 0.6.7-dev3
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -10,6 +10,7 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* Supports the `htm` filetype in partition as a variation of the `html` filetype.
|
||||
* Makes `pytesseract` a function level import in `partition_pdf` so you can use the `"fast"`
|
||||
or `"hi_res"` strategies if `pytesseract` is not installed. Also adds the
|
||||
`required_dependencies` decorator for the `"hi_res"` and `"ocr_only"` strategies.
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.6.7-dev2" # pragma: no cover
|
||||
__version__ = "0.6.7-dev3" # pragma: no cover
|
||||
|
||||
@ -166,6 +166,7 @@ EXT_TO_FILETYPE = {
|
||||
".text": FileType.TXT,
|
||||
".eml": FileType.EML,
|
||||
".xml": FileType.XML,
|
||||
".htm": FileType.HTML,
|
||||
".html": FileType.HTML,
|
||||
".md": FileType.MD,
|
||||
".xlsx": FileType.XLSX,
|
||||
@ -266,7 +267,7 @@ def detect_filetype(
|
||||
return FileType.RTF
|
||||
|
||||
elif mime_type.endswith("xml"):
|
||||
if extension and extension == ".html":
|
||||
if extension and (extension == ".html" or extension == ".htm"):
|
||||
return FileType.HTML
|
||||
else:
|
||||
return FileType.XML
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user