mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-25 22:23:24 +00:00
fix: adds to list of extensions to check if a file has a plain text MIME type (#916)
* added .txt, .text, and .tab to text file list
* changelog and version
This commit is contained in:
parent
f7b3c0f741
commit
9b830693bd
11
CHANGELOG.md
11
CHANGELOG.md
@ -1,3 +1,14 @@
|
||||
## 0.8.2-dev0
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
* Adds `.txt`, `.text`, and `.tab` to list of extensions to check if file
|
||||
has a `text/plain` MIME type.
|
||||
|
||||
## 0.8.1
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.8.1" # pragma: no cover
|
||||
__version__ = "0.8.2-dev0" # pragma: no cover
|
||||
|
||||
@ -179,6 +179,7 @@ EXT_TO_FILETYPE = {
|
||||
".odt": FileType.ODT,
|
||||
".csv": FileType.CSV,
|
||||
".tsv": FileType.TSV,
|
||||
".tab": FileType.TSV,
|
||||
# NOTE(robinson) - for now we are treating code files as plain text
|
||||
".js": FileType.TXT,
|
||||
".py": FileType.TXT,
|
||||
@ -196,6 +197,21 @@ EXT_TO_FILETYPE = {
|
||||
None: FileType.UNK,
|
||||
}
|
||||
|
||||
PLAIN_TEXT_EXTENSIONS = [
|
||||
".txt",
|
||||
".text",
|
||||
".eml",
|
||||
".md",
|
||||
".rtf",
|
||||
".html",
|
||||
".rst",
|
||||
".org",
|
||||
".csv",
|
||||
".tsv",
|
||||
".tab",
|
||||
".json",
|
||||
]
|
||||
|
||||
|
||||
def _resolve_symlink(file_path):
|
||||
# Resolve the symlink to get the actual file path
|
||||
@ -284,7 +300,7 @@ def detect_filetype(
|
||||
encoding = "utf-8"
|
||||
formatted_encoding = format_encoding_str(encoding)
|
||||
|
||||
if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".org", ".csv", ".tsv", ".json"]:
|
||||
if extension in PLAIN_TEXT_EXTENSIONS:
|
||||
return EXT_TO_FILETYPE.get(extension)
|
||||
|
||||
# NOTE(crag): for older versions of the OS libmagic package, such as is currently
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user