mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-26 06:36:06 +00:00
fix: recognize code files with auto (#677)
* add check for code mime type * add file extensions * add new tests * version and changelog
This commit is contained in:
parent
6c10d8f022
commit
cf0ff91e37
@ -1,4 +1,4 @@
|
||||
## 0.7.2-dev0
|
||||
## 0.7.2-dev1
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -6,6 +6,7 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* File type detection now recognizes source code files and treats them as plain text
|
||||
* Adds `tabulate` explicitly to dependencies
|
||||
* Fixes an issue in `metadata.page_number` of pptx files
|
||||
|
||||
|
||||
@ -8,6 +8,7 @@ import pytest
|
||||
from unstructured.file_utils import filetype
|
||||
from unstructured.file_utils.filetype import (
|
||||
FileType,
|
||||
_is_code_mime_type,
|
||||
_is_text_file_a_json,
|
||||
detect_filetype,
|
||||
)
|
||||
@ -132,6 +133,23 @@ def test_detect_text_csv(monkeypatch, filename="sample-docs/stanley-cup.csv"):
|
||||
assert filetype == FileType.CSV
|
||||
|
||||
|
||||
def test_detect_text_python_from_filename(monkeypatch, filename="unstructured/logger.py"):
    """A Python source file given by filename is detected as plain text."""

    def fake_python_mime(*args, **kwargs):
        # Accept whatever arguments the caller passes and always report the
        # Python-script MIME type.
        return "text/x-script.python"

    # Stub magic.from_file so the reported MIME type is fixed by the test.
    monkeypatch.setattr(magic, "from_file", fake_python_mime)

    detected = detect_filetype(filename=filename)

    assert detected == FileType.TXT
|
||||
|
||||
|
||||
def test_detect_text_python_from_file(monkeypatch, filename="unstructured/logger.py"):
    """A Python source file given as an open binary file object is detected as plain text."""

    def fake_python_mime(*args, **kwargs):
        # Accept whatever arguments the caller passes and always report the
        # Python-script MIME type.
        return "text/x-script.python"

    monkeypatch.setattr(magic, "from_file", fake_python_mime)
    # NOTE(review): detection from a file object presumably sniffs the stream's
    # bytes via magic.from_buffer rather than magic.from_file -- confirm against
    # detect_filetype. Stubbing both entry points keeps the MIME type under the
    # test's control whichever one is used.
    monkeypatch.setattr(magic, "from_buffer", fake_python_mime)

    with open(filename, "rb") as f:
        detected = detect_filetype(file=f)

    assert detected == FileType.TXT
|
||||
|
||||
|
||||
def test_detects_go_mime_type():
    """The Go MIME type is recognized as a code MIME type."""
    go_mime_type = "text/x-go"
    assert _is_code_mime_type(go_mime_type) is True
|
||||
|
||||
|
||||
def test_detect_xml_application_rtf(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/rtf")
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.rtf")
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.7.2-dev0" # pragma: no cover
|
||||
__version__ = "0.7.2-dev1" # pragma: no cover
|
||||
|
||||
@ -160,6 +160,20 @@ EXT_TO_FILETYPE = {
|
||||
".msg": FileType.MSG,
|
||||
".odt": FileType.ODT,
|
||||
".csv": FileType.CSV,
|
||||
# NOTE(robinson) - for now we are treating code files as plain text
|
||||
".js": FileType.TXT,
|
||||
".py": FileType.TXT,
|
||||
".java": FileType.TXT,
|
||||
".cpp": FileType.TXT,
|
||||
".cc": FileType.TXT,
|
||||
".cxx": FileType.TXT,
|
||||
".c": FileType.TXT,
|
||||
".cs": FileType.TXT,
|
||||
".php": FileType.TXT,
|
||||
".rb": FileType.TXT,
|
||||
".swift": FileType.TXT,
|
||||
".ts": FileType.TXT,
|
||||
".go": FileType.TXT,
|
||||
None: FileType.UNK,
|
||||
}
|
||||
|
||||
@ -265,6 +279,12 @@ def detect_filetype(
|
||||
else:
|
||||
return EXT_TO_FILETYPE.get(extension.lower(), filetype)
|
||||
|
||||
elif _is_code_mime_type(mime_type):
|
||||
# NOTE(robinson) - we'll treat all code files as plain text for now.
|
||||
# we can update this logic and add filetypes for specific languages
|
||||
# later if needed.
|
||||
return FileType.TXT
|
||||
|
||||
# For everything else
|
||||
elif mime_type in STR_TO_FILETYPE:
|
||||
return STR_TO_FILETYPE[mime_type]
|
||||
@ -363,6 +383,32 @@ def document_to_element_list(
|
||||
return elements
|
||||
|
||||
|
||||
PROGRAMMING_LANGUAGES = [
|
||||
"javascript",
|
||||
"python",
|
||||
"java",
|
||||
"c++",
|
||||
"cpp",
|
||||
"csharp",
|
||||
"c#",
|
||||
"php",
|
||||
"ruby",
|
||||
"swift",
|
||||
"typescript",
|
||||
]
|
||||
|
||||
|
||||
def _is_code_mime_type(mime_type: str) -> bool:
|
||||
"""Checks to see if the MIME type is a MIME type that would be used for a code
|
||||
file."""
|
||||
mime_type = mime_type.lower()
|
||||
# NOTE(robinson) - check this one explicitly to avoid conflicts with other
|
||||
# MIME types that contain "go"
|
||||
if mime_type == "text/x-go":
|
||||
return True
|
||||
return any(language in mime_type for language in PROGRAMMING_LANGUAGES)
|
||||
|
||||
|
||||
def add_metadata_with_filetype(filetype: FileType):
|
||||
def decorator(func: Callable):
|
||||
@wraps(func)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user