fix: recognize code files with auto (#677)

* add check for code mime type

* add file extensions

* add new tests

* version and changelog
This commit is contained in:
Matt Robinson 2023-06-02 16:09:43 -04:00 committed by GitHub
parent 6c10d8f022
commit cf0ff91e37
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 67 additions and 2 deletions

View File

@ -1,4 +1,4 @@
## 0.7.2-dev0
## 0.7.2-dev1
### Enhancements
@ -6,6 +6,7 @@
### Fixes
* File detection now detects code files as plain text
* Adds `tabulate` explicitly to dependencies
* Fixes an issue in `metadata.page_number` of pptx files

View File

@ -8,6 +8,7 @@ import pytest
from unstructured.file_utils import filetype
from unstructured.file_utils.filetype import (
FileType,
_is_code_mime_type,
_is_text_file_a_json,
detect_filetype,
)
@ -132,6 +133,23 @@ def test_detect_text_csv(monkeypatch, filename="sample-docs/stanley-cup.csv"):
assert filetype == FileType.CSV
def test_detect_text_python_from_filename(monkeypatch, filename="unstructured/logger.py"):
    """A path that libmagic reports as Python source is detected as plain text."""

    def fake_from_file(*args, **kwargs):
        return "text/x-script.python"

    # Stub libmagic so the reported MIME type is fixed for this test.
    monkeypatch.setattr(magic, "from_file", fake_from_file)
    detected = detect_filetype(filename=filename)
    assert detected == FileType.TXT
def test_detect_text_python_from_file(monkeypatch, filename="unstructured/logger.py"):
    """An open file that libmagic reports as Python source is detected as plain text."""

    def fake_mime_type(*args, **kwargs):
        return "text/x-script.python"

    # NOTE(review): only a file object is passed below, so detection presumably
    # sniffs the content via magic.from_buffer rather than magic.from_file --
    # confirm against detect_filetype. Stub both entry points so the reported
    # MIME type is fixed regardless of which one is used, instead of depending
    # on the libmagic database installed on the machine running the test.
    monkeypatch.setattr(magic, "from_file", fake_mime_type)
    monkeypatch.setattr(magic, "from_buffer", fake_mime_type)
    with open(filename, "rb") as f:
        detected = detect_filetype(file=f)
    assert detected == FileType.TXT
def test_detects_go_mime_type():
    """The Go MIME type is recognized as a code MIME type."""
    go_mime_type = "text/x-go"
    is_code = _is_code_mime_type(go_mime_type)
    assert is_code is True
def test_detect_xml_application_rtf(monkeypatch):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/rtf")
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.rtf")

View File

@ -1 +1 @@
__version__ = "0.7.2-dev0" # pragma: no cover
__version__ = "0.7.2-dev1" # pragma: no cover

View File

@ -160,6 +160,20 @@ EXT_TO_FILETYPE = {
".msg": FileType.MSG,
".odt": FileType.ODT,
".csv": FileType.CSV,
# NOTE(robinson) - for now we are treating code files as plain text
".js": FileType.TXT,
".py": FileType.TXT,
".java": FileType.TXT,
".cpp": FileType.TXT,
".cc": FileType.TXT,
".cxx": FileType.TXT,
".c": FileType.TXT,
".cs": FileType.TXT,
".php": FileType.TXT,
".rb": FileType.TXT,
".swift": FileType.TXT,
".ts": FileType.TXT,
".go": FileType.TXT,
None: FileType.UNK,
}
@ -265,6 +279,12 @@ def detect_filetype(
else:
return EXT_TO_FILETYPE.get(extension.lower(), filetype)
elif _is_code_mime_type(mime_type):
# NOTE(robinson) - we'll treat all code files as plain text for now.
# we can update this logic and add filetypes for specific languages
# later if needed.
return FileType.TXT
# For everything else
elif mime_type in STR_TO_FILETYPE:
return STR_TO_FILETYPE[mime_type]
@ -363,6 +383,32 @@ def document_to_element_list(
return elements
PROGRAMMING_LANGUAGES = [
"javascript",
"python",
"java",
"c++",
"cpp",
"csharp",
"c#",
"php",
"ruby",
"swift",
"typescript",
]
def _is_code_mime_type(mime_type: str) -> bool:
"""Checks to see if the MIME type is a MIME type that would be used for a code
file."""
mime_type = mime_type.lower()
# NOTE(robinson) - check this one explicitly to avoid conflicts with other
# MIME types that contain "go"
if mime_type == "text/x-go":
return True
return any(language in mime_type for language in PROGRAMMING_LANGUAGES)
def add_metadata_with_filetype(filetype: FileType):
def decorator(func: Callable):
@wraps(func)