def test_detect_text_python_from_filename(monkeypatch, filename="unstructured/logger.py"):
    """A Python source file passed by path is detected as plain text.

    ``magic.from_file`` is stubbed to report ``text/x-script.python`` so the
    result does not depend on the libmagic database installed on the host.
    """
    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/x-script.python")
    # Named `detected` rather than `filetype` so the local does not shadow the
    # `filetype` module imported at the top of this test file.
    detected = detect_filetype(filename=filename)
    assert detected == FileType.TXT


def test_detect_text_python_from_file(monkeypatch, filename="unstructured/logger.py"):
    """A Python source file passed as an open binary file is detected as plain text."""

    def fake_mime(*args, **kwargs):
        return "text/x-script.python"

    monkeypatch.setattr(magic, "from_file", fake_mime)
    # NOTE(review): with `file=` there is no path to hand to `magic.from_file`,
    # so detect_filetype presumably sniffs the bytes via `magic.from_buffer` --
    # confirm against detect_filetype. Stubbing both entry points keeps the test
    # independent of the host's libmagic; `raising=False` avoids an error if the
    # attribute is absent from the installed `magic` module.
    monkeypatch.setattr(magic, "from_buffer", fake_mime, raising=False)
    with open(filename, "rb") as f:
        detected = detect_filetype(file=f)
    assert detected == FileType.TXT


def test_detects_go_mime_type():
    """``text/x-go`` is recognised as code; merely containing "go" is not enough."""
    assert _is_code_mime_type("text/x-go") is True
    # Go is matched by exact comparison precisely so that unrelated MIME types
    # that happen to contain the letters "go" are not treated as code.
    assert _is_code_mime_type("application/vnd.google-earth.kml+xml") is False
+__version__ = "0.7.2-dev1" # pragma: no cover diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 7b6cd12f5..f4eb29c1c 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -160,6 +160,20 @@ EXT_TO_FILETYPE = { ".msg": FileType.MSG, ".odt": FileType.ODT, ".csv": FileType.CSV, + # NOTE(robinson) - for now we are treating code files as plain text + ".js": FileType.TXT, + ".py": FileType.TXT, + ".java": FileType.TXT, + ".cpp": FileType.TXT, + ".cc": FileType.TXT, + ".cxx": FileType.TXT, + ".c": FileType.TXT, + ".cs": FileType.TXT, + ".php": FileType.TXT, + ".rb": FileType.TXT, + ".swift": FileType.TXT, + ".ts": FileType.TXT, + ".go": FileType.TXT, None: FileType.UNK, } @@ -265,6 +279,12 @@ def detect_filetype( else: return EXT_TO_FILETYPE.get(extension.lower(), filetype) + elif _is_code_mime_type(mime_type): + # NOTE(robinson) - we'll treat all code files as plain text for now. + # we can update this logic and add filetypes for specific languages + # later if needed. + return FileType.TXT + # For everything else elif mime_type in STR_TO_FILETYPE: return STR_TO_FILETYPE[mime_type] @@ -363,6 +383,32 @@ def document_to_element_list( return elements +PROGRAMMING_LANGUAGES = [ + "javascript", + "python", + "java", + "c++", + "cpp", + "csharp", + "c#", + "php", + "ruby", + "swift", + "typescript", +] + + +def _is_code_mime_type(mime_type: str) -> bool: + """Checks to see if the MIME type is a MIME type that would be used for a code + file.""" + mime_type = mime_type.lower() + # NOTE(robinson) - check this one explicitly to avoid conflicts with other + # MIME types that contain "go" + if mime_type == "text/x-go": + return True + return any(language in mime_type for language in PROGRAMMING_LANGUAGES) + + def add_metadata_with_filetype(filetype: FileType): def decorator(func: Callable): @wraps(func)