mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-26 15:42:15 +00:00 
			
		
		
		
	fix: refine filetype detection (#3828)
**Summary** Fixes a bug where a CSV file with asserted content-type `application/vnd.ms-excel` was incorrectly identified as an XLS file and failed partitioning. **Additional Context** The `content_type` argument to partitioning is often authored by the client system (e.g. Unstructured SDK) and is both unreliable and outside the control of the user. In this case the `.csv -> XLS` mapping is correct for certain purposes (Excel is often used to load and edit CSV files) but not for partitioning, and the user has no readily available way to override the mapping. XLS files as well as seven other common binary file types can be efficiently detected 100% of the time (at least 99.999%) using code we already have in the file detector. - Promote this direct-inspection strategy to be tried first. - When DOC, DOCX, EPUB, ODT, PPT, PPTX, XLS, or XLSX is detected, use that file-type. - When one of those types is NOT detected, clear the asserted `content_type` when it matches any of those types. This prevents the problem seen in the bug where the asserted content type was used to determine the file-type. - The remaining content_type, guess MIME-type, and filename-extension mapping strategies are tried, in that order, only when direct inspection fails. This is largely the same as it was before. - Fix #3781 while we were in the neighborhood. - Fix #3596 as well, essentially an earlier report of #3781.
This commit is contained in:
		
							parent
							
								
									10f0d54ac2
								
							
						
					
					
						commit
						b5ff79d8db
					
				| @ -1,4 +1,4 @@ | ||||
| ## 0.16.12-dev2 | ||||
| ## 0.16.12-dev3 | ||||
| 
 | ||||
| ### Enhancements | ||||
| 
 | ||||
| @ -9,6 +9,7 @@ | ||||
| ### Fixes | ||||
| 
 | ||||
| - **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted. | ||||
| - **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file. | ||||
| 
 | ||||
| ## 0.16.11 | ||||
| 
 | ||||
|  | ||||
| @ -14,15 +14,14 @@ from test_unstructured.unit_utils import ( | ||||
|     LogCaptureFixture, | ||||
|     Mock, | ||||
|     example_doc_path, | ||||
|     function_mock, | ||||
|     patch, | ||||
|     property_mock, | ||||
| ) | ||||
| from unstructured.file_utils.filetype import ( | ||||
|     _FileTypeDetectionContext, | ||||
|     _OleFileDifferentiator, | ||||
|     _OleFileDetector, | ||||
|     _TextFileDifferentiator, | ||||
|     _ZipFileDifferentiator, | ||||
|     _ZipFileDetector, | ||||
|     detect_filetype, | ||||
|     is_json_processable, | ||||
| ) | ||||
| @ -31,7 +30,41 @@ from unstructured.file_utils.model import FileType | ||||
| is_in_docker = os.path.exists("/.dockerenv") | ||||
| 
 | ||||
| # ================================================================================================ | ||||
| # STRATEGY #1 - CONTENT-TYPE ASSERTED IN CALL | ||||
| # STRATEGY #1 - DIRECT DETECTION OF CFB/ZIP-BASED BINARY FILE TYPES (8 TYPES) | ||||
| # ================================================================================================ | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize( | ||||
|     ("expected_value", "file_name"), | ||||
|     [ | ||||
|         (FileType.DOC, "simple.doc"), | ||||
|         (FileType.DOCX, "simple.docx"), | ||||
|         (FileType.EPUB, "winter-sports.epub"), | ||||
|         (FileType.ODT, "simple.odt"), | ||||
|         (FileType.PPT, "fake-power-point.ppt"), | ||||
|         (FileType.PPTX, "fake-power-point.pptx"), | ||||
|         (FileType.XLS, "tests-example.xls"), | ||||
|         (FileType.XLSX, "stanley-cups.xlsx"), | ||||
|     ], | ||||
| ) | ||||
| def test_it_detects_correct_file_type_for_CFB_and_ZIP_subtypes_detected_by_direct_inspection( | ||||
|     file_name: str, expected_value: FileType, ctx_mime_type_: Mock | ||||
| ): | ||||
|     # -- disable other strategies; no content-type, guessed MIME-type or extension -- | ||||
|     ctx_mime_type_.return_value = None | ||||
|     with open(example_doc_path(file_name), "rb") as f: | ||||
|         file = io.BytesIO(f.read()) | ||||
| 
 | ||||
|     file_type = detect_filetype(file=file) | ||||
| 
 | ||||
|     # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not | ||||
|     # -- fall back to MIME-type guessing for any of these test cases. | ||||
|     ctx_mime_type_.assert_not_called() | ||||
|     assert file_type == expected_value | ||||
| 
 | ||||
| 
 | ||||
| # ================================================================================================ | ||||
| # STRATEGY #2 - CONTENT-TYPE ASSERTED IN CALL | ||||
| # ================================================================================================ | ||||
| 
 | ||||
| 
 | ||||
| @ -40,41 +73,21 @@ is_in_docker = os.path.exists("/.dockerenv") | ||||
|     [ | ||||
|         (FileType.BMP, "img/bmp_24.bmp", "image/bmp"), | ||||
|         (FileType.CSV, "stanley-cups.csv", "text/csv"), | ||||
|         (FileType.DOC, "simple.doc", "application/msword"), | ||||
|         ( | ||||
|             FileType.DOCX, | ||||
|             "simple.docx", | ||||
|             "application/vnd.openxmlformats-officedocument.wordprocessingml.document", | ||||
|         ), | ||||
|         (FileType.EML, "eml/fake-email.eml", "message/rfc822"), | ||||
|         (FileType.EPUB, "winter-sports.epub", "application/epub+zip"), | ||||
|         (FileType.HEIC, "img/DA-1p.heic", "image/heic"), | ||||
|         (FileType.HTML, "example-10k-1p.html", "text/html"), | ||||
|         (FileType.JPG, "img/example.jpg", "image/jpeg"), | ||||
|         (FileType.JSON, "spring-weather.html.json", "application/json"), | ||||
|         (FileType.MD, "README.md", "text/markdown"), | ||||
|         (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"), | ||||
|         (FileType.ORG, "README.org", "text/org"), | ||||
|         (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"), | ||||
|         (FileType.PNG, "img/DA-1p.png", "image/png"), | ||||
|         (FileType.PPT, "fake-power-point.ppt", "application/vnd.ms-powerpoint"), | ||||
|         ( | ||||
|             FileType.PPTX, | ||||
|             "fake-power-point.pptx", | ||||
|             "application/vnd.openxmlformats-officedocument.presentationml.presentation", | ||||
|         ), | ||||
|         (FileType.RST, "README.rst", "text/x-rst"), | ||||
|         (FileType.RTF, "fake-doc.rtf", "text/rtf"), | ||||
|         (FileType.TIFF, "img/layout-parser-paper-fast.tiff", "image/tiff"), | ||||
|         (FileType.TSV, "stanley-cups.tsv", "text/tsv"), | ||||
|         (FileType.TXT, "norwich-city.txt", "text/plain"), | ||||
|         (FileType.WAV, "CantinaBand3.wav", "audio/wav"), | ||||
|         (FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"), | ||||
|         ( | ||||
|             FileType.XLSX, | ||||
|             "stanley-cups.xlsx", | ||||
|             "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", | ||||
|         ), | ||||
|         (FileType.XML, "factbook.xml", "application/xml"), | ||||
|         (FileType.ZIP, "simple.zip", "application/zip"), | ||||
|     ], | ||||
| @ -82,13 +95,13 @@ is_in_docker = os.path.exists("/.dockerenv") | ||||
| def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_content_type( | ||||
|     file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock | ||||
| ): | ||||
|     # -- disable strategy #2, leaving only asserted content-type and extension -- | ||||
|     # -- disable mime-guessing leaving only asserted content-type and extension -- | ||||
|     ctx_mime_type_.return_value = None | ||||
| 
 | ||||
|     file_type = detect_filetype(example_doc_path(file_name), content_type=content_type) | ||||
| 
 | ||||
|     # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not | ||||
|     # -- fall back to strategy 2 for any of these test cases. | ||||
|     # -- Content-type strategy should not need to refer to guessed MIME-type and detection should | ||||
|     # -- not fall back to MIME-type guessing for any of these test cases. | ||||
|     ctx_mime_type_.assert_not_called() | ||||
|     assert file_type == expected_value | ||||
| 
 | ||||
| @ -98,41 +111,21 @@ def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_conte | ||||
|     [ | ||||
|         (FileType.BMP, "img/bmp_24.bmp", "image/bmp"), | ||||
|         (FileType.CSV, "stanley-cups.csv", "text/csv"), | ||||
|         (FileType.DOC, "simple.doc", "application/msword"), | ||||
|         ( | ||||
|             FileType.DOCX, | ||||
|             "simple.docx", | ||||
|             "application/vnd.openxmlformats-officedocument.wordprocessingml.document", | ||||
|         ), | ||||
|         (FileType.EML, "eml/fake-email.eml", "message/rfc822"), | ||||
|         (FileType.EPUB, "winter-sports.epub", "application/epub+zip"), | ||||
|         (FileType.HEIC, "img/DA-1p.heic", "image/heic"), | ||||
|         (FileType.HTML, "example-10k-1p.html", "text/html"), | ||||
|         (FileType.JPG, "img/example.jpg", "image/jpeg"), | ||||
|         (FileType.JSON, "spring-weather.html.json", "application/json"), | ||||
|         (FileType.MD, "README.md", "text/markdown"), | ||||
|         (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"), | ||||
|         (FileType.ORG, "README.org", "text/org"), | ||||
|         (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"), | ||||
|         (FileType.PNG, "img/DA-1p.png", "image/png"), | ||||
|         (FileType.PPT, "fake-power-point.ppt", "application/vnd.ms-powerpoint"), | ||||
|         ( | ||||
|             FileType.PPTX, | ||||
|             "fake-power-point.pptx", | ||||
|             "application/vnd.openxmlformats-officedocument.presentationml.presentation", | ||||
|         ), | ||||
|         (FileType.RST, "README.rst", "text/x-rst"), | ||||
|         (FileType.RTF, "fake-doc.rtf", "text/rtf"), | ||||
|         (FileType.TIFF, "img/layout-parser-paper-fast.tiff", "image/tiff"), | ||||
|         (FileType.TSV, "stanley-cups.tsv", "text/tsv"), | ||||
|         (FileType.TXT, "norwich-city.txt", "text/plain"), | ||||
|         (FileType.WAV, "CantinaBand3.wav", "audio/wav"), | ||||
|         (FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"), | ||||
|         ( | ||||
|             FileType.XLSX, | ||||
|             "stanley-cups.xlsx", | ||||
|             "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", | ||||
|         ), | ||||
|         (FileType.XML, "factbook.xml", "application/xml"), | ||||
|         (FileType.ZIP, "simple.zip", "application/zip"), | ||||
|     ], | ||||
| @ -140,93 +133,22 @@ def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_conte | ||||
| def test_it_detects_correct_file_type_from_file_no_name_with_correct_asserted_content_type( | ||||
|     file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock | ||||
| ): | ||||
|     # -- disable strategy #2 (guessed mime-type) -- | ||||
|     # -- disable mime-guessing -- | ||||
|     ctx_mime_type_.return_value = None | ||||
|     # -- disable strategy #3 (filename extension) by supplying no source of file name -- | ||||
|     # -- disable filename extension mapping by supplying no source of file name -- | ||||
|     with open(example_doc_path(file_name), "rb") as f: | ||||
|         file = io.BytesIO(f.read()) | ||||
| 
 | ||||
|     file_type = detect_filetype(file=file, content_type=content_type) | ||||
| 
 | ||||
|     # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not | ||||
|     # -- fall-back to strategy 2 for any of these test cases. | ||||
|     ctx_mime_type_.assert_not_called() | ||||
|     assert file_type is expected_value | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize( | ||||
|     ("expected_value", "file_name"), | ||||
|     [ | ||||
|         (FileType.DOCX, "simple.docx"), | ||||
|         (FileType.PPTX, "fake-power-point.pptx"), | ||||
|         (FileType.XLSX, "stanley-cups.xlsx"), | ||||
|     ], | ||||
| ) | ||||
| @pytest.mark.parametrize( | ||||
|     "content_type", | ||||
|     [ | ||||
|         "application/vnd.openxmlformats-officedocument.wordprocessingml.document", | ||||
|         "application/vnd.openxmlformats-officedocument.presentationml.presentation", | ||||
|         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", | ||||
|     ], | ||||
| ) | ||||
| def test_it_detects_correct_file_type_from_file_no_name_with_swapped_ms_office_content_type( | ||||
|     file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock | ||||
| ): | ||||
|     # -- disable strategies 2 & 3, content-type strategy should get this on its own -- | ||||
|     ctx_mime_type_.return_value = None | ||||
|     with open(example_doc_path(file_name), "rb") as f: | ||||
|         file = io.BytesIO(f.read()) | ||||
| 
 | ||||
|     file_type = detect_filetype(file=file, content_type=content_type) | ||||
| 
 | ||||
|     # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not | ||||
|     # -- fall-back to strategy 2 for any of these test cases. | ||||
|     ctx_mime_type_.assert_not_called() | ||||
|     assert file_type is expected_value | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize( | ||||
|     ("expected_value", "file_name"), | ||||
|     [ | ||||
|         (FileType.DOC, "simple.doc"), | ||||
|         (FileType.PPT, "fake-power-point.ppt"), | ||||
|         (FileType.XLS, "tests-example.xls"), | ||||
|     ], | ||||
| ) | ||||
| @pytest.mark.parametrize( | ||||
|     "content_type", | ||||
|     [ | ||||
|         "application/msword", | ||||
|         "application/vnd.ms-outlook", | ||||
|         "application/vnd.ms-powerpoint", | ||||
|         "application/vnd.ms-excel", | ||||
|         "anything/else", | ||||
|     ], | ||||
| ) | ||||
| def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_content_type( | ||||
|     file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock | ||||
| ): | ||||
|     """Fixes wrong XLS asserted as DOC, PPT, etc. | ||||
| 
 | ||||
|     Asserted content-type can be anything except `None` and differentiator will fix it if the file | ||||
|     is DOC, PPT, or XLS type. | ||||
|     """ | ||||
|     # -- disable strategies 2 & 3, content-type strategy should get this on its own -- | ||||
|     ctx_mime_type_.return_value = None | ||||
|     with open(example_doc_path(file_name), "rb") as f: | ||||
|         file = io.BytesIO(f.read()) | ||||
| 
 | ||||
|     file_type = detect_filetype(file=file, content_type=content_type) | ||||
| 
 | ||||
|     # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not | ||||
|     # -- fall-back to strategy 2 for any of these test cases. | ||||
|     # -- Content-type strategy should not need to refer to guessed MIME-type and detection should | ||||
|     # -- not fall back to MIME-type guessing for any of these test cases. | ||||
|     ctx_mime_type_.assert_not_called() | ||||
|     assert file_type is expected_value | ||||
| 
 | ||||
| 
 | ||||
| # ================================================================================================ | ||||
| # STRATEGY #2 - GUESS MIME-TYPE WITH LIBMAGIC | ||||
| # STRATEGY #3 - GUESS MIME-TYPE WITH LIBMAGIC/FILETYPE LIBRARY | ||||
| # ================================================================================================ | ||||
| 
 | ||||
| 
 | ||||
| @ -237,31 +159,16 @@ def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_ | ||||
|         (FileType.CSV, "stanley-cups.csv", "text/csv"), | ||||
|         (FileType.CSV, "stanley-cups.csv", "application/csv"), | ||||
|         (FileType.CSV, "stanley-cups.csv", "application/x-csv"), | ||||
|         (FileType.DOC, "simple.doc", "application/msword"), | ||||
|         ( | ||||
|             FileType.DOCX, | ||||
|             "simple.docx", | ||||
|             "application/vnd.openxmlformats-officedocument.wordprocessingml.document", | ||||
|         ), | ||||
|         (FileType.EML, "eml/fake-email.eml", "message/rfc822"), | ||||
|         (FileType.EPUB, "winter-sports.epub", "application/epub"), | ||||
|         (FileType.EPUB, "winter-sports.epub", "application/epub+zip"), | ||||
|         (FileType.HEIC, "img/DA-1p.heic", "image/heic"), | ||||
|         (FileType.HTML, "example-10k-1p.html", "text/html"), | ||||
|         (FileType.JPG, "img/example.jpg", "image/jpeg"), | ||||
|         (FileType.JSON, "spring-weather.html.json", "application/json"), | ||||
|         (FileType.MD, "README.md", "text/markdown"), | ||||
|         (FileType.MD, "README.md", "text/x-markdown"), | ||||
|         (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"), | ||||
|         (FileType.ORG, "README.org", "text/org"), | ||||
|         (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"), | ||||
|         (FileType.PNG, "img/DA-1p.png", "image/png"), | ||||
|         (FileType.PPT, "fake-power-point.ppt", "application/vnd.ms-powerpoint"), | ||||
|         ( | ||||
|             FileType.PPTX, | ||||
|             "fake-power-point.pptx", | ||||
|             "application/vnd.openxmlformats-officedocument.presentationml.presentation", | ||||
|         ), | ||||
|         (FileType.RST, "README.rst", "text/x-rst"), | ||||
|         (FileType.RTF, "fake-doc.rtf", "text/rtf"), | ||||
|         (FileType.RTF, "fake-doc.rtf", "application/rtf"), | ||||
| @ -270,18 +177,11 @@ def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_ | ||||
|         (FileType.TXT, "norwich-city.txt", "text/plain"), | ||||
|         (FileType.TXT, "simple.yaml", "text/yaml"), | ||||
|         (FileType.WAV, "CantinaBand3.wav", "audio/wav"), | ||||
|         (FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"), | ||||
|         ( | ||||
|             FileType.XLSX, | ||||
|             "stanley-cups.xlsx", | ||||
|             "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", | ||||
|         ), | ||||
|         (FileType.XML, "factbook.xml", "application/xml"), | ||||
|         (FileType.XML, "factbook.xml", "text/xml"), | ||||
|         (FileType.ZIP, "simple.zip", "application/zip"), | ||||
|     ], | ||||
| ) | ||||
| def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_recognized_mime_type( | ||||
| def test_it_detects_correct_file_type_by_guessed_MIME_when_libmagic_guesses_recognized_mime_type( | ||||
|     file_name: str, mime_type: str, expected_value: FileType, ctx_mime_type_: Mock | ||||
| ): | ||||
|     # -- libmagic guesses a MIME-type mapped to a `FileType` -- | ||||
| @ -290,7 +190,7 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec | ||||
|     with open(example_doc_path(file_name), "rb") as f: | ||||
|         file = io.BytesIO(f.read()) | ||||
| 
 | ||||
|     # -- disable strategy #1 by not asserting a content_type in the call -- | ||||
|     # -- disable content-type strategy by not asserting a content_type in the call -- | ||||
|     file_type = detect_filetype(file=file) | ||||
| 
 | ||||
|     # -- ctx.mime_type may be referenced multiple times, but at least once -- | ||||
| @ -303,30 +203,22 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec | ||||
|     [ | ||||
|         (FileType.BMP, "img/bmp_24.bmp"), | ||||
|         (FileType.CSV, "stanley-cups.csv"), | ||||
|         (FileType.DOC, "simple.doc"), | ||||
|         (FileType.DOCX, "simple.docx"), | ||||
|         (FileType.EML, "eml/fake-email.eml"), | ||||
|         (FileType.EPUB, "winter-sports.epub"), | ||||
|         (FileType.HEIC, "img/DA-1p.heic"), | ||||
|         (FileType.HTML, "ideas-page.html"), | ||||
|         (FileType.JPG, "img/example.jpg"), | ||||
|         (FileType.JSON, "spring-weather.html.json"), | ||||
|         (FileType.ODT, "simple.odt"), | ||||
|         (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"), | ||||
|         (FileType.PNG, "img/DA-1p.png"), | ||||
|         (FileType.PPT, "fake-power-point.ppt"), | ||||
|         (FileType.PPTX, "fake-power-point.pptx"), | ||||
|         (FileType.RTF, "fake-doc.rtf"), | ||||
|         (FileType.TIFF, "img/layout-parser-paper-fast.tiff"), | ||||
|         (FileType.TXT, "norwich-city.txt"), | ||||
|         (FileType.WAV, "CantinaBand3.wav"), | ||||
|         (FileType.XLS, "tests-example.xls"), | ||||
|         (FileType.XLSX, "stanley-cups.xlsx"), | ||||
|         (FileType.XML, "factbook.xml"), | ||||
|         (FileType.ZIP, "simple.zip"), | ||||
|     ], | ||||
| ) | ||||
| def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_type_for_itself( | ||||
| def test_it_detects_most_file_types_using_mime_guessing_when_libmagic_guesses_mime_type_for_itself( | ||||
|     file_name: str, expected_value: FileType | ||||
| ): | ||||
|     """Does not work for all types, in particular: | ||||
| @ -339,90 +231,26 @@ def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_ | ||||
|     - ORG is identified as TXT | ||||
|     - RST is identified as TXT | ||||
|     """ | ||||
|     # -- disable strategy #1 by not asserting a content_type in the call -- | ||||
|     # -- disable strategy #3 (extension) by passing file-like object with no `.name` attribute -- | ||||
|     # -- disable content-type strategy by not asserting a content_type in the call -- | ||||
|     # -- disable extension-mapping strategy by passing file-like object with no `.name` attribute -- | ||||
|     with open(example_doc_path(file_name), "rb") as f: | ||||
|         file = io.BytesIO(f.read()) | ||||
| 
 | ||||
|     assert detect_filetype(file=file) is expected_value | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize( | ||||
|     ("expected_value", "file_name"), | ||||
|     [ | ||||
|         (FileType.DOC, "simple.doc"), | ||||
|         (FileType.PPT, "fake-power-point.ppt"), | ||||
|         (FileType.XLS, "tests-example.xls"), | ||||
|     ], | ||||
| ) | ||||
| @pytest.mark.parametrize( | ||||
|     "guessed_mime_type", | ||||
|     [ | ||||
|         "application/msword", | ||||
|         "application/vnd.ms-excel", | ||||
|         "application/vnd.ms-outlook", | ||||
|         "application/vnd.ms-powerpoint", | ||||
|         "application/x-ole-storage", | ||||
|         "anything/else", | ||||
|     ], | ||||
| ) | ||||
| def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_guessed_mime_type( | ||||
|     file_name: str, guessed_mime_type: str, expected_value: FileType, ctx_mime_type_: Mock | ||||
| ): | ||||
|     """Fixes XLS wrongly-guessed as DOC, PPT, "application/x-ole-storage" etc. | ||||
| 
 | ||||
|     It's better than that actually, the OLE differentiator will get the right file-type for any DOC, | ||||
|     PPT, XLS, or MSG file, regardless of guessed MIME-type. | ||||
|     """ | ||||
|     ctx_mime_type_.return_value = guessed_mime_type | ||||
|     # -- disable strategy 3 by not providing a file-name source -- | ||||
|     with open(example_doc_path(file_name), "rb") as f: | ||||
|         file = io.BytesIO(f.read()) | ||||
| 
 | ||||
|     # -- disable strategy 1 by not asserting a content-type -- | ||||
|     file_type = detect_filetype(file=file) | ||||
| 
 | ||||
|     ctx_mime_type_.assert_called_with() | ||||
|     assert file_type is expected_value | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize( | ||||
|     ("filename", "mime_type", "expected"), | ||||
|     [ | ||||
|         ("fake.doc", "application/vnd.ms-excel", FileType.DOC), | ||||
|         ("fake-power-point.ppt", "application/vnd.ms-excel", FileType.PPT), | ||||
|         ("tests-example.xls", "application/msword", FileType.XLS), | ||||
|         ("fake-email.msg", "application/vnd.ms-excel", FileType.MSG), | ||||
|     ], | ||||
| ) | ||||
| def test_ole_file_structure_trusted_over_mime_type_guess(filename, mime_type, expected): | ||||
|     def _guess_mime(*args, **kwargs): | ||||
|         return mime_type | ||||
| 
 | ||||
|     with patch("filetype.guess_mime", _guess_mime): | ||||
|         detect_filetype(example_doc_path(filename)) == expected | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize( | ||||
|     ("expected_value", "file_name"), | ||||
|     [ | ||||
|         # -- `filetype` lib recognizes all these binary file-types -- | ||||
|         (FileType.BMP, "img/bmp_24.bmp"), | ||||
|         (FileType.DOC, "simple.doc"), | ||||
|         (FileType.DOCX, "simple.docx"), | ||||
|         (FileType.EPUB, "winter-sports.epub"), | ||||
|         (FileType.HEIC, "img/DA-1p.heic"), | ||||
|         (FileType.JPG, "img/example.jpg"), | ||||
|         (FileType.ODT, "simple.odt"), | ||||
|         (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"), | ||||
|         (FileType.PNG, "img/DA-1p.png"), | ||||
|         (FileType.PPT, "fake-power-point.ppt"), | ||||
|         (FileType.PPTX, "fake-power-point.pptx"), | ||||
|         (FileType.RTF, "fake-doc.rtf"), | ||||
|         (FileType.TIFF, "img/layout-parser-paper-fast.tiff"), | ||||
|         (FileType.WAV, "CantinaBand3.wav"), | ||||
|         (FileType.XLS, "tests-example.xls"), | ||||
|         (FileType.XLSX, "stanley-cups.xlsx"), | ||||
|         (FileType.ZIP, "simple.zip"), | ||||
|         # -- but it doesn't recognize textual file-types at all -- | ||||
|         (FileType.UNK, "stanley-cups.csv"), | ||||
| @ -435,11 +263,9 @@ def test_ole_file_structure_trusted_over_mime_type_guess(filename, mime_type, ex | ||||
|         (FileType.UNK, "stanley-cups.tsv"), | ||||
|         (FileType.UNK, "norwich-city.txt"), | ||||
|         (FileType.UNK, "factbook.xml"), | ||||
|         # -- and it doesn't recognize MSG files -- | ||||
|         (FileType.UNK, "fake-email.msg"), | ||||
|     ], | ||||
| ) | ||||
| def test_strategy_2_can_detect_only_binary_file_types_when_libmagic_is_unavailable( | ||||
| def test_strategy_mime_guessing_can_detect_only_binary_file_types_when_libmagic_is_unavailable( | ||||
|     file_name: str, expected_value: FileType, LIBMAGIC_AVAILABLE_False: bool | ||||
| ): | ||||
|     """File-type is detected using `filetype` library when libmagic is not available. | ||||
| @ -447,7 +273,7 @@ def test_strategy_2_can_detect_only_binary_file_types_when_libmagic_is_unavailab | ||||
|     `filetype.guess_mime()` does a good job on binary file types (PDF, images, legacy MS-Office), | ||||
|     but doesn't even try to guess textual file-types. | ||||
|     """ | ||||
|     # -- disable strategy #3 (extension) by passing file-like object with no `.name` attribute -- | ||||
|     # -- disable detection by extension by passing file-like object with no `.name` attribute -- | ||||
|     with open(example_doc_path(file_name), "rb") as f: | ||||
|         file = io.BytesIO(f.read()) | ||||
|     # -- simulate libmagic is not available -- | ||||
| @ -470,7 +296,7 @@ def test_detect_filetype_from_file_warns_when_libmagic_is_not_installed( | ||||
| 
 | ||||
| 
 | ||||
| # ================================================================================================ | ||||
| # STRATEGY #3 - MAP FILENAME EXTENSION TO FILETYPE | ||||
| # STRATEGY #4 - MAP FILENAME EXTENSION TO FILETYPE | ||||
| # ================================================================================================ | ||||
| 
 | ||||
| 
 | ||||
| @ -479,35 +305,25 @@ def test_detect_filetype_from_file_warns_when_libmagic_is_not_installed( | ||||
|     [ | ||||
|         (FileType.BMP, "img/bmp_24.bmp"), | ||||
|         (FileType.CSV, "stanley-cups.csv"), | ||||
|         (FileType.DOC, "simple.doc"), | ||||
|         (FileType.DOCX, "simple.docx"), | ||||
|         (FileType.EML, "eml/fake-email.eml"), | ||||
|         (FileType.EPUB, "winter-sports.epub"), | ||||
|         (FileType.HEIC, "img/DA-1p.heic"), | ||||
|         (FileType.HTML, "example-10k-1p.html"), | ||||
|         (FileType.JPG, "img/example.jpg"), | ||||
|         (FileType.JSON, "spring-weather.html.json"), | ||||
|         (FileType.MD, "README.md"), | ||||
|         (FileType.MSG, "fake-email.msg"), | ||||
|         (FileType.ODT, "simple.odt"), | ||||
|         (FileType.ORG, "README.org"), | ||||
|         (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"), | ||||
|         (FileType.PNG, "img/DA-1p.png"), | ||||
|         (FileType.PPT, "fake-power-point.ppt"), | ||||
|         (FileType.PPTX, "fake-power-point.pptx"), | ||||
|         (FileType.RST, "README.rst"), | ||||
|         (FileType.RTF, "fake-doc.rtf"), | ||||
|         (FileType.TIFF, "img/layout-parser-paper-fast.tiff"), | ||||
|         (FileType.TSV, "stanley-cups.tsv"), | ||||
|         (FileType.TXT, "norwich-city.txt"), | ||||
|         (FileType.WAV, "CantinaBand3.wav"), | ||||
|         (FileType.XLS, "tests-example.xls"), | ||||
|         (FileType.XLSX, "stanley-cups.xlsx"), | ||||
|         (FileType.XML, "factbook.xml"), | ||||
|         (FileType.ZIP, "simple.zip"), | ||||
|     ], | ||||
| ) | ||||
| def test_it_detects_correct_file_type_from_strategy_3_when_extension_maps_to_file_type( | ||||
| def test_it_detects_correct_file_type_from_extension_when_that_maps_to_a_file_type( | ||||
|     file_name: str, expected_value: FileType, ctx_mime_type_: Mock | ||||
| ): | ||||
|     # -- disable MIME-type guessing by making libmagic always guess `None` -- | ||||
| @ -525,10 +341,8 @@ def test_it_detects_correct_file_type_from_strategy_3_when_extension_maps_to_fil | ||||
| @pytest.mark.parametrize( | ||||
|     ("expected_value", "file_name", "mime_type"), | ||||
|     [ | ||||
|         (FileType.BMP, "img/bmp_24.bmp", "application/zip"), | ||||
|         (FileType.DOC, "simple.doc", None), | ||||
|         (FileType.EPUB, "winter-sports.epub", "application/x-ole-storage"), | ||||
|         (FileType.MSG, "fake-email.msg", "application/octet-stream"), | ||||
|         (FileType.BMP, "img/bmp_24.bmp", "application/octet-stream"), | ||||
|         (FileType.HEIC, "img/DA-1p.heic", "application/octet-stream"), | ||||
|     ], | ||||
| ) | ||||
| def test_it_falls_back_to_extension_strategy_when_prior_strategies_fail( | ||||
| @ -547,6 +361,12 @@ def test_it_falls_back_to_extension_strategy_when_prior_strategies_fail( | ||||
| # ================================================================================================ | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize("mime_type", [FileType.XLS.mime_type, FileType.XLSX.mime_type]) | ||||
| def test_it_ignores_asserted_XLS_content_type_when_file_is_CSV(mime_type: str): | ||||
|     file_path = example_doc_path("stanley-cups.csv") | ||||
|     assert detect_filetype(file_path, content_type=mime_type) == FileType.CSV | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize("mime_type", ["application/xml", "text/xml"]) | ||||
| @pytest.mark.parametrize("extension", [".html", ".htm"]) | ||||
| def test_it_detects_HTML_from_guessed_mime_type_ending_with_xml_and_html_extension( | ||||
| @ -563,39 +383,6 @@ def test_it_detects_HTML_from_guessed_mime_type_ending_with_xml_and_html_extensi | ||||
|     assert file_type is FileType.HTML | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize( | ||||
|     "mime_type", | ||||
|     [ | ||||
|         "application/octet-stream", | ||||
|         "application/zip", | ||||
|         "application/vnd.openxmlformats-officedocument.wordprocessingml.document", | ||||
|         "application/vnd.openxmlformats-officedocument.presentationml.presentation", | ||||
|         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", | ||||
|     ], | ||||
| ) | ||||
| @pytest.mark.parametrize( | ||||
|     ("expected_value", "file_name"), | ||||
|     [ | ||||
|         (FileType.DOCX, "simple.docx"), | ||||
|         (FileType.PPTX, "fake-power-point.pptx"), | ||||
|         (FileType.XLSX, "stanley-cups.xlsx"), | ||||
|         (FileType.ZIP, "simple.zip"), | ||||
|     ], | ||||
| ) | ||||
| def test_it_differentiates_files_when_libmagic_guesses_octet_stream_zip_or_modern_ms_office( | ||||
|     mime_type: str, file_name: str, expected_value: FileType, ctx_mime_type_: Mock | ||||
| ): | ||||
|     ctx_mime_type_.return_value = mime_type | ||||
|     # -- disable extension-based strategy #3 -- | ||||
|     with open(example_doc_path(file_name), "rb") as f: | ||||
|         file = io.BytesIO(f.read()) | ||||
| 
 | ||||
|     file_type = detect_filetype(file=file) | ||||
| 
 | ||||
|     ctx_mime_type_.assert_called_with() | ||||
|     assert file_type is expected_value | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize( | ||||
|     ("mime_type", "file_name"), | ||||
|     [ | ||||
| @ -1000,29 +787,8 @@ class Describe_FileTypeDetectionContext: | ||||
|         return property_mock(request, _FileTypeDetectionContext, "mime_type") | ||||
| 
 | ||||
| 
 | ||||
| class Describe_OleFileDifferentiator: | ||||
|     """Unit-test suite for `unstructured.file_utils.filetype._OleFileDifferentiator`.""" | ||||
| 
 | ||||
|     # -- .applies() --------------------------------------------- | ||||
| 
 | ||||
|     def it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self): | ||||
|         """The constructor determines whether this differentiator is applicable. | ||||
| 
 | ||||
|         It returns an instance only when differentiating a CFBF file-type is required, which it | ||||
|         judges by inspecting the initial bytes of the file for the CFBF magic-bytes. | ||||
|         """ | ||||
|         ctx = _FileTypeDetectionContext(example_doc_path("simple.doc")) | ||||
| 
 | ||||
|         differentiator = _OleFileDifferentiator.applies(ctx, "foo/bar") | ||||
| 
 | ||||
|         assert differentiator is not None | ||||
|         assert isinstance(differentiator, _OleFileDifferentiator) | ||||
| 
 | ||||
|     def and_it_returns_None_when_ole_differentiation_is_not_applicable_to_the_mime_type(self): | ||||
|         ctx = _FileTypeDetectionContext(example_doc_path("winter-sports.epub")) | ||||
|         assert _OleFileDifferentiator.applies(ctx, "application/epub") is None | ||||
| 
 | ||||
|     # -- .file_type --------------------------------------------- | ||||
| class Describe_OleFileDetector: | ||||
|     """Unit-test suite for `unstructured.file_utils.filetype._OleFileDetector`.""" | ||||
| 
 | ||||
|     @pytest.mark.parametrize( | ||||
|         ("file_name", "expected_value"), | ||||
| @ -1034,59 +800,15 @@ class Describe_OleFileDifferentiator: | ||||
|             ("README.org", None), | ||||
|         ], | ||||
|     ) | ||||
|     def it_distinguishes_the_file_type_of_applicable_OLE_files( | ||||
|     def it_distinguishes_the_file_type_of_applicable_CFB_files( | ||||
|         self, file_name: str, expected_value: FileType | None | ||||
|     ): | ||||
|         # -- no file-name available, just to make sure we're not relying on an extension -- | ||||
|         with open(example_doc_path(file_name), "rb") as f: | ||||
|             file = io.BytesIO(f.read()) | ||||
|         ctx = _FileTypeDetectionContext(file=file) | ||||
|         differentiator = _OleFileDifferentiator(ctx) | ||||
| 
 | ||||
|         assert differentiator.file_type is expected_value | ||||
| 
 | ||||
|     @pytest.mark.parametrize( | ||||
|         ("file_name", "expected_value"), | ||||
|         [ | ||||
|             ("simple.doc", FileType.DOC), | ||||
|             ("fake-power-point.ppt", FileType.PPT), | ||||
|             ("tests-example.xls", FileType.XLS), | ||||
|             ("fake-email.msg", FileType.MSG), | ||||
|         ], | ||||
|     ) | ||||
|     def it_distinguishes_the_file_type_of_applicable_OLE_files_from_storage_content( | ||||
|         self, file_name: str, expected_value: FileType | None | ||||
|     ): | ||||
|         # -- no file-name available, just to make sure we're not relying on an extension -- | ||||
|         with open(example_doc_path(file_name), "rb") as f: | ||||
|             file = io.BytesIO(f.read()) | ||||
|         ctx = _FileTypeDetectionContext(file=file) | ||||
|         differentiator = _OleFileDifferentiator(ctx) | ||||
| 
 | ||||
|         assert differentiator._check_ole_file_type(ctx) is expected_value | ||||
| 
 | ||||
|     def but_it_returns_None_to_engage_fallback_when_filetype_cannot_guess_mime( | ||||
|         self, guess_mime_: Mock | ||||
|     ): | ||||
|         guess_mime_.return_value = None | ||||
|         # -- no file-name available, just to make sure we're not relying on an extension -- | ||||
|         with open(example_doc_path("fake-email.msg"), "rb") as f: | ||||
|             file = io.BytesIO(f.read()) | ||||
|         ctx = _FileTypeDetectionContext(file=file) | ||||
|         differentiator = _OleFileDifferentiator(ctx) | ||||
|         # -- force method to return None to trigger the mime type being guessed | ||||
|         differentiator._check_ole_file_type = lambda ctx: None | ||||
| 
 | ||||
|         file_type = differentiator.file_type | ||||
| 
 | ||||
|         guess_mime_.assert_called_once_with(file) | ||||
|         assert file_type is None | ||||
| 
 | ||||
|     # -- fixtures -------------------------------------------------------------------------------- | ||||
| 
 | ||||
|     @pytest.fixture | ||||
|     def guess_mime_(self, request: FixtureRequest): | ||||
|         return function_mock(request, "unstructured.file_utils.filetype.ft.guess_mime") | ||||
|         assert _OleFileDetector.file_type(ctx) is expected_value | ||||
| 
 | ||||
| 
 | ||||
| class Describe_TextFileDifferentiator: | ||||
| @ -1164,33 +886,15 @@ class Describe_TextFileDifferentiator: | ||||
|         assert differentiator._is_json is expected_value | ||||
| 
 | ||||
| 
 | ||||
| class Describe_ZipFileDifferentiator: | ||||
|     """Unit-test suite for `unstructured.file_utils.filetype._ZipFileDifferentiator`.""" | ||||
| 
 | ||||
|     # -- .applies() --------------------------------------------- | ||||
| 
 | ||||
|     def it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self): | ||||
|         """The constructor determines whether this differentiator is applicable. | ||||
| 
 | ||||
|         It returns an instance only when differentiating a zip file-type is required, which it can | ||||
|         judge from the mime-type provided by the context (`ctx`). | ||||
|         """ | ||||
|         ctx = _FileTypeDetectionContext(example_doc_path("simple.docx")) | ||||
| 
 | ||||
|         differentiator = _ZipFileDifferentiator.applies(ctx, "application/zip") | ||||
| 
 | ||||
|         assert isinstance(differentiator, _ZipFileDifferentiator) | ||||
| 
 | ||||
|     def and_it_returns_None_when_zip_differentiation_does_not_apply_to_the_detection_context(self): | ||||
|         ctx = _FileTypeDetectionContext(example_doc_path("norwich-city.txt")) | ||||
|         assert _ZipFileDifferentiator.applies(ctx, "application/epub") is None | ||||
| 
 | ||||
|     # -- .file_type --------------------------------------------- | ||||
| class Describe_ZipFileDetector: | ||||
|     """Unit-test suite for `unstructured.file_utils.filetype._ZipFileDetector`.""" | ||||
| 
 | ||||
|     @pytest.mark.parametrize( | ||||
|         ("file_name", "expected_value"), | ||||
|         [ | ||||
|             ("simple.docx", FileType.DOCX), | ||||
|             ("winter-sports.epub", FileType.EPUB), | ||||
|             ("simple.odt", FileType.ODT), | ||||
|             ("picture.pptx", FileType.PPTX), | ||||
|             ("vodafone.xlsx", FileType.XLSX), | ||||
|             ("simple.zip", FileType.ZIP), | ||||
| @ -1201,6 +905,4 @@ class Describe_ZipFileDifferentiator: | ||||
|         self, file_name: str, expected_value: FileType | None | ||||
|     ): | ||||
|         ctx = _FileTypeDetectionContext(example_doc_path(file_name)) | ||||
|         differentiator = _ZipFileDifferentiator(ctx) | ||||
| 
 | ||||
|         assert differentiator.file_type is expected_value | ||||
|         assert _ZipFileDetector.file_type(ctx) is expected_value | ||||
|  | ||||
| @ -1 +1 @@ | ||||
| __version__ = "0.16.12-dev2"  # pragma: no cover | ||||
| __version__ = "0.16.12-dev3"  # pragma: no cover | ||||
|  | ||||
| @ -51,7 +51,11 @@ from unstructured.partition.common.common import add_element_metadata, exactly_o | ||||
| from unstructured.partition.common.metadata import set_element_hierarchy | ||||
| from unstructured.utils import get_call_args_applying_defaults, lazyproperty | ||||
| 
 | ||||
| LIBMAGIC_AVAILABLE = bool(importlib.util.find_spec("magic")) | ||||
| try: | ||||
|     importlib.import_module("magic") | ||||
|     LIBMAGIC_AVAILABLE = True | ||||
| except ImportError: | ||||
|     LIBMAGIC_AVAILABLE = False  # pyright: ignore[reportConstantRedefinition] | ||||
| 
 | ||||
| 
 | ||||
| def detect_filetype( | ||||
| @ -133,43 +137,57 @@ class _FileTypeDetector: | ||||
|     @property | ||||
|     def _file_type(self) -> FileType: | ||||
|         """FileType member corresponding to this document source.""" | ||||
|         # -- strategy 1: use content-type asserted by caller -- | ||||
|         # -- An explicit content-type is most commonly asserted by the client/SDK and is therefore | ||||
|         # -- inherently unreliable. On the other hand, binary file-types can be detected with 100% | ||||
|         # -- accuracy. So start with binary types and only then consider an asserted content-type, | ||||
|         # -- generally as a last resort. | ||||
| 
 | ||||
|         # -- strategy 1: most binary types can be detected with 100% accuracy -- | ||||
|         if file_type := self._known_binary_file_type: | ||||
|             return file_type | ||||
| 
 | ||||
|         # -- strategy 2: use content-type asserted by caller -- | ||||
|         if file_type := self._file_type_from_content_type: | ||||
|             return file_type | ||||
| 
 | ||||
|         # -- strategy 2: guess MIME-type using libmagic and use that -- | ||||
|         # -- strategy 3: guess MIME-type using libmagic and use that -- | ||||
|         if file_type := self._file_type_from_guessed_mime_type: | ||||
|             return file_type | ||||
| 
 | ||||
|         # -- strategy 3: use filename-extension, like ".docx" -> FileType.DOCX -- | ||||
|         # -- strategy 4: use filename-extension, like ".docx" -> FileType.DOCX -- | ||||
|         if file_type := self._file_type_from_file_extension: | ||||
|             return file_type | ||||
| 
 | ||||
|         # -- strategy 4: give up and report FileType.UNK -- | ||||
|         # -- strategy 5: give up and report FileType.UNK -- | ||||
|         return FileType.UNK | ||||
| 
 | ||||
|     # == STRATEGIES ============================================================ | ||||
| 
 | ||||
|     @property | ||||
|     def _file_type_from_content_type(self) -> FileType | None: | ||||
|         """Map passed content-type argument to a file-type, subject to certain rules.""" | ||||
|         content_type = self._ctx.content_type | ||||
|     def _known_binary_file_type(self) -> FileType | None: | ||||
|         """Detect file-type for binary types we can positively detect.""" | ||||
|         if file_type := _OleFileDetector.file_type(self._ctx): | ||||
|             return file_type | ||||
| 
 | ||||
|         self._ctx.rule_out_cfb_content_types() | ||||
| 
 | ||||
|         if file_type := _ZipFileDetector.file_type(self._ctx): | ||||
|             return file_type | ||||
| 
 | ||||
|         self._ctx.rule_out_zip_content_types() | ||||
| 
 | ||||
|         # -- when no content-type was asserted by caller, this strategy is not applicable -- | ||||
|         if not content_type: | ||||
|         return None | ||||
| 
 | ||||
|         # -- OLE-based file-format content_type values are sometimes unreliable. These are | ||||
|         # -- DOC, PPT, XLS, and MSG. | ||||
|         if differentiator := _OleFileDifferentiator.applies(self._ctx, content_type): | ||||
|             return differentiator.file_type | ||||
|     @property | ||||
|     def _file_type_from_content_type(self) -> FileType | None: | ||||
|         """Map passed content-type argument to a file-type, subject to certain rules.""" | ||||
| 
 | ||||
|         # -- MS-Office 2007+ (OpenXML) content_type value is sometimes unreliable -- | ||||
|         if differentiator := _ZipFileDifferentiator.applies(self._ctx, content_type): | ||||
|             return differentiator.file_type | ||||
|         # -- when no content-type was asserted by caller, this strategy is not applicable -- | ||||
|         if not self._ctx.content_type: | ||||
|             return None | ||||
| 
 | ||||
|         # -- otherwise we trust the passed `content_type` as long as `FileType` recognizes it -- | ||||
|         return FileType.from_mime_type(content_type) | ||||
|         return FileType.from_mime_type(self._ctx.content_type) | ||||
| 
 | ||||
|     @property | ||||
|     def _file_type_from_guessed_mime_type(self) -> FileType | None: | ||||
| @ -188,24 +206,12 @@ class _FileTypeDetector: | ||||
|         if mime_type is None: | ||||
|             return None | ||||
| 
 | ||||
|         if differentiator := _OleFileDifferentiator.applies(self._ctx, mime_type): | ||||
|             return differentiator.file_type | ||||
| 
 | ||||
|         if mime_type.endswith("xml"): | ||||
|             return FileType.HTML if extension in (".html", ".htm") else FileType.XML | ||||
| 
 | ||||
|         if differentiator := _TextFileDifferentiator.applies(self._ctx): | ||||
|             return differentiator.file_type | ||||
| 
 | ||||
|         # -- applicable to "application/octet-stream", "application/zip", and all Office 2007+ | ||||
|         # -- document MIME-types, i.e. those for DOCX, PPTX, and XLSX. Note however it does NOT | ||||
|         # -- apply to EPUB or ODT documents, even though those are also Zip archives. The zip and | ||||
|         # -- octet-stream MIME-types are fed in because they are ambiguous. The MS-Office types are | ||||
|         # -- differentiated because they are sometimes mistaken for each other, like DOCX mime-type | ||||
|         # -- is actually a PPTX file etc. | ||||
|         if differentiator := _ZipFileDifferentiator.applies(self._ctx, mime_type): | ||||
|             return differentiator.file_type | ||||
| 
 | ||||
|         # -- All source-code files (e.g. *.py, *.js) are classified as plain text for the moment -- | ||||
|         if self._ctx.has_code_mime_type: | ||||
|             return FileType.TXT | ||||
| @ -214,14 +220,8 @@ class _FileTypeDetector: | ||||
|             return FileType.EMPTY | ||||
| 
 | ||||
|         # -- if no more-specific rules apply, use the MIME-type -> FileType mapping when present -- | ||||
|         if file_type := FileType.from_mime_type(mime_type): | ||||
|             return file_type | ||||
| 
 | ||||
|         logger.warning( | ||||
|             f"The MIME type{f' of {self._ctx.file_path!r}' if self._ctx.file_path else ''} is" | ||||
|             f" {mime_type!r}. This file type is not currently supported in unstructured.", | ||||
|         ) | ||||
|         return None | ||||
|         file_type = FileType.from_mime_type(mime_type) | ||||
|         return file_type if file_type != FileType.UNK else None | ||||
| 
 | ||||
|     @lazyproperty | ||||
|     def _file_type_from_file_extension(self) -> FileType | None: | ||||
| @ -236,6 +236,9 @@ class _FileTypeDetector: | ||||
| class _FileTypeDetectionContext: | ||||
|     """Provides all arguments to auto-file detection and values derived from them. | ||||
| 
 | ||||
|     NOTE that `._content_type` is mutable via `.rule_out_*_content_types()` methods, so it should | ||||
|     not be assumed to be a constant value across those calls. | ||||
| 
 | ||||
|     This keeps computation of derived values out of the file-detection code but more importantly | ||||
|     allows the main filetype-detector to pass the full context to any delegates without coupling | ||||
|     itself to which values it might need. | ||||
| @ -276,7 +279,7 @@ class _FileTypeDetectionContext: | ||||
|         self._validate() | ||||
|         return self | ||||
| 
 | ||||
|     @lazyproperty | ||||
|     @property | ||||
|     def content_type(self) -> str | None: | ||||
|         """MIME-type asserted by caller; not based on inspection of file by this process. | ||||
| 
 | ||||
| @ -284,6 +287,8 @@ class _FileTypeDetectionContext: | ||||
|         present on the response. These are often ambiguous and sometimes just wrong so get some | ||||
|         further verification. All lower-case when not `None`. | ||||
|         """ | ||||
|         # -- Note `._content_type` is mutable via `.rule_out_*_content_types()` so this cannot be a | ||||
|         # -- `@lazyproperty`. | ||||
|         return self._content_type.lower() if self._content_type else None | ||||
| 
 | ||||
|     @lazyproperty | ||||
| @ -327,12 +332,6 @@ class _FileTypeDetectionContext: | ||||
| 
 | ||||
|         return os.path.realpath(file_path) if os.path.islink(file_path) else file_path | ||||
| 
 | ||||
|     @lazyproperty | ||||
|     def is_zipfile(self) -> bool: | ||||
|         """True when file is a Zip archive.""" | ||||
|         with self.open() as file: | ||||
|             return zipfile.is_zipfile(file) | ||||
| 
 | ||||
|     @lazyproperty | ||||
|     def has_code_mime_type(self) -> bool: | ||||
|         """True when `mime_type` plausibly indicates a programming language source-code file.""" | ||||
| @ -347,9 +346,27 @@ class _FileTypeDetectionContext: | ||||
| 
 | ||||
|         return any( | ||||
|             lang in mime_type | ||||
|             for lang in "c# c++ cpp csharp java javascript php python ruby swift typescript".split() | ||||
|             for lang in [ | ||||
|                 "c#", | ||||
|                 "c++", | ||||
|                 "cpp", | ||||
|                 "csharp", | ||||
|                 "java", | ||||
|                 "javascript", | ||||
|                 "php", | ||||
|                 "python", | ||||
|                 "ruby", | ||||
|                 "swift", | ||||
|                 "typescript", | ||||
|             ] | ||||
|         ) | ||||
| 
 | ||||
|     @lazyproperty | ||||
|     def is_zipfile(self) -> bool: | ||||
|         """True when file is a Zip archive.""" | ||||
|         with self.open() as file: | ||||
|             return zipfile.is_zipfile(file) | ||||
| 
 | ||||
|     @lazyproperty | ||||
|     def mime_type(self) -> str | None: | ||||
|         """The best MIME-type we can get from `magic` (or `filetype` package). | ||||
| @ -401,6 +418,38 @@ class _FileTypeDetectionContext: | ||||
|             file.seek(0) | ||||
|             yield file | ||||
| 
 | ||||
|     def rule_out_cfb_content_types(self) -> None: | ||||
|         """Invalidate content-type when a legacy MS-Office file-type is asserted. | ||||
| 
 | ||||
|         Used before returning `None`; at that point we know the file is not one of these formats | ||||
|         so if the asserted `content-type` is a legacy MS-Office type we know it's wrong and should | ||||
|         not be used as a fallback later in the detection process. | ||||
|         """ | ||||
|         if FileType.from_mime_type(self._content_type) in ( | ||||
|             FileType.DOC, | ||||
|             FileType.MSG, | ||||
|             FileType.PPT, | ||||
|             FileType.XLS, | ||||
|         ): | ||||
|             self._content_type = None | ||||
| 
 | ||||
|     def rule_out_zip_content_types(self) -> None: | ||||
|         """Invalidate content-type when an MS-Office 2007+ file-type is asserted. | ||||
| 
 | ||||
|         Used before returning `None`; at that point we know the file is not one of these formats | ||||
|         so if the asserted `content-type` is an MS-Office 2007+ type we know it's wrong and should | ||||
|         not be used as a fallback later in the detection process. | ||||
|         """ | ||||
|         if FileType.from_mime_type(self._content_type) in ( | ||||
|             FileType.DOCX, | ||||
|             FileType.EPUB, | ||||
|             FileType.ODT, | ||||
|             FileType.PPTX, | ||||
|             FileType.XLSX, | ||||
|             FileType.ZIP, | ||||
|         ): | ||||
|             self._content_type = None | ||||
| 
 | ||||
|     @lazyproperty | ||||
|     def text_head(self) -> str: | ||||
|         """The initial characters of the text file for use with text-format differentiation. | ||||
| @ -440,27 +489,23 @@ class _FileTypeDetectionContext: | ||||
|             raise ValueError("either `file_path` or `file` argument must be provided") | ||||
| 
 | ||||
| 
 | ||||
| class _OleFileDifferentiator: | ||||
|     """Refine an OLE-storage package (CFBF) file-type that may not be as specific as it could be. | ||||
| class _OleFileDetector: | ||||
|     """Detect and differentiate a CFB file, aka. "OLE" file. | ||||
| 
 | ||||
|     Compound File Binary Format (CFBF), aka. OLE file, is used by Microsoft for legacy MS Office | ||||
|     files (DOC, PPT, XLS) as well as for Outlook MSG files. `libmagic` tends to identify these as | ||||
|     `"application/x-ole-storage"` which is true but not specific enough for partitioning | ||||
|     purposes. | ||||
|     Compound File Binary Format (CFB), aka. OLE file, is used by Microsoft for legacy MS Office | ||||
|     files (DOC, PPT, XLS) as well as for Outlook MSG files. | ||||
|     """ | ||||
| 
 | ||||
|     def __init__(self, ctx: _FileTypeDetectionContext): | ||||
|         self._ctx = ctx | ||||
| 
 | ||||
|     @classmethod | ||||
|     def applies( | ||||
|         cls, ctx: _FileTypeDetectionContext, mime_type: str | ||||
|     ) -> _OleFileDifferentiator | None: | ||||
|         """Constructs an instance, but only if this differentiator applies for `mime_type`.""" | ||||
|         return cls(ctx) if cls._is_ole_file(ctx) else None | ||||
|     def file_type(cls, ctx: _FileTypeDetectionContext) -> FileType | None: | ||||
|         """Specific file-type when file is a CFB file, `None` otherwise.""" | ||||
|         return cls(ctx)._file_type | ||||
| 
 | ||||
|     @property | ||||
|     def file_type(self) -> FileType | None: | ||||
|     def _file_type(self) -> FileType | None: | ||||
|         """Differentiated file-type for Microsoft Compound File Binary Format (CFBF). | ||||
| 
 | ||||
|         Returns one of: | ||||
| @ -468,34 +513,27 @@ class _OleFileDifferentiator: | ||||
|         - `FileType.PPT` | ||||
|         - `FileType.XLS` | ||||
|         - `FileType.MSG` | ||||
|         - `None` when the file is not one of these. | ||||
|         """ | ||||
|         # -- if this is not a CFBF file then whatever MIME-type was guessed is wrong, so return | ||||
|         # -- `None` to trigger fall-back to next strategy. | ||||
|         if not self._is_ole_file(self._ctx): | ||||
|         # -- all CFB files share common magic number, start with that -- | ||||
|         if not self._is_ole_file: | ||||
|             return None | ||||
| 
 | ||||
|         # -- check storage contents of the ole file for file type markers | ||||
|         if (ole_file_type := self._check_ole_file_type(self._ctx)) is not None: | ||||
|         # -- check storage contents of the ole file for file-type specific stream names -- | ||||
|         if (ole_file_type := self._ole_file_type) is not None: | ||||
|             return ole_file_type | ||||
| 
 | ||||
|         # -- `filetype` lib is better at legacy MS-Office files than `libmagic`, so we rely on it | ||||
|         # -- to differentiate those. Note `filetype` doesn't detect MSG type and won't always | ||||
|         # -- detect DOC, PPT, or XLS, returning `None` instead. We let those fall through and we | ||||
|         # -- rely on filename-extension to identify those. | ||||
|         return None | ||||
| 
 | ||||
|     @lazyproperty | ||||
|     def _is_ole_file(self) -> bool: | ||||
|         """True when file has CFB magic first 8 bytes.""" | ||||
|         with self._ctx.open() as file: | ||||
|             mime_type = ft.guess_mime(file) | ||||
| 
 | ||||
|         return FileType.from_mime_type(mime_type) if mime_type else None | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def _is_ole_file(ctx: _FileTypeDetectionContext) -> bool: | ||||
|         """True when file has CFBF magic first 8 bytes.""" | ||||
|         with ctx.open() as file: | ||||
|             return file.read(8) == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def _check_ole_file_type(ctx: _FileTypeDetectionContext) -> FileType | None: | ||||
|         with ctx.open() as f: | ||||
|     @lazyproperty | ||||
|     def _ole_file_type(self) -> FileType | None: | ||||
|         with self._ctx.open() as f: | ||||
|             ole = OleFileIO(f)  # pyright: ignore[reportUnknownVariableType] | ||||
|             root_storage = Storage.from_ole(ole)  # pyright: ignore[reportUnknownMemberType] | ||||
| 
 | ||||
| @ -537,7 +575,20 @@ class _TextFileDifferentiator: | ||||
|         """ | ||||
|         extension = self._ctx.extension | ||||
| 
 | ||||
|         if extension in ".csv .eml .html .json .md .org .p7s .rst .rtf .tab .tsv".split(): | ||||
|         if extension in [ | ||||
|             ".csv", | ||||
|             ".eml", | ||||
|             ".html", | ||||
|             ".json", | ||||
|             ".markdown", | ||||
|             ".md", | ||||
|             ".org", | ||||
|             ".p7s", | ||||
|             ".rst", | ||||
|             ".rtf", | ||||
|             ".tab", | ||||
|             ".tsv", | ||||
|         ]: | ||||
|             return FileType.from_extension(extension) or FileType.TXT | ||||
| 
 | ||||
|         # NOTE(crag): for older versions of the OS libmagic package, such as is currently | ||||
| @ -616,40 +667,28 @@ class _TextFileDifferentiator: | ||||
|             return False | ||||
| 
 | ||||
| 
 | ||||
| class _ZipFileDifferentiator: | ||||
|     """Refine a Zip-packaged file-type that may be ambiguous or swapped.""" | ||||
| class _ZipFileDetector: | ||||
|     """Detect and differentiate a Zip-archive file.""" | ||||
| 
 | ||||
|     def __init__(self, ctx: _FileTypeDetectionContext): | ||||
|         self._ctx = ctx | ||||
| 
 | ||||
|     @classmethod | ||||
|     def applies( | ||||
|         cls, ctx: _FileTypeDetectionContext, mime_type: str | ||||
|     ) -> _ZipFileDifferentiator | None: | ||||
|         """Constructs an instance, but only if this differentiator applies for `mime_type`. | ||||
|     def file_type(cls, ctx: _FileTypeDetectionContext) -> FileType | None: | ||||
|         """Most specific file-type available when file is a Zip file, `None` otherwise. | ||||
| 
 | ||||
|         Separate `mime_type` argument allows it to be applied to either asserted content-type or | ||||
|         guessed mime-type. | ||||
|         MS-Office 2007+ files are detected with 100% accuracy. ODT and EPUB files are identified | ||||
|         by the MIME-type stored in the `mimetype` member at the archive root. Any other Zip | ||||
|         archive is reported as `FileType.ZIP`. | ||||
|         """ | ||||
|         return ( | ||||
|             cls(ctx) | ||||
|             if mime_type | ||||
|             in ( | ||||
|                 "application/octet-stream", | ||||
|                 "application/zip", | ||||
|                 "application/vnd.openxmlformats-officedocument.wordprocessingml.document", | ||||
|                 "application/vnd.openxmlformats-officedocument.presentationml.presentation", | ||||
|                 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", | ||||
|             ) | ||||
|             else None | ||||
|         ) | ||||
|         return cls(ctx)._file_type | ||||
| 
 | ||||
|     @lazyproperty | ||||
|     def file_type(self) -> FileType | None: | ||||
|     def _file_type(self) -> FileType | None: | ||||
|         """Differentiated file-type for a Zip archive. | ||||
| 
 | ||||
|         Returns `None` if the file is not a Zip archive. Otherwise it returns `FileType.DOCX`, | ||||
|         `FileType.PPTX`, or `FileType.XLSX` when one of those applies and `FileType.ZIP` otherwise. | ||||
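|         Returns `None` if the file is not a Zip archive. Otherwise it returns `FileType.DOCX`, | ||||
|         `FileType.PPTX`, or `FileType.XLSX` when one of those applies, the file-type named by the | ||||
|         archive's root `mimetype` member when present (ODT and EPUB), and `FileType.ZIP` otherwise. | ||||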
|         """ | ||||
|         if not self._ctx.is_zipfile: | ||||
|             return None | ||||
| @ -657,20 +696,23 @@ class _ZipFileDifferentiator: | ||||
|         with self._ctx.open() as file: | ||||
|             zip = zipfile.ZipFile(file) | ||||
| 
 | ||||
|             # NOTE(robinson) - .docx and .xlsx files are actually a zip file with a .docx/.xslx | ||||
|             # extension. If the MIME type is application/octet-stream, we check if it's a | ||||
|             # .docx/.xlsx file by looking for expected filenames within the zip file. | ||||
|             filenames = [f.filename for f in zip.filelist] | ||||
|             filenames = zip.namelist() | ||||
| 
 | ||||
|             if all(f in filenames for f in ("word/document.xml",)): | ||||
|             if "word/document.xml" in filenames: | ||||
|                 return FileType.DOCX | ||||
| 
 | ||||
|             if all(f in filenames for f in ("xl/workbook.xml",)): | ||||
|             if "xl/workbook.xml" in filenames: | ||||
|                 return FileType.XLSX | ||||
| 
 | ||||
|             if all(f in filenames for f in ("ppt/presentation.xml",)): | ||||
|             if "ppt/presentation.xml" in filenames: | ||||
|                 return FileType.PPTX | ||||
| 
 | ||||
|             # -- ODT and EPUB files place their MIME-type in `mimetype` in the archive root -- | ||||
|             if "mimetype" in filenames: | ||||
|                 with zip.open("mimetype") as f: | ||||
|                     mime_type = f.read().decode("utf-8").strip() | ||||
|                     return FileType.from_mime_type(mime_type) | ||||
| 
 | ||||
|         return FileType.ZIP | ||||
| 
 | ||||
| 
 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Steve Canny
						Steve Canny