mirror of
https://github.com/microsoft/markitdown.git
synced 2025-06-26 22:00:21 +00:00
Chore: Make linter happy (#1256)
* refactor: remove unused imports * fix: replace NotImplemented with NotImplementedError * refactor: resolve E722 (do not use bare 'except') * refactor: remove unused variable * refactor: remove unused imports * refactor: ignore unused imports that will be used in the future * refactor: resolve W293 (blank line contains whitespace) * refactor: resolve F541 (f-string is missing placeholders) --------- Co-authored-by: afourney <adamfo@microsoft.com>
This commit is contained in:
parent
39e7252940
commit
cb421cf9ea
@ -1,5 +1,4 @@
|
|||||||
import sys
|
import sys
|
||||||
from typing import Any
|
|
||||||
from mcp.server.fastmcp import FastMCP
|
from mcp.server.fastmcp import FastMCP
|
||||||
from starlette.applications import Starlette
|
from starlette.applications import Starlette
|
||||||
from mcp.server.sse import SseServerTransport
|
from mcp.server.sse import SseServerTransport
|
||||||
|
@ -1,6 +1,5 @@
|
|||||||
#!/usr/bin/env python3 -m pytest
|
#!/usr/bin/env python3 -m pytest
|
||||||
import os
|
import os
|
||||||
import pytest
|
|
||||||
|
|
||||||
from markitdown import MarkItDown, StreamInfo
|
from markitdown import MarkItDown, StreamInfo
|
||||||
from markitdown_sample_plugin import RtfConverter
|
from markitdown_sample_plugin import RtfConverter
|
||||||
|
@ -4,7 +4,6 @@
|
|||||||
import argparse
|
import argparse
|
||||||
import sys
|
import sys
|
||||||
import codecs
|
import codecs
|
||||||
import locale
|
|
||||||
from textwrap import dedent
|
from textwrap import dedent
|
||||||
from importlib.metadata import entry_points
|
from importlib.metadata import entry_points
|
||||||
from .__about__ import __version__
|
from .__about__ import __version__
|
||||||
@ -34,13 +33,13 @@ def main():
|
|||||||
OR
|
OR
|
||||||
|
|
||||||
markitdown < example.pdf
|
markitdown < example.pdf
|
||||||
|
|
||||||
OR to save to a file use
|
OR to save to a file use
|
||||||
|
|
||||||
markitdown example.pdf -o example.md
|
markitdown example.pdf -o example.md
|
||||||
|
|
||||||
OR
|
OR
|
||||||
|
|
||||||
markitdown example.pdf > example.md
|
markitdown example.pdf > example.md
|
||||||
"""
|
"""
|
||||||
).strip(),
|
).strip(),
|
||||||
|
@ -1,7 +1,4 @@
|
|||||||
import os
|
from typing import Any, BinaryIO, Optional
|
||||||
import tempfile
|
|
||||||
from warnings import warn
|
|
||||||
from typing import Any, Union, BinaryIO, Optional, List
|
|
||||||
from ._stream_info import StreamInfo
|
from ._stream_info import StreamInfo
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,11 +1,8 @@
|
|||||||
import copy
|
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import shutil
|
import shutil
|
||||||
import tempfile
|
|
||||||
import warnings
|
|
||||||
import traceback
|
import traceback
|
||||||
import io
|
import io
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
@ -547,7 +544,7 @@ class MarkItDown:
|
|||||||
# Sanity check -- make sure the cur_pos is still the same
|
# Sanity check -- make sure the cur_pos is still the same
|
||||||
assert (
|
assert (
|
||||||
cur_pos == file_stream.tell()
|
cur_pos == file_stream.tell()
|
||||||
), f"File stream position should NOT change between guess iterations"
|
), "File stream position should NOT change between guess iterations"
|
||||||
|
|
||||||
_kwargs = {k: v for k, v in kwargs.items()}
|
_kwargs = {k: v for k, v in kwargs.items()}
|
||||||
|
|
||||||
@ -614,7 +611,7 @@ class MarkItDown:
|
|||||||
|
|
||||||
# Nothing can handle it!
|
# Nothing can handle it!
|
||||||
raise UnsupportedFormatException(
|
raise UnsupportedFormatException(
|
||||||
f"Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
|
"Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
|
||||||
)
|
)
|
||||||
|
|
||||||
def register_page_converter(self, converter: DocumentConverter) -> None:
|
def register_page_converter(self, converter: DocumentConverter) -> None:
|
||||||
|
@ -272,7 +272,7 @@ class oMath2Latex(Tag2Method):
|
|||||||
if FUNC.get(t):
|
if FUNC.get(t):
|
||||||
latex_chars.append(FUNC[t])
|
latex_chars.append(FUNC[t])
|
||||||
else:
|
else:
|
||||||
raise NotImplemented("Not support func %s" % t)
|
raise NotImplementedError("Not support func %s" % t)
|
||||||
else:
|
else:
|
||||||
latex_chars.append(t)
|
latex_chars.append(t)
|
||||||
t = BLANK.join(latex_chars)
|
t = BLANK.join(latex_chars)
|
||||||
@ -316,7 +316,7 @@ class oMath2Latex(Tag2Method):
|
|||||||
t_dict = self.process_children_dict(elm, include=("e", "lim"))
|
t_dict = self.process_children_dict(elm, include=("e", "lim"))
|
||||||
latex_s = LIM_FUNC.get(t_dict["e"])
|
latex_s = LIM_FUNC.get(t_dict["e"])
|
||||||
if not latex_s:
|
if not latex_s:
|
||||||
raise NotImplemented("Not support lim %s" % t_dict["e"])
|
raise NotImplementedError("Not support lim %s" % t_dict["e"])
|
||||||
else:
|
else:
|
||||||
return latex_s.format(lim=t_dict.get("lim"))
|
return latex_s.format(lim=t_dict.get("lim"))
|
||||||
|
|
||||||
|
@ -147,7 +147,7 @@ def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
|
|||||||
updated_content = _pre_process_math(content)
|
updated_content = _pre_process_math(content)
|
||||||
# In the future, if there are more pre-processing steps, they can be added here
|
# In the future, if there are more pre-processing steps, they can be added here
|
||||||
zip_output.writestr(name, updated_content)
|
zip_output.writestr(name, updated_content)
|
||||||
except:
|
except Exception:
|
||||||
# If there is an error in processing the content, write the original content
|
# If there is an error in processing the content, write the original content
|
||||||
zip_output.writestr(name, content)
|
zip_output.writestr(name, content)
|
||||||
else:
|
else:
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
import io
|
from typing import Any, BinaryIO
|
||||||
from typing import Any, BinaryIO, Optional
|
|
||||||
|
|
||||||
from ._exiftool import exiftool_metadata
|
from ._exiftool import exiftool_metadata
|
||||||
from ._transcribe_audio import transcribe_audio
|
from ._transcribe_audio import transcribe_audio
|
||||||
|
@ -1,9 +1,8 @@
|
|||||||
import io
|
|
||||||
import re
|
import re
|
||||||
import base64
|
import base64
|
||||||
import binascii
|
import binascii
|
||||||
from urllib.parse import parse_qs, urlparse
|
from urllib.parse import parse_qs, urlparse
|
||||||
from typing import Any, BinaryIO, Optional
|
from typing import Any, BinaryIO
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
|
@ -1,9 +1,7 @@
|
|||||||
import sys
|
|
||||||
import csv
|
import csv
|
||||||
import io
|
import io
|
||||||
from typing import BinaryIO, Any
|
from typing import BinaryIO, Any
|
||||||
from charset_normalizer import from_bytes
|
from charset_normalizer import from_bytes
|
||||||
from ._html_converter import HtmlConverter
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._stream_info import StreamInfo
|
||||||
|
|
||||||
|
@ -1,13 +1,12 @@
|
|||||||
import sys
|
import sys
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
from typing import BinaryIO, Any, List, Optional, Union
|
from typing import BinaryIO, Any, List
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
|
||||||
from ._html_converter import HtmlConverter
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._stream_info import StreamInfo
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
from .._exceptions import MissingDependencyException
|
||||||
|
|
||||||
# Try loading optional (but in this case, required) dependencies
|
# Try loading optional (but in this case, required) dependencies
|
||||||
# Save reporting of any exceptions for later
|
# Save reporting of any exceptions for later
|
||||||
|
@ -4,7 +4,7 @@ from typing import BinaryIO, Any
|
|||||||
|
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from ..converter_utils.docx.pre_process import pre_process_docx
|
from ..converter_utils.docx.pre_process import pre_process_docx
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._stream_info import StreamInfo
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
||||||
|
@ -6,7 +6,7 @@ from xml.dom.minidom import Document
|
|||||||
from typing import BinaryIO, Any, Dict, List
|
from typing import BinaryIO, Any, Dict, List
|
||||||
|
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._stream_info import StreamInfo
|
||||||
|
|
||||||
ACCEPTED_MIME_TYPE_PREFIXES = [
|
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||||
|
@ -1,10 +1,6 @@
|
|||||||
import json
|
import json
|
||||||
import subprocess
|
import subprocess
|
||||||
import locale
|
import locale
|
||||||
import sys
|
|
||||||
import shutil
|
|
||||||
import os
|
|
||||||
import warnings
|
|
||||||
from typing import BinaryIO, Any, Union
|
from typing import BinaryIO, Any, Union
|
||||||
|
|
||||||
|
|
||||||
|
@ -50,8 +50,6 @@ class IpynbConverter(DocumentConverter):
|
|||||||
**kwargs: Any, # Options to pass to the converter
|
**kwargs: Any, # Options to pass to the converter
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
# Parse and convert the notebook
|
# Parse and convert the notebook
|
||||||
result = None
|
|
||||||
|
|
||||||
encoding = stream_info.charset or "utf-8"
|
encoding = stream_info.charset or "utf-8"
|
||||||
notebook_content = file_stream.read().decode(encoding=encoding)
|
notebook_content = file_stream.read().decode(encoding=encoding)
|
||||||
return self._convert(json.loads(notebook_content))
|
return self._convert(json.loads(notebook_content))
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from typing import BinaryIO, Any, Union
|
from typing import BinaryIO, Union
|
||||||
import base64
|
import base64
|
||||||
import mimetypes
|
import mimetypes
|
||||||
from .._stream_info import StreamInfo
|
from .._stream_info import StreamInfo
|
||||||
|
@ -4,7 +4,6 @@ import io
|
|||||||
from typing import BinaryIO, Any
|
from typing import BinaryIO, Any
|
||||||
|
|
||||||
|
|
||||||
from ._html_converter import HtmlConverter
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._stream_info import StreamInfo
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
@ -9,7 +9,7 @@ from .._stream_info import StreamInfo
|
|||||||
# Save reporting of any exceptions for later
|
# Save reporting of any exceptions for later
|
||||||
_dependency_exc_info = None
|
_dependency_exc_info = None
|
||||||
try:
|
try:
|
||||||
import mammoth
|
import mammoth # noqa: F401
|
||||||
except ImportError:
|
except ImportError:
|
||||||
# Preserve the error and stack trace for later
|
# Preserve the error and stack trace for later
|
||||||
_dependency_exc_info = sys.exc_info()
|
_dependency_exc_info = sys.exc_info()
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
import io
|
|
||||||
import re
|
import re
|
||||||
import bs4
|
import bs4
|
||||||
from typing import Any, BinaryIO, Optional
|
from typing import Any, BinaryIO
|
||||||
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._stream_info import StreamInfo
|
||||||
|
@ -10,14 +10,14 @@ from .._stream_info import StreamInfo
|
|||||||
_xlsx_dependency_exc_info = None
|
_xlsx_dependency_exc_info = None
|
||||||
try:
|
try:
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import openpyxl
|
import openpyxl # noqa: F401
|
||||||
except ImportError:
|
except ImportError:
|
||||||
_xlsx_dependency_exc_info = sys.exc_info()
|
_xlsx_dependency_exc_info = sys.exc_info()
|
||||||
|
|
||||||
_xls_dependency_exc_info = None
|
_xls_dependency_exc_info = None
|
||||||
try:
|
try:
|
||||||
import pandas as pd
|
import pandas as pd # noqa: F811
|
||||||
import xlrd
|
import xlrd # noqa: F401
|
||||||
except ImportError:
|
except ImportError:
|
||||||
_xls_dependency_exc_info = sys.exc_info()
|
_xls_dependency_exc_info = sys.exc_info()
|
||||||
|
|
||||||
|
@ -1,10 +1,8 @@
|
|||||||
import sys
|
|
||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
import io
|
|
||||||
import re
|
import re
|
||||||
import bs4
|
import bs4
|
||||||
from typing import Any, BinaryIO, Optional, Dict, List, Union
|
from typing import Any, BinaryIO, Dict, List, Union
|
||||||
from urllib.parse import parse_qs, urlparse, unquote
|
from urllib.parse import parse_qs, urlparse, unquote
|
||||||
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
import sys
|
|
||||||
import zipfile
|
import zipfile
|
||||||
import io
|
import io
|
||||||
import os
|
import os
|
||||||
|
@ -1,6 +1,5 @@
|
|||||||
#!/usr/bin/env python3 -m pytest
|
#!/usr/bin/env python3 -m pytest
|
||||||
import subprocess
|
import subprocess
|
||||||
import pytest
|
|
||||||
from markitdown import __version__
|
from markitdown import __version__
|
||||||
|
|
||||||
# This file contains CLI tests that are not directly tested by the FileTestVectors.
|
# This file contains CLI tests that are not directly tested by the FileTestVectors.
|
||||||
@ -24,8 +23,8 @@ def test_invalid_flag() -> None:
|
|||||||
assert result.returncode != 0, f"CLI exited with error: {result.stderr}"
|
assert result.returncode != 0, f"CLI exited with error: {result.stderr}"
|
||||||
assert (
|
assert (
|
||||||
"unrecognized arguments" in result.stderr
|
"unrecognized arguments" in result.stderr
|
||||||
), f"Expected 'unrecognized arguments' to appear in STDERR"
|
), "Expected 'unrecognized arguments' to appear in STDERR"
|
||||||
assert "SYNTAX" in result.stderr, f"Expected 'SYNTAX' to appear in STDERR"
|
assert "SYNTAX" in result.stderr, "Expected 'SYNTAX' to appear in STDERR"
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -19,13 +19,6 @@ else:
|
|||||||
FileTestVector,
|
FileTestVector,
|
||||||
)
|
)
|
||||||
|
|
||||||
from markitdown import (
|
|
||||||
MarkItDown,
|
|
||||||
UnsupportedFormatException,
|
|
||||||
FileConversionException,
|
|
||||||
StreamInfo,
|
|
||||||
)
|
|
||||||
|
|
||||||
skip_remote = (
|
skip_remote = (
|
||||||
True if os.environ.get("GITHUB_ACTIONS") else False
|
True if os.environ.get("GITHUB_ACTIONS") else False
|
||||||
) # Don't run these tests in CI
|
) # Don't run these tests in CI
|
||||||
@ -140,8 +133,6 @@ def test_convert_url(shared_tmp_dir, test_vector):
|
|||||||
"""Test the conversion of a stream with no stream info."""
|
"""Test the conversion of a stream with no stream info."""
|
||||||
# Note: tmp_dir is not used here, but is needed to match the signature
|
# Note: tmp_dir is not used here, but is needed to match the signature
|
||||||
|
|
||||||
markitdown = MarkItDown()
|
|
||||||
|
|
||||||
time.sleep(1) # Ensure we don't hit rate limits
|
time.sleep(1) # Ensure we don't hit rate limits
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
["python", "-m", "markitdown", TEST_FILES_URL + "/" + test_vector.filename],
|
["python", "-m", "markitdown", TEST_FILES_URL + "/" + test_vector.filename],
|
||||||
@ -191,7 +182,6 @@ def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None:
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import sys
|
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
"""Runs this file's tests from the command line."""
|
"""Runs this file's tests from the command line."""
|
||||||
|
@ -3,7 +3,6 @@ import io
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import openai
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from markitdown._uri_utils import parse_data_uri, file_uri_to_path
|
from markitdown._uri_utils import parse_data_uri, file_uri_to_path
|
||||||
@ -253,8 +252,6 @@ def test_file_uris() -> None:
|
|||||||
|
|
||||||
|
|
||||||
def test_docx_comments() -> None:
|
def test_docx_comments() -> None:
|
||||||
markitdown = MarkItDown()
|
|
||||||
|
|
||||||
# Test DOCX processing, with comments and setting style_map on init
|
# Test DOCX processing, with comments and setting style_map on init
|
||||||
markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
|
markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
|
||||||
result = markitdown_with_style_map.convert(
|
result = markitdown_with_style_map.convert(
|
||||||
|
@ -2,7 +2,6 @@
|
|||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
import pytest
|
import pytest
|
||||||
import codecs
|
|
||||||
import base64
|
import base64
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@ -14,8 +13,6 @@ else:
|
|||||||
|
|
||||||
from markitdown import (
|
from markitdown import (
|
||||||
MarkItDown,
|
MarkItDown,
|
||||||
UnsupportedFormatException,
|
|
||||||
FileConversionException,
|
|
||||||
StreamInfo,
|
StreamInfo,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -203,8 +200,6 @@ def test_convert_stream_keep_data_uris(test_vector):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import sys
|
|
||||||
|
|
||||||
"""Runs this file's tests from the command line."""
|
"""Runs this file's tests from the command line."""
|
||||||
|
|
||||||
# General tests
|
# General tests
|
||||||
|
Loading…
x
Reference in New Issue
Block a user