Initial commit.

2025-12-27 06:42:15 +00:00 · 2024-11-13 13:00:01 -08:00 · 2024-11-13 13:00:01 -08:00 · f20c964f99
commit f20c964f99
parent 67fec84618
16 changed files with 3865 additions and 0 deletions
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@ -0,0 +1,20 @@
+name: pre-commit
+on: [pull_request]  # lint gate: run the pre-commit hooks on every pull request
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3  # same major version as tests.yml; v2 targets the retired Node 12 runtime
+      - name: Set up Python
+        uses: actions/setup-python@v4  # same major version as tests.yml
+        with:
+          python-version: "3.x"
+
+      - name: Install pre-commit
+        run: |
+          pip install pre-commit
+          pre-commit install --install-hooks
+
+      - name: Run pre-commit
+        run: pre-commit run --all-files
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@ -0,0 +1,25 @@
+name: tests
+on: [pull_request]  # run the test suite on every pull request
+
+jobs:
+  tests:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:  # versions are given in a block scalar, so "3.10" stays text rather than the number 3.1
+          python-version: |
+            3.10
+            3.11
+            3.12
+      - name: Set up pip cache
+        if: runner.os == 'Linux'  # the path cached below is pip's cache directory on Linux
+        uses: actions/cache@v3
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }}  # key changes whenever pyproject.toml changes
+          restore-keys: ${{ runner.os }}-pip-  # otherwise fall back to an earlier pip cache for this OS
+      - name: Install Hatch
+        run: pipx install hatch
+      - name: Run tests
+        run: hatch test  # NOTE(review): confirm this covers every interpreter installed above, not only the default
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -0,0 +1,5 @@
+repos:
+  - repo: https://github.com/psf/black
+    rev: 23.7.0 # Pinned Black release; bump this tag deliberately to upgrade
+    hooks:
+      - id: black
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,76 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "markitdown"
+dynamic = ["version"]
+description = ''
+readme = "README.md"
+requires-python = ">=3.10"
+license = "MIT"
+keywords = []
+authors = [
+  { name = "Adam Fourney", email = "adamfo@microsoft.com" },
+]
+# Keep in sync with requires-python (>=3.10): only advertise supported versions.
+classifiers = [
+  "Development Status :: 4 - Beta",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: Implementation :: CPython",
+  "Programming Language :: Python :: Implementation :: PyPy",
+]
+dependencies = [
+  "beautifulsoup4",
+  "requests",
+  "mammoth",
+  "markdownify",
+  "numpy",
+  "python-pptx",
+  "pandas",
+  "openpyxl",
+  "pdfminer.six",
+  "puremagic",
+  "pydub",
+  "youtube-transcript-api",
+  "SpeechRecognition",
+  "pathvalidate",
+]
+
+[project.urls]
+Documentation = "https://github.com/microsoft/markitdown#readme"
+Issues = "https://github.com/microsoft/markitdown/issues"
+Source = "https://github.com/microsoft/markitdown"
+
+[tool.hatch.version]
+path = "src/markitdown/__about__.py"  # file defining __version__ (the project version is declared dynamic)
+
+# Type-checking environment; its "check" script runs mypy.
+[tool.hatch.envs.types]
+extra-dependencies = [
+  "mypy>=1.0.0",
+]
+[tool.hatch.envs.types.scripts]
+check = "mypy --install-types --non-interactive {args:src/markitdown tests}"  # defaults to the package and the tests
+
+# coverage.py settings.
+[tool.coverage.run]
+source_pkgs = ["markitdown", "tests"]  # measure both the package and its test suite
+branch = true
+parallel = true
+omit = [
+  "src/markitdown/__about__.py",  # version-only module
+]
+
+# Path aliases: each list names locations that refer to the same source tree.
+[tool.coverage.paths]
+markitdown = ["src/markitdown", "*/markitdown/src/markitdown"]
+tests = ["tests", "*/markitdown/tests"]
+
+[tool.coverage.report]
+# Regular expressions; lines matching any of them are left out of the report.
+exclude_lines = [
+  "no cov",
+  "if __name__ == .__main__.:",  # "." matches either quote character
+  "if TYPE_CHECKING:",
+]
--- a/src/markitdown/__about__.py
+++ b/src/markitdown/__about__.py
@ -0,0 +1,4 @@
+# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
+#
+# SPDX-License-Identifier: MIT
+__version__ = "0.0.1a1"
--- a/src/markitdown/__init__.py
+++ b/src/markitdown/__init__.py
@ -0,0 +1,9 @@
+# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
+#
+# SPDX-License-Identifier: MIT
+
+from ._markitdown import MarkItDown
+
+__all__ = [
+    "MarkItDown",
+]
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
--- a/tests/__init__.py
+++ b/tests/__init__.py
@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
+#
+# SPDX-License-Identifier: MIT
--- a/tests/test_files/test.docx
+++ b/tests/test_files/test.docx
--- a/tests/test_files/test.jpg
+++ b/tests/test_files/test.jpg
--- a/tests/test_files/test.pptx
+++ b/tests/test_files/test.pptx
--- a/tests/test_files/test.xlsx
+++ b/tests/test_files/test.xlsx
--- a/tests/test_files/test_blog.html
+++ b/tests/test_files/test_blog.html
--- a/tests/test_files/test_serp.html
+++ b/tests/test_files/test_serp.html
--- a/tests/test_files/test_wikipedia.html
+++ b/tests/test_files/test_wikipedia.html
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@ -0,0 +1,184 @@
+#!/usr/bin/env -S python3 -m pytest
+import io
+import os
+import shutil
+
+import pytest
+import requests
+
+from markitdown import MarkItDown
+
+# Skip flags must be real booleans: pytest's skipif evaluates a *string* condition
+# as a Python expression, so the raw "true" exported by GitHub Actions would raise
+# NameError instead of skipping the test.
+skip_remote = bool(os.environ.get("GITHUB_ACTIONS"))  # Don't run these tests in CI
+skip_exiftool = shutil.which("exiftool") is None
+
+# Directory next to this file that holds the fixture documents.
+TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
+
+# Metadata expected for test.jpg; each entry is checked as a "key: value" line.
+JPG_TEST_EXIFTOOL = {
+    "Author": "AutoGen Authors",
+    "Title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+    "Description": "AutoGen enables diverse LLM-based applications",
+    "ImageSize": "1615x1967",
+    "DateTimeOriginal": "2024:03:14 22:10:00",
+}
+
+# Remote PDF and the text its conversion must contain.
+PDF_TEST_URL = "https://arxiv.org/pdf/2308.08155v2.pdf"
+PDF_TEST_STRINGS = [
+    "While there is contemporaneous exploration of multi-agent approaches"
+]
+
+# YouTube fixture; only referenced by the currently commented-out check in
+# test_markitdown_remote.
+YOUTUBE_TEST_URL = "https://www.youtube.com/watch?v=V2qZ_lgxTzg"
+YOUTUBE_TEST_STRINGS = [
+    "## AutoGen FULL Tutorial with Python (Step-By-Step)",
+    "This is an intermediate tutorial for installing and using AutoGen locally",
+    "PT15M4S",
+    "the model we're going to be using today is GPT 3.5 turbo",  # From the transcript
+]
+
+# Substrings expected in the converted local fixtures. The local test strips
+# backslashes from the converted text before looking for them.
+XLSX_TEST_STRINGS = [
+    "## 09060124-b5e7-4717-9d07-3c046eb",
+    "6ff4173b-42a5-4784-9b19-f49caff4d93d",
+    "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
+]
+
+DOCX_TEST_STRINGS = [
+    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
+    "49e168b7-d2ae-407f-a055-2167576f39a1",
+    "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
+    "# Abstract",
+    "# Introduction",
+    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+]
+
+PPTX_TEST_STRINGS = [
+    "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
+    "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
+    "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
+    "1b92870d-e3b5-4e65-8153-919f4ff45592",
+    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+]
+
+# The *_TEST_URL values below are passed as the url= argument when converting
+# the saved HTML fixtures.
+BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
+BLOG_TEST_STRINGS = [
+    "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?",
+    "an example where high cost can easily prevent a generic complex",
+]
+
+WIKIPEDIA_TEST_URL = "https://en.wikipedia.org/wiki/Microsoft"
+WIKIPEDIA_TEST_STRINGS = [
+    "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]",
+    'Microsoft was founded by [Bill Gates](/wiki/Bill_Gates "Bill Gates")',
+]
+# Strings that must NOT appear in the converted Wikipedia page.
+WIKIPEDIA_TEST_EXCLUDES = [
+    "You are encouraged to create an account and log in",
+    "154 languages",
+    "move to sidebar",
+]
+
+SERP_TEST_URL = "https://www.bing.com/search?q=microsoft+wikipedia"
+SERP_TEST_STRINGS = [
+    "](https://en.wikipedia.org/wiki/Microsoft",
+    "Microsoft Corporation is **an American multinational corporation and technology company headquartered** in Redmond",
+    "1995–2007: Foray into the Web, Windows 95, Windows XP, and Xbox",
+]
+# Strings that must NOT appear in the converted search-results page.
+SERP_TEST_EXCLUDES = [
+    "https://www.bing.com/ck/a?!&&p=",
+    "data:image/svg+xml,%3Csvg%20width%3D",
+]
+
+
+@pytest.mark.skipif(
+    skip_remote,
+    reason="do not run tests that query external urls",
+)
+def test_markitdown_remote() -> None:
+    """Convert a remote PDF by URL and by byte stream and check its text.
+
+    Needs network access, so it is skipped when ``skip_remote`` is set.
+    """
+    markitdown = MarkItDown()
+
+    # By URL
+    result = markitdown.convert(PDF_TEST_URL)
+    for test_string in PDF_TEST_STRINGS:
+        assert test_string in result.text_content
+
+    # By stream
+    # Bound the download and fail on HTTP errors, so an outage shows up as a
+    # clear network failure rather than a hang or a misleading content assert.
+    response = requests.get(PDF_TEST_URL, timeout=60)
+    response.raise_for_status()
+    result = markitdown.convert_stream(
+        io.BytesIO(response.content), file_extension=".pdf", url=PDF_TEST_URL
+    )
+    for test_string in PDF_TEST_STRINGS:
+        assert test_string in result.text_content
+
+    # Youtube
+    # TODO: This test randomly fails for some reason. Haven't been able to repro it yet. Disabling until I can debug the issue
+    # result = markitdown.convert(YOUTUBE_TEST_URL)
+    # for test_string in YOUTUBE_TEST_STRINGS:
+    #     assert test_string in result.text_content
+
+
+def test_markitdown_local() -> None:
+    """Convert every bundled fixture and check required / forbidden substrings.
+
+    Backslashes are removed from the converted text before comparison
+    (presumably so Markdown escaping cannot hide an expected substring).
+    """
+    markitdown = MarkItDown()
+
+    # (fixture file, extra convert() kwargs, required strings, forbidden strings)
+    cases = [
+        ("test.xlsx", {}, XLSX_TEST_STRINGS, []),
+        ("test.docx", {}, DOCX_TEST_STRINGS, []),
+        ("test.pptx", {}, PPTX_TEST_STRINGS, []),
+        ("test_blog.html", {"url": BLOG_TEST_URL}, BLOG_TEST_STRINGS, []),
+        (
+            "test_wikipedia.html",
+            {"url": WIKIPEDIA_TEST_URL},
+            WIKIPEDIA_TEST_STRINGS,
+            WIKIPEDIA_TEST_EXCLUDES,
+        ),
+        (
+            "test_serp.html",
+            {"url": SERP_TEST_URL},
+            SERP_TEST_STRINGS,
+            SERP_TEST_EXCLUDES,
+        ),
+    ]
+
+    for filename, convert_kwargs, expected, excluded in cases:
+        result = markitdown.convert(
+            os.path.join(TEST_FILES_DIR, filename), **convert_kwargs
+        )
+        # Normalize once per file rather than once per expected string.
+        text_content = result.text_content.replace("\\", "")
+
+        # Forbidden strings first, then required ones; the message names the
+        # fixture so a failure points at the right converter.
+        for test_string in excluded:
+            assert (
+                test_string not in text_content
+            ), f"{filename}: unexpected {test_string!r}"
+        for test_string in expected:
+            assert (
+                test_string in text_content
+            ), f"{filename}: missing {test_string!r}"
+
+
+@pytest.mark.skipif(
+    skip_exiftool,
+    reason="do not run if exiftool is not installed",
+)
+def test_markitdown_exiftool() -> None:
+    """Check that converting test.jpg reports every expected metadata field."""
+    converter = MarkItDown()
+    jpg_path = os.path.join(TEST_FILES_DIR, "test.jpg")
+    converted = converter.convert(jpg_path)
+
+    # Each expected field must show up as a "key: value" pair in the output.
+    for field, value in JPG_TEST_EXIFTOOL.items():
+        assert f"{field}: {value}" in converted.text_content
+
+
+if __name__ == "__main__":
+    # Run this file's tests from the command line through pytest itself, so the
+    # skipif markers are honored (calling the test functions directly bypasses
+    # them) and each test is reported separately. Exits with pytest's status.
+    raise SystemExit(pytest.main([__file__]))