feat: Add GitHub data connector; add Markdown partitioner (#284)

This commit is contained in:
Tom Aarsen 2023-02-27 23:36:44 +01:00 committed by GitHub
parent c89bba100f
commit ded60afda9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
27 changed files with 872 additions and 24 deletions

View File

@ -108,6 +108,7 @@ jobs:
make test
make check-coverage
make install-ingest-s3
make install-ingest-github
./test_unstructured_ingest/test-ingest.sh
changelog:

View File

@ -1,4 +1,4 @@
## 0.4.16-dev4
## 0.4.16-dev5
### Enhancements
@ -7,6 +7,8 @@
### Features
* Added setup script for Ubuntu
* Added GitHub connector for ingest cli.
* Added `partition_md` partitioner.
* Added Reddit connector for ingest cli.
### Fixes

View File

@ -54,6 +54,10 @@ install-build:
install-ingest-s3:
pip install -r requirements/ingest-s3.txt
.PHONY: install-ingest-github
install-ingest-github:
pip install -r requirements/ingest-github.txt
.PHONY: install-ingest-reddit
install-ingest-reddit:
pip install -r requirements/ingest-reddit.txt
@ -88,6 +92,7 @@ pip-compile:
cp requirements/build.txt docs/requirements.txt
pip-compile --upgrade --extra=s3 --output-file=requirements/ingest-s3.txt requirements/base.txt setup.py
pip-compile --upgrade --extra=reddit --output-file=requirements/ingest-reddit.txt requirements/base.txt setup.py
pip-compile --upgrade --extra=github --output-file=requirements/ingest-github.txt requirements/base.txt setup.py
## install-project-local: install unstructured into your local python environment
.PHONY: install-project-local

View File

@ -0,0 +1,19 @@
#!/usr/bin/env bash

# Processes the Unstructured-IO/unstructured repository
# through Unstructured's library in 2 processes.

# Structured outputs are stored in github-ingest-output/

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Run from the repository root so relative paths below resolve.
cd "$SCRIPT_DIR"/../../.. || exit 1

# PYTHONPATH=. lets main.py import the local package without installing it.
PYTHONPATH=. ./unstructured/ingest/main.py \
    --github-url Unstructured-IO/unstructured \
    --github-branch main \
    --structured-output-dir github-ingest-output \
    --num-processes 2 \
    --verbose

# Alternatively, you can call it using:
# unstructured-ingest --github-url ...

View File

@ -20,6 +20,10 @@ charset-normalizer==3.0.1
# via requests
click==8.1.3
# via nltk
colorama==0.4.6
# via
# click
# tqdm
deprecated==1.2.13
# via argilla
et-xmlfile==1.1.0
@ -35,6 +39,8 @@ idna==3.4
# anyio
# requests
# rfc3986
importlib-metadata==6.0.0
# via markdown
joblib==1.2.0
# via nltk
lxml==4.9.2
@ -42,6 +48,8 @@ lxml==4.9.2
# python-docx
# python-pptx
# unstructured (setup.py)
markdown==3.4.1
# via unstructured (setup.py)
monotonic==1.6
# via argilla
nltk==3.8.1
@ -101,3 +109,5 @@ wrapt==1.14.1
# deprecated
xlsxwriter==3.0.8
# via python-pptx
zipp==3.15.0
# via importlib-metadata

View File

@ -16,6 +16,8 @@ certifi==2022.12.7
# requests
charset-normalizer==3.0.1
# via requests
colorama==0.4.6
# via sphinx
docutils==0.18.1
# via
# sphinx

View File

@ -6,10 +6,6 @@
#
anyio==3.6.2
# via jupyter-server
appnope==0.1.3
# via
# ipykernel
# ipython
argon2-cffi==21.3.0
# via
# jupyter-server
@ -35,6 +31,11 @@ cffi==1.15.1
# via argon2-cffi-bindings
click==8.1.3
# via pip-tools
colorama==0.4.6
# via
# build
# click
# ipython
comm==0.1.2
# via ipykernel
debugpy==1.6.6
@ -181,8 +182,6 @@ pandocfilters==1.5.0
# via nbconvert
parso==0.8.3
# via jedi
pexpect==4.8.0
# via ipython
pickleshare==0.7.5
# via ipython
pip-tools==6.12.2
@ -202,10 +201,6 @@ prompt-toolkit==3.0.37
# jupyter-console
psutil==5.9.4
# via ipykernel
ptyprocess==0.7.0
# via
# pexpect
# terminado
pure-eval==0.2.2
# via stack-data
pycparser==2.21

View File

@ -22,6 +22,10 @@ click==8.1.3
# via
# nltk
# sacremoses
colorama==0.4.6
# via
# click
# tqdm
deprecated==1.2.13
# via argilla
et-xmlfile==1.1.0
@ -43,6 +47,8 @@ idna==3.4
# anyio
# requests
# rfc3986
importlib-metadata==6.0.0
# via markdown
joblib==1.2.0
# via
# nltk
@ -54,6 +60,8 @@ lxml==4.9.2
# python-docx
# python-pptx
# unstructured (setup.py)
markdown==3.4.1
# via unstructured (setup.py)
monotonic==1.6
# via argilla
nltk==3.8.1
@ -146,3 +154,5 @@ wrapt==1.14.1
# deprecated
xlsxwriter==3.0.8
# via python-pptx
zipp==3.15.0
# via importlib-metadata

View File

@ -0,0 +1,184 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile --extra=github --output-file=requirements/ingest-github.txt requirements/base.txt setup.py
#
anyio==3.6.2
# via
# -r requirements/base.txt
# httpcore
argilla==1.3.0
# via
# -r requirements/base.txt
# unstructured (setup.py)
backoff==2.2.1
# via
# -r requirements/base.txt
# argilla
certifi==2022.12.7
# via
# -r requirements/base.txt
# httpcore
# httpx
# requests
# unstructured (setup.py)
cffi==1.15.1
# via pynacl
charset-normalizer==3.0.1
# via
# -r requirements/base.txt
# requests
click==8.1.3
# via
# -r requirements/base.txt
# nltk
colorama==0.4.6
# via
# click
# tqdm
deprecated==1.2.13
# via
# -r requirements/base.txt
# argilla
# pygithub
et-xmlfile==1.1.0
# via
# -r requirements/base.txt
# openpyxl
h11==0.14.0
# via
# -r requirements/base.txt
# httpcore
httpcore==0.16.3
# via
# -r requirements/base.txt
# httpx
httpx==0.23.3
# via
# -r requirements/base.txt
# argilla
idna==3.4
# via
# -r requirements/base.txt
# anyio
# requests
# rfc3986
joblib==1.2.0
# via
# -r requirements/base.txt
# nltk
lxml==4.9.2
# via
# -r requirements/base.txt
# python-docx
# python-pptx
# unstructured (setup.py)
monotonic==1.6
# via
# -r requirements/base.txt
# argilla
nltk==3.8.1
# via
# -r requirements/base.txt
# unstructured (setup.py)
numpy==1.23.5
# via
# -r requirements/base.txt
# argilla
# pandas
openpyxl==3.1.1
# via
# -r requirements/base.txt
# unstructured (setup.py)
packaging==23.0
# via
# -r requirements/base.txt
# argilla
pandas==1.5.3
# via
# -r requirements/base.txt
# argilla
# unstructured (setup.py)
pillow==9.4.0
# via
# -r requirements/base.txt
# python-pptx
# unstructured (setup.py)
pycparser==2.21
# via cffi
pydantic==1.10.4
# via
# -r requirements/base.txt
# argilla
pygithub==1.57.0
# via unstructured (setup.py)
pyjwt==2.6.0
# via pygithub
pynacl==1.5.0
# via pygithub
python-dateutil==2.8.2
# via
# -r requirements/base.txt
# pandas
python-docx==0.8.11
# via
# -r requirements/base.txt
# unstructured (setup.py)
python-magic==0.4.27
# via
# -r requirements/base.txt
# unstructured (setup.py)
python-pptx==0.6.21
# via
# -r requirements/base.txt
# unstructured (setup.py)
pytz==2022.7.1
# via
# -r requirements/base.txt
# pandas
regex==2022.10.31
# via
# -r requirements/base.txt
# nltk
requests==2.28.2
# via
# -r requirements/base.txt
# pygithub
# unstructured (setup.py)
rfc3986[idna2008]==1.5.0
# via
# -r requirements/base.txt
# httpx
six==1.16.0
# via
# -r requirements/base.txt
# python-dateutil
sniffio==1.3.0
# via
# -r requirements/base.txt
# anyio
# httpcore
# httpx
tqdm==4.64.1
# via
# -r requirements/base.txt
# argilla
# nltk
typing-extensions==4.4.0
# via
# -r requirements/base.txt
# pydantic
urllib3==1.26.14
# via
# -r requirements/base.txt
# requests
wrapt==1.14.1
# via
# -r requirements/base.txt
# argilla
# deprecated
xlsxwriter==3.0.8
# via
# -r requirements/base.txt
# python-pptx

View File

@ -37,6 +37,11 @@ click==8.1.3
# via
# -r requirements/base.txt
# nltk
colorama==0.4.6
# via
# -r requirements/base.txt
# click
# tqdm
deprecated==1.2.13
# via
# -r requirements/base.txt
@ -63,6 +68,10 @@ idna==3.4
# anyio
# requests
# rfc3986
importlib-metadata==6.0.0
# via
# -r requirements/base.txt
# markdown
jmespath==1.0.1
# via
# boto3
@ -77,6 +86,10 @@ lxml==4.9.2
# python-docx
# python-pptx
# unstructured (setup.py)
markdown==3.4.1
# via
# -r requirements/base.txt
# unstructured (setup.py)
monotonic==1.6
# via
# -r requirements/base.txt
@ -180,3 +193,7 @@ xlsxwriter==3.0.8
# via
# -r requirements/base.txt
# python-pptx
zipp==3.15.0
# via
# -r requirements/base.txt
# importlib-metadata

View File

@ -30,6 +30,10 @@ click==8.1.3
# via
# nltk
# uvicorn
colorama==0.4.6
# via
# click
# tqdm
coloredlogs==15.0.1
# via onnxruntime
contourpy==1.0.7
@ -74,6 +78,8 @@ idna==3.4
# anyio
# requests
# rfc3986
importlib-metadata==6.0.0
# via markdown
importlib-resources==5.12.0
# via matplotlib
iopath==0.1.10
@ -91,6 +97,8 @@ lxml==4.9.2
# python-docx
# python-pptx
# unstructured (setup.py)
markdown==3.4.1
# via unstructured (setup.py)
matplotlib==3.7.0
# via pycocotools
monotonic==1.6
@ -165,6 +173,8 @@ pydantic==1.10.5
# fastapi
pyparsing==3.0.9
# via matplotlib
pyreadline3==3.4.1
# via humanfriendly
pytesseract==0.3.10
# via layoutparser
python-dateutil==2.8.2

View File

@ -6,6 +6,7 @@ coverage
click>=8.1
flake8
mypy
types-Markdown
pytest-cov
# NOTE(robinson) - Currently tests do not pass with 0.0.18. Added the following
# issue to address

View File

@ -80,6 +80,8 @@ tomli==2.0.1
# coverage
# mypy
# pytest
types-markdown==3.4.2.5
# via -r requirements/test.in
types-requests==2.28.11.15
# via -r requirements/test.in
types-urllib3==1.26.25.8

View File

@ -59,6 +59,7 @@ setup(
"python-docx",
"python-pptx",
"python-magic",
"markdown",
"requests",
# NOTE(robinson) - The following dependencies are pinned
# to address security scans
@ -77,6 +78,11 @@ setup(
"unstructured-inference>=0.2.4,<0.2.8",
],
"s3": ["boto3"],
"github": [
# NOTE - pygithub at 1.58.0 fails due to https://github.com/PyGithub/PyGithub/issues/2436
# In the future, we can update this to pygithub>1.58.0
"pygithub==1.57.0",
],
"reddit": ["praw"],
},
package_dir={"unstructured": "unstructured"},

View File

@ -0,0 +1,93 @@
import os
import pathlib
from unittest.mock import patch
import pytest
import requests
from unstructured.documents.elements import PageBreak
from unstructured.partition.md import partition_md
DIRECTORY = pathlib.Path(__file__).parent.resolve()
def test_partition_md_from_filename():
    """A markdown file on disk partitions into elements without page breaks."""
    md_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
    elements = partition_md(filename=md_path)
    assert elements
    assert PageBreak() not in elements
def test_partition_md_from_file():
    """A markdown file object partitions into at least one element."""
    md_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
    with open(md_path) as md_file:
        elements = partition_md(file=md_file)
    assert elements
def test_partition_md_from_text():
    """A raw markdown string partitions into at least one element."""
    md_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
    with open(md_path) as md_file:
        md_text = md_file.read()
    elements = partition_md(text=md_text)
    assert elements
class MockResponse:
    """Minimal stand-in for ``requests.Response`` used to stub out HTTP calls.

    Exposes the attributes ``partition_md`` reads: ``text``, ``status_code``,
    ``ok`` (True for status codes below 300), and ``headers``.
    """

    def __init__(self, text, status_code, headers=None):
        # NOTE: a mutable default argument (headers={}) would be shared across
        # all instances; use a None sentinel and build a fresh dict instead.
        self.text = text
        self.status_code = status_code
        self.ok = status_code < 300
        self.headers = {} if headers is None else headers
def test_partition_md_from_url():
    """Fetching markdown over HTTP (mocked) partitions successfully."""
    md_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
    with open(md_path) as md_file:
        md_text = md_file.read()
    mocked = MockResponse(
        text=md_text,
        status_code=200,
        headers={"Content-Type": "text/markdown"},
    )
    with patch.object(requests, "get", return_value=mocked):
        elements = partition_md(url="https://fake.url")
    assert elements
def test_partition_md_from_url_raises_with_bad_status_code():
    """A non-2xx HTTP status (mocked) raises ValueError."""
    md_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
    with open(md_path) as md_file:
        md_text = md_file.read()
    mocked = MockResponse(
        text=md_text,
        status_code=500,
        headers={"Content-Type": "text/html"},
    )
    with patch.object(requests, "get", return_value=mocked):
        with pytest.raises(ValueError):
            partition_md(url="https://fake.url")
def test_partition_md_from_url_raises_with_bad_content_type():
    """A non-markdown Content-Type (mocked) raises ValueError."""
    md_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
    with open(md_path) as md_file:
        md_text = md_file.read()
    mocked = MockResponse(
        text=md_text,
        status_code=200,
        headers={"Content-Type": "application/json"},
    )
    with patch.object(requests, "get", return_value=mocked):
        with pytest.raises(ValueError):
            partition_md(url="https://fake.url")
def test_partition_md_raises_with_none_specified():
    # Calling partition_md with no source argument at all is an error.
    with pytest.raises(ValueError):
        partition_md()
def test_partition_md_raises_with_too_many_specified():
    """Supplying more than one source argument is an error."""
    md_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
    with open(md_path) as md_file:
        md_text = md_file.read()
    with pytest.raises(ValueError):
        partition_md(filename=md_path, text=md_text)

View File

@ -0,0 +1,110 @@
[
{
"element_id": "6f348994832b2ad6127af4f7f1736f67",
"text": "Downloadify: Client Side File Creation",
"type": "Title",
"metadata": {}
},
{
"element_id": "074ac796e8f463c50a5d2ec4d047a5b7",
"text": "JavaScript + Flash Library",
"type": "Title",
"metadata": {}
},
{
"element_id": "8dc8800e5660b2558bb7f5f5416ca498",
"text": "Copyright (c) 2009 Douglas C. Neiner",
"type": "Title",
"metadata": {}
},
{
"element_id": "eb281d7b00a856779aaca7d1ec5197a7",
"text": "Permission is hereby granted, free of charge, to any person obtaining a copy",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "b41e880594419467436d152970f36710",
"text": "of this software and associated documentation files (the \"Software\"), to deal",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "a9578931575204db7971aa2e85137083",
"text": "in the Software without restriction, including without limitation the rights",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "7105a363bc50eba8e93f676dbb0bd145",
"text": "to use, copy, modify, merge, publish, distribute, sublicense, and/or sell",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "10e76e02d2ddc0fa91590e65249dbbb5",
"text": "copies of the Software, and to permit persons to whom the Software is",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "cb2b93515ca0dd50850fd3e1491bf06c",
"text": "furnished to do so, subject to the following conditions:",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "ace17038b2bfb49c3882a23be243c016",
"text": "The above copyright notice and this permission notice shall be included in",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "4d1f5dcef281e3f580a6c6156a298960",
"text": "all copies or substantial portions of the Software.",
"type": "Title",
"metadata": {}
},
{
"element_id": "58dab889725677ddc5a270a07df8395e",
"text": "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "b47e700b9d4e04e4670448bb39067ed2",
"text": "IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "ff5d35f4e0324c8499b81980c7da4b7c",
"text": "FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "6dc498cbd6e27db10da2431cfcc32e90",
"text": "AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "cb64ba82bcfdc75c8d68da657159e00d",
"text": "LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "ed18e41c2aa38a20e0c256fdc28b7243",
"text": "OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN",
"type": "UncategorizedText",
"metadata": {}
},
{
"element_id": "76d8377ccb0743b6c7de1f85b60f3955",
"text": "THE SOFTWARE.",
"type": "Title",
"metadata": {}
}
]

View File

@ -0,0 +1,50 @@
[
{
"element_id": "56a9f768a0968be676f9addd5ec3032e",
"text": "Downloadify Example",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "d551bbfc9477547e4dce6264d8196c7b",
"text": "More info available at the Github Project Page",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "971b974235a86ca628dcc713d6e2e8d9",
"text": "Filename",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "43f65b1c5bd47774b25c72e2f96de300",
"text": "File Contents\n\nWhatever you put in this text box will be downloaded and saved in the file. If you leave it blank, no file will be downloaded",
"type": "UncategorizedText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "53a4db70c6d40ed5206711ed8a255e03",
"text": "You must have Flash 10 installed to download this file.",
"type": "UncategorizedText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "839973fba0c850f1729fad098b031203",
"text": "Downloadify Invoke Script For This Page",
"type": "Title",
"metadata": {
"page_number": 1
}
}
]

View File

@ -0,0 +1,18 @@
#!/usr/bin/env bash

# Ingests the dcneiner/Downloadify GitHub repository and diffs the structured
# outputs against the fixtures checked in under
# test_unstructured_ingest/expected-structured-output/github-downloadify/.

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/.. || exit 1

PYTHONPATH=. ./unstructured/ingest/main.py --github-url dcneiner/Downloadify --github-file-glob '*.html,*.txt' --structured-output-dir github-downloadify-output --verbose

if ! diff -ru github-downloadify-output test_unstructured_ingest/expected-structured-output/github-downloadify ; then
    echo
    echo "There are differences from the previously checked-in structured outputs."
    echo
    # FIX: the original message was copy-pasted from the s3 test and pointed at
    # s3-small-batch paths; it now references this test's actual directories.
    echo "If these differences are acceptable, copy the outputs from"
    echo "github-downloadify-output/ to test_unstructured_ingest/expected-structured-output/github-downloadify/ after running"
    echo
    echo "  PYTHONPATH=. ./unstructured/ingest/main.py --github-url dcneiner/Downloadify --github-file-glob '*.html,*.txt' --structured-output-dir github-downloadify-output"
    echo
    exit 1
fi

View File

@ -6,3 +6,4 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/.. || exit 1
./test_unstructured_ingest/test-ingest-s3.sh
./test_unstructured_ingest/test-ingest-github.sh

View File

@ -1 +1 @@
__version__ = "0.4.16-dev4" # pragma: no cover
__version__ = "0.4.16-dev5" # pragma: no cover

View File

@ -38,6 +38,11 @@ PPT_MIME_TYPES = [
"application/vnd.ms-powerpoint",
]
MD_MIME_TYPES = [
"text/markdown",
"text/x-markdown",
]
# NOTE(robinson) - .docx.xlsx files are actually zip file with a .docx/.xslx extension.
# If the MIME type is application/octet-stream, we check if it's a .docx/.xlsx file by
# looking for expected filenames within the zip file.
@ -83,6 +88,7 @@ class FileType(Enum):
# Markup Types
HTML = 50
XML = 51
MD = 52
# Compressed Types
ZIP = 60
@ -102,6 +108,7 @@ EXT_TO_FILETYPE = {
".eml": FileType.EML,
".xml": FileType.XML,
".html": FileType.HTML,
".md": FileType.MD,
".xlsx": FileType.XLSX,
".pptx": FileType.PPTX,
".png": FileType.PNG,
@ -160,16 +167,18 @@ def detect_filetype(
elif mime_type == "image/png":
return FileType.PNG
elif mime_type in MD_MIME_TYPES:
# NOTE - I am not sure whether libmagic ever returns these mimetypes.
return FileType.MD
elif mime_type == "text/plain":
if extension and extension == ".eml":
return FileType.EML
if file and not extension:
if _check_eml_from_buffer(file=file) is True:
return FileType.EML
else:
return FileType.TXT
else:
return FileType.TXT
if extension and extension == ".md":
return FileType.MD
if file and not extension and _check_eml_from_buffer(file=file) is True:
return FileType.EML
return FileType.TXT
elif mime_type.endswith("xml"):
if extension and extension == ".html":

View File

@ -0,0 +1,201 @@
import fnmatch
import json
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING, Optional
from urllib.parse import urlparse
import requests
from unstructured.ingest.interfaces import (
BaseConnector,
BaseConnectorConfig,
BaseIngestDoc,
)
if TYPE_CHECKING:
from github.Repository import Repository
@dataclass
class SimpleGitHubConfig(BaseConnectorConfig):
    """Connector config identifying the target GitHub repository plus the
    standard ingest options shared by all connectors."""

    github_url: str
    github_access_token: Optional[str]
    github_branch: Optional[str]
    github_file_glob: Optional[str]

    # Standard Connector options
    download_dir: str
    # where to write structured data, with the directory structure matching the github repository
    output_dir: str
    preserve_downloads: bool = False
    re_download: bool = False
    verbose: bool = False

    # Derived from github_url in __post_init__ below.
    repo_owner: str = field(init=False, repr=False)
    repo_name: str = field(init=False, repr=False)

    def __post_init__(self):
        parsed = urlparse(self.github_url)
        fragments = [piece for piece in parsed.path.split("/") if piece]

        # Accept either a full https://github.com URL or a bare "owner/name"
        # pair; anything with the wrong scheme, wrong host, or a path that is
        # not exactly two segments deep is rejected.
        scheme_is_valid = not parsed.scheme or parsed.scheme == "https"
        host_is_valid = not parsed.netloc or parsed.netloc == "github.com"
        if not (scheme_is_valid and host_is_valid and len(fragments) == 2):
            raise ValueError(
                'Please provide a valid URL, e.g. "https://github.com/Unstructured-IO/unstructured"'
                ' or a repository owner/name pair, e.g. "Unstructured-IO/unstructured".',
            )

        # If there's no issues, store the core repository info
        self.repo_owner, self.repo_name = fragments
@dataclass
class GitHubIngestDoc(BaseIngestDoc):
    """One file (git blob) from a GitHub repository.

    Knows how to download the file to the local filesystem, write the
    structured JSON result, and clean up the local copy afterwards.
    """

    config: SimpleGitHubConfig = field(repr=False)
    repo: "Repository"
    # Path of the file within the repository tree, e.g. "docs/README.md".
    path: str

    @property
    def filename(self):
        """Absolute local path the file is (or will be) downloaded to,
        mirroring the repository's directory layout under download_dir."""
        return (Path(self.config.download_dir) / self.path).resolve()

    def _output_filename(self):
        # Structured output mirrors the repository tree, with ".json" appended.
        return Path(self.config.output_dir) / f"{self.path}.json"

    def _create_full_tmp_dir_path(self):
        """Includes directories in the github repository"""
        self.filename.parent.mkdir(parents=True, exist_ok=True)

    def cleanup_file(self):
        """Removes the local copy the file (or anything else) after successful processing."""
        if not self.config.preserve_downloads:
            if self.config.verbose:
                print(f"cleaning up {self}")
            os.unlink(self.filename)

    def get_file(self):
        """Fetches the "remote" doc and stores it locally on the filesystem."""
        self._create_full_tmp_dir_path()
        # Skip the download when a local copy already exists, unless re-download
        # was explicitly requested.
        if not self.config.re_download and self.filename.is_file() and self.filename.stat():
            if self.config.verbose:
                print(f"File exists: {self.filename}, skipping download")
            return
        if self.config.verbose:
            print(f"fetching {self} - PID: {os.getpid()}")
        content_file = self.repo.get_contents(self.path)
        contents = b""
        # Empty content with encoding "none" but a nonzero size indicates the
        # contents API would not return the blob (see the print below); fall
        # back to fetching the raw download URL directly.
        if (
            not content_file.content  # type: ignore
            and content_file.encoding == "none"  # type: ignore
            and content_file.size  # type: ignore
        ):
            print("File too large for the GitHub API, using direct download link instead.")
            response = requests.get(content_file.download_url)  # type: ignore
            if response.status_code != 200:
                # Best-effort: leave contents empty and write an empty file.
                print("Direct download link has failed... Skipping this file.")
            else:
                contents = response.content
        else:
            contents = content_file.decoded_content  # type: ignore

        with open(self.filename, "wb") as f:
            f.write(contents)

    def has_output(self):
        """Determine if structured output for this doc already exists."""
        output_filename = self._output_filename()
        return output_filename.is_file() and output_filename.stat()

    def write_result(self):
        """Write the structured json result for this doc. result must be json serializable."""
        output_filename = self._output_filename()
        output_filename.parent.mkdir(parents=True, exist_ok=True)
        with open(output_filename, "w", encoding="utf8") as output_f:
            json.dump(self.isd_elems_no_filename, output_f, ensure_ascii=False, indent=2)
        print(f"Wrote {output_filename}")
class GitHubConnector(BaseConnector):
    """Connector that enumerates the files of a GitHub repository and yields
    one GitHubIngestDoc per supported file."""

    def __init__(self, config: SimpleGitHubConfig):
        # Imported here rather than at module level so the PyGithub dependency
        # is only required when this connector is actually used.
        from github import Github

        self.config = config
        self.github = Github(self.config.github_access_token)
        self.cleanup_files = not config.preserve_downloads

    def cleanup(self, cur_dir=None):
        """Recursively remove empty download directories, bottom-up.

        NOTE(review): this walks via os.chdir() with relative paths, which
        mutates the process-wide working directory — confirm it is only ever
        called from a single process/thread.
        """
        if not self.cleanup_files:
            return

        if cur_dir is None:
            cur_dir = self.config.download_dir
        sub_dirs = os.listdir(cur_dir)
        os.chdir(cur_dir)
        for sub_dir in sub_dirs:
            # don't traverse symlinks, not that there ever should be any
            if os.path.isdir(sub_dir) and not os.path.islink(sub_dir):
                self.cleanup(sub_dir)
        os.chdir("..")
        # Only remove the directory itself once everything beneath it is gone.
        if len(os.listdir(cur_dir)) == 0:
            os.rmdir(cur_dir)

    def initialize(self):
        # No setup needed beyond the client created in __init__.
        pass

    def is_file_type_supported(self, path: str) -> bool:
        # Workaround to ensure that auto.partition isn't fed with .yaml, .py, etc. files
        # TODO: What to do with no filenames? e.g. LICENSE, Makefile, etc.
        supported = path.endswith(
            (
                ".md",
                ".txt",
                ".pdf",
                ".doc",
                ".docx",
                ".eml",
                ".html",
                ".png",
                ".jpg",
                ".ppt",
                ".pptx",
                ".xml",
            ),
        )
        if not supported and self.config.verbose:
            print(f"The file {path!r} is discarded as it does not contain a supported filetype.")
        return supported

    def does_path_match_glob(self, path: str) -> bool:
        """Return True when path matches at least one of the comma-separated
        globs in github_file_glob, or when no globs are configured at all."""
        if not self.config.github_file_glob:
            return True
        patterns = self.config.github_file_glob.split(",")
        for pattern in patterns:
            # NOTE(review): fnmatch.fnmatch(path, pattern) would express this
            # single-path match more directly than filtering a one-item list.
            if fnmatch.filter([path], pattern):
                return True
        if self.config.verbose:
            print(f"The file {path!r} is discarded as it does not match any given glob.")
        return False

    def get_ingest_docs(self):
        repo = self.github.get_repo(f"{self.config.repo_owner}/{self.config.repo_name}")

        # Load the Git tree with all files, and then create Ingest docs
        # for all blobs, i.e. all files, ignoring directories
        sha = self.config.github_branch or repo.default_branch
        git_tree = repo.get_git_tree(sha, recursive=True)
        return [
            GitHubIngestDoc(self.config, repo, element.path)
            for element in git_tree.tree
            if element.type == "blob"
            and self.is_file_type_supported(element.path)
            and (not self.config.github_file_glob or self.does_path_match_glob(element.path))
        ]

View File

@ -24,8 +24,6 @@ class SimpleS3Config(BaseConnectorConfig):
output_dir: str
re_download: bool = False
preserve_downloads: bool = False
# if a structured output .json file already exists, do not reprocess an s3 file to overwrite it
reprocess: bool = False
verbose: bool = False
# S3 Specific (optional)

View File

@ -46,8 +46,6 @@ class BaseConnectorConfig(ABC):
# where to write structured data outputs
output_dir: str
re_download: bool = False
# if a structured output .json file already exists for a given doc, do not reprocess
reprocess: bool = False
verbose: bool = False
@ -96,7 +94,7 @@ class BaseIngestDoc(ABC):
self.isd_elems_no_filename = []
for elem in isd_elems:
# type: ignore
elem["metadata"].pop("filename") # type: ignore[attr-defined]
elem["metadata"].pop("filename", None) # type: ignore[attr-defined]
elem.pop("coordinates") # type: ignore[attr-defined]
self.isd_elems_no_filename.append(elem)

View File

@ -6,6 +6,7 @@ import sys
import click
from unstructured.ingest.connector.github import GitHubConnector, SimpleGitHubConfig
from unstructured.ingest.connector.reddit import RedditConnector, SimpleRedditConfig
from unstructured.ingest.connector.s3_connector import S3Connector, SimpleS3Config
from unstructured.ingest.doc_processor.generalized import initialize, process_document
@ -79,6 +80,29 @@ class MainProcess:
default=False,
help="Connect to s3 without local AWS credentials.",
)
@click.option(
"--github-url",
default=None,
help='URL to GitHub repository, e.g. "https://github.com/Unstructured-IO/unstructured",'
' or a repository owner/name pair, e.g. "Unstructured-IO/unstructured"',
)
@click.option(
"--github-access-token",
default=None,
help="A GitHub access token, see https://docs.github.com/en/authentication",
)
@click.option(
"--github-branch",
default=None,
help="The branch for which to fetch files from. If not given,"
" the default repository branch is used.",
)
@click.option(
"--github-file-glob",
default=None,
help="A comma-separated list of file globs to limit which types of files are accepted,"
" e.g. '*.html,*.txt'",
)
@click.option(
"--subreddit-name",
default=None,
@ -148,6 +172,10 @@ class MainProcess:
@click.option("-v", "--verbose", is_flag=True, default=False)
def main(
s3_url,
github_url,
github_access_token,
github_branch,
github_file_glob,
subreddit_name,
reddit_client_id,
reddit_client_secret,
@ -182,6 +210,21 @@ def main(
verbose=verbose,
),
)
elif github_url:
doc_connector = GitHubConnector( # type: ignore
config=SimpleGitHubConfig(
github_url=github_url,
github_access_token=github_access_token,
github_branch=github_branch,
github_file_glob=github_file_glob,
# defaults params:
download_dir=download_dir,
preserve_downloads=preserve_downloads,
output_dir=structured_output_dir,
re_download=re_download,
verbose=verbose,
),
)
elif subreddit_name:
doc_connector = RedditConnector( # type: ignore
config=SimpleRedditConfig(

View File

@ -6,6 +6,7 @@ from unstructured.partition.docx import partition_docx
from unstructured.partition.email import partition_email
from unstructured.partition.html import partition_html
from unstructured.partition.image import partition_image
from unstructured.partition.md import partition_md
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.ppt import partition_ppt
from unstructured.partition.pptx import partition_pptx
@ -44,6 +45,8 @@ def partition(
return partition_email(filename=filename, file=file)
elif filetype == FileType.HTML:
return partition_html(filename=filename, file=file, include_page_breaks=include_page_breaks)
elif filetype == FileType.MD:
return partition_md(filename=filename, file=file, include_page_breaks=include_page_breaks)
elif filetype == FileType.PDF:
return partition_pdf(
filename=filename, # type: ignore

View File

@ -0,0 +1,60 @@
from typing import IO, List, Optional, Union
import markdown
import requests
from unstructured.documents.elements import Element
from unstructured.documents.xml import VALID_PARSERS
from unstructured.partition.html import partition_html
def optional_decode(contents: Union[str, bytes]) -> str:
    """Return ``contents`` as ``str``, decoding UTF-8 bytes when necessary."""
    return contents.decode("utf-8") if isinstance(contents, bytes) else contents
def partition_md(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    text: Optional[str] = None,
    url: Optional[str] = None,
    include_page_breaks: bool = False,
    include_metadata: bool = True,
    parser: VALID_PARSERS = None,
) -> List[Element]:
    """Partition a markdown document into its document elements.

    Exactly one of ``filename``, ``file``, ``text``, or ``url`` must be given.
    The markdown is rendered to HTML and delegated to ``partition_html``.

    Parameters
    ----------
    filename: path to a markdown file on disk
    file: a file-like object opened for reading
    text: the markdown document as a string
    url: a URL serving markdown (``text/markdown`` or ``text/x-markdown``)
    include_page_breaks: forwarded to ``partition_html``
    include_metadata: forwarded to ``partition_html``
    parser: XML parser selection, forwarded to ``partition_html``

    Raises
    ------
    ValueError
        If zero or more than one source argument is supplied, or if the URL
        request fails or returns a non-markdown content type.
    """
    # NOTE: truthiness check — empty strings are treated as "not provided".
    if not any([filename, file, text, url]):
        # FIX: message previously omitted the url option.
        raise ValueError("One of filename, file, text, or url must be specified.")

    if filename is not None and not file and not text and not url:
        with open(filename, encoding="utf8") as f:
            text = optional_decode(f.read())
    elif file is not None and not filename and not text and not url:
        text = optional_decode(file.read())
    elif text is not None and not filename and not file and not url:
        pass
    elif url is not None and not filename and not file and not text:
        response = requests.get(url)
        if not response.ok:
            # FIX: message previously read "URL return an error".
            raise ValueError(f"URL returned an error: {response.status_code}")

        content_type = response.headers.get("Content-Type", "")
        # Accept both registered markdown content types, consistent with the
        # MD_MIME_TYPES list used by filetype detection.
        if not content_type.startswith(("text/markdown", "text/x-markdown")):
            raise ValueError(f"Expected content type text/markdown. Got {content_type}.")

        text = response.text
    else:
        # FIX: message previously omitted the url option.
        raise ValueError("Only one of filename, file, text, or url can be specified.")

    html = markdown.markdown(text)
    return partition_html(
        text=html,
        include_page_breaks=include_page_breaks,
        include_metadata=include_metadata,
        parser=parser,
    )