mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
feat: Add GitHub data connector; add Markdown partitioner (#284)
This commit is contained in:
parent
c89bba100f
commit
ded60afda9
1
.github/workflows/ci.yml
vendored
1
.github/workflows/ci.yml
vendored
@ -108,6 +108,7 @@ jobs:
|
||||
make test
|
||||
make check-coverage
|
||||
make install-ingest-s3
|
||||
make install-ingest-github
|
||||
./test_unstructured_ingest/test-ingest.sh
|
||||
|
||||
changelog:
|
||||
|
@ -1,4 +1,4 @@
|
||||
## 0.4.16-dev4
|
||||
## 0.4.16-dev5
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -7,6 +7,8 @@
|
||||
### Features
|
||||
|
||||
* Added setup script for Ubuntu
|
||||
* Added GitHub connector for ingest cli.
|
||||
* Added `partition_md` partitioner.
|
||||
* Added Reddit connector for ingest cli.
|
||||
|
||||
### Fixes
|
||||
|
5
Makefile
5
Makefile
@ -54,6 +54,10 @@ install-build:
|
||||
install-ingest-s3:
|
||||
pip install -r requirements/ingest-s3.txt
|
||||
|
||||
.PHONY: install-ingest-github
|
||||
install-ingest-github:
|
||||
pip install -r requirements/ingest-github.txt
|
||||
|
||||
.PHONY: install-ingest-reddit
|
||||
install-ingest-reddit:
|
||||
pip install -r requirements/ingest-reddit.txt
|
||||
@ -88,6 +92,7 @@ pip-compile:
|
||||
cp requirements/build.txt docs/requirements.txt
|
||||
pip-compile --upgrade --extra=s3 --output-file=requirements/ingest-s3.txt requirements/base.txt setup.py
|
||||
pip-compile --upgrade --extra=reddit --output-file=requirements/ingest-reddit.txt requirements/base.txt setup.py
|
||||
pip-compile --upgrade --extra=github --output-file=requirements/ingest-github.txt requirements/base.txt setup.py
|
||||
|
||||
## install-project-local: install unstructured into your local python environment
|
||||
.PHONY: install-project-local
|
||||
|
19
examples/ingest/github/ingest.sh
Executable file
19
examples/ingest/github/ingest.sh
Executable file
@ -0,0 +1,19 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# Processes the Unstructured-IO/unstructured repository
|
||||
# through Unstructured's library in 2 processes.
|
||||
|
||||
# Structured outputs are stored in github-ingest-output/
|
||||
|
||||
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||
cd "$SCRIPT_DIR"/../../.. || exit 1
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--github-url Unstructured-IO/unstructured \
|
||||
--github-branch main \
|
||||
--structured-output-dir github-ingest-output \
|
||||
--num-processes 2 \
|
||||
--verbose
|
||||
|
||||
# Alternatively, you can call it using:
|
||||
# unstructured-ingest --github-url ...
|
@ -20,6 +20,10 @@ charset-normalizer==3.0.1
|
||||
# via requests
|
||||
click==8.1.3
|
||||
# via nltk
|
||||
colorama==0.4.6
|
||||
# via
|
||||
# click
|
||||
# tqdm
|
||||
deprecated==1.2.13
|
||||
# via argilla
|
||||
et-xmlfile==1.1.0
|
||||
@ -35,6 +39,8 @@ idna==3.4
|
||||
# anyio
|
||||
# requests
|
||||
# rfc3986
|
||||
importlib-metadata==6.0.0
|
||||
# via markdown
|
||||
joblib==1.2.0
|
||||
# via nltk
|
||||
lxml==4.9.2
|
||||
@ -42,6 +48,8 @@ lxml==4.9.2
|
||||
# python-docx
|
||||
# python-pptx
|
||||
# unstructured (setup.py)
|
||||
markdown==3.4.1
|
||||
# via unstructured (setup.py)
|
||||
monotonic==1.6
|
||||
# via argilla
|
||||
nltk==3.8.1
|
||||
@ -101,3 +109,5 @@ wrapt==1.14.1
|
||||
# deprecated
|
||||
xlsxwriter==3.0.8
|
||||
# via python-pptx
|
||||
zipp==3.15.0
|
||||
# via importlib-metadata
|
||||
|
@ -16,6 +16,8 @@ certifi==2022.12.7
|
||||
# requests
|
||||
charset-normalizer==3.0.1
|
||||
# via requests
|
||||
colorama==0.4.6
|
||||
# via sphinx
|
||||
docutils==0.18.1
|
||||
# via
|
||||
# sphinx
|
||||
|
@ -6,10 +6,6 @@
|
||||
#
|
||||
anyio==3.6.2
|
||||
# via jupyter-server
|
||||
appnope==0.1.3
|
||||
# via
|
||||
# ipykernel
|
||||
# ipython
|
||||
argon2-cffi==21.3.0
|
||||
# via
|
||||
# jupyter-server
|
||||
@ -35,6 +31,11 @@ cffi==1.15.1
|
||||
# via argon2-cffi-bindings
|
||||
click==8.1.3
|
||||
# via pip-tools
|
||||
colorama==0.4.6
|
||||
# via
|
||||
# build
|
||||
# click
|
||||
# ipython
|
||||
comm==0.1.2
|
||||
# via ipykernel
|
||||
debugpy==1.6.6
|
||||
@ -181,8 +182,6 @@ pandocfilters==1.5.0
|
||||
# via nbconvert
|
||||
parso==0.8.3
|
||||
# via jedi
|
||||
pexpect==4.8.0
|
||||
# via ipython
|
||||
pickleshare==0.7.5
|
||||
# via ipython
|
||||
pip-tools==6.12.2
|
||||
@ -202,10 +201,6 @@ prompt-toolkit==3.0.37
|
||||
# jupyter-console
|
||||
psutil==5.9.4
|
||||
# via ipykernel
|
||||
ptyprocess==0.7.0
|
||||
# via
|
||||
# pexpect
|
||||
# terminado
|
||||
pure-eval==0.2.2
|
||||
# via stack-data
|
||||
pycparser==2.21
|
||||
|
@ -22,6 +22,10 @@ click==8.1.3
|
||||
# via
|
||||
# nltk
|
||||
# sacremoses
|
||||
colorama==0.4.6
|
||||
# via
|
||||
# click
|
||||
# tqdm
|
||||
deprecated==1.2.13
|
||||
# via argilla
|
||||
et-xmlfile==1.1.0
|
||||
@ -43,6 +47,8 @@ idna==3.4
|
||||
# anyio
|
||||
# requests
|
||||
# rfc3986
|
||||
importlib-metadata==6.0.0
|
||||
# via markdown
|
||||
joblib==1.2.0
|
||||
# via
|
||||
# nltk
|
||||
@ -54,6 +60,8 @@ lxml==4.9.2
|
||||
# python-docx
|
||||
# python-pptx
|
||||
# unstructured (setup.py)
|
||||
markdown==3.4.1
|
||||
# via unstructured (setup.py)
|
||||
monotonic==1.6
|
||||
# via argilla
|
||||
nltk==3.8.1
|
||||
@ -146,3 +154,5 @@ wrapt==1.14.1
|
||||
# deprecated
|
||||
xlsxwriter==3.0.8
|
||||
# via python-pptx
|
||||
zipp==3.15.0
|
||||
# via importlib-metadata
|
||||
|
184
requirements/ingest-github.txt
Normal file
184
requirements/ingest-github.txt
Normal file
@ -0,0 +1,184 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile --extra=github --output-file=requirements/ingest-github.txt requirements/base.txt setup.py
|
||||
#
|
||||
anyio==3.6.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# httpcore
|
||||
argilla==1.3.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
backoff==2.2.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
certifi==2022.12.7
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# httpcore
|
||||
# httpx
|
||||
# requests
|
||||
# unstructured (setup.py)
|
||||
cffi==1.15.1
|
||||
# via pynacl
|
||||
charset-normalizer==3.0.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# requests
|
||||
click==8.1.3
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# nltk
|
||||
colorama==0.4.6
|
||||
# via
|
||||
# click
|
||||
# tqdm
|
||||
deprecated==1.2.13
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
# pygithub
|
||||
et-xmlfile==1.1.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# openpyxl
|
||||
h11==0.14.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# httpcore
|
||||
httpcore==0.16.3
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# httpx
|
||||
httpx==0.23.3
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
idna==3.4
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# anyio
|
||||
# requests
|
||||
# rfc3986
|
||||
joblib==1.2.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# nltk
|
||||
lxml==4.9.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# python-docx
|
||||
# python-pptx
|
||||
# unstructured (setup.py)
|
||||
monotonic==1.6
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
nltk==3.8.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
numpy==1.23.5
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
# pandas
|
||||
openpyxl==3.1.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
packaging==23.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
pandas==1.5.3
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
# unstructured (setup.py)
|
||||
pillow==9.4.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# python-pptx
|
||||
# unstructured (setup.py)
|
||||
pycparser==2.21
|
||||
# via cffi
|
||||
pydantic==1.10.4
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
pygithub==1.57.0
|
||||
# via unstructured (setup.py)
|
||||
pyjwt==2.6.0
|
||||
# via pygithub
|
||||
pynacl==1.5.0
|
||||
# via pygithub
|
||||
python-dateutil==2.8.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# pandas
|
||||
python-docx==0.8.11
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
python-magic==0.4.27
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
python-pptx==0.6.21
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
pytz==2022.7.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# pandas
|
||||
regex==2022.10.31
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# nltk
|
||||
requests==2.28.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# pygithub
|
||||
# unstructured (setup.py)
|
||||
rfc3986[idna2008]==1.5.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# httpx
|
||||
six==1.16.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# python-dateutil
|
||||
sniffio==1.3.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# anyio
|
||||
# httpcore
|
||||
# httpx
|
||||
tqdm==4.64.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
# nltk
|
||||
typing-extensions==4.4.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# pydantic
|
||||
urllib3==1.26.14
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# requests
|
||||
wrapt==1.14.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
# deprecated
|
||||
xlsxwriter==3.0.8
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# python-pptx
|
@ -37,6 +37,11 @@ click==8.1.3
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# nltk
|
||||
colorama==0.4.6
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# click
|
||||
# tqdm
|
||||
deprecated==1.2.13
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
@ -63,6 +68,10 @@ idna==3.4
|
||||
# anyio
|
||||
# requests
|
||||
# rfc3986
|
||||
importlib-metadata==6.0.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# markdown
|
||||
jmespath==1.0.1
|
||||
# via
|
||||
# boto3
|
||||
@ -77,6 +86,10 @@ lxml==4.9.2
|
||||
# python-docx
|
||||
# python-pptx
|
||||
# unstructured (setup.py)
|
||||
markdown==3.4.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
monotonic==1.6
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
@ -180,3 +193,7 @@ xlsxwriter==3.0.8
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# python-pptx
|
||||
zipp==3.15.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# importlib-metadata
|
||||
|
@ -30,6 +30,10 @@ click==8.1.3
|
||||
# via
|
||||
# nltk
|
||||
# uvicorn
|
||||
colorama==0.4.6
|
||||
# via
|
||||
# click
|
||||
# tqdm
|
||||
coloredlogs==15.0.1
|
||||
# via onnxruntime
|
||||
contourpy==1.0.7
|
||||
@ -74,6 +78,8 @@ idna==3.4
|
||||
# anyio
|
||||
# requests
|
||||
# rfc3986
|
||||
importlib-metadata==6.0.0
|
||||
# via markdown
|
||||
importlib-resources==5.12.0
|
||||
# via matplotlib
|
||||
iopath==0.1.10
|
||||
@ -91,6 +97,8 @@ lxml==4.9.2
|
||||
# python-docx
|
||||
# python-pptx
|
||||
# unstructured (setup.py)
|
||||
markdown==3.4.1
|
||||
# via unstructured (setup.py)
|
||||
matplotlib==3.7.0
|
||||
# via pycocotools
|
||||
monotonic==1.6
|
||||
@ -165,6 +173,8 @@ pydantic==1.10.5
|
||||
# fastapi
|
||||
pyparsing==3.0.9
|
||||
# via matplotlib
|
||||
pyreadline3==3.4.1
|
||||
# via humanfriendly
|
||||
pytesseract==0.3.10
|
||||
# via layoutparser
|
||||
python-dateutil==2.8.2
|
||||
|
@ -6,6 +6,7 @@ coverage
|
||||
click>=8.1
|
||||
flake8
|
||||
mypy
|
||||
types-Markdown
|
||||
pytest-cov
|
||||
# NOTE(robinson) - Currently tests do not pass with 0.0.18. Added the following
|
||||
# issue to address
|
||||
|
@ -80,6 +80,8 @@ tomli==2.0.1
|
||||
# coverage
|
||||
# mypy
|
||||
# pytest
|
||||
types-markdown==3.4.2.5
|
||||
# via -r requirements/test.in
|
||||
types-requests==2.28.11.15
|
||||
# via -r requirements/test.in
|
||||
types-urllib3==1.26.25.8
|
||||
|
6
setup.py
6
setup.py
@ -59,6 +59,7 @@ setup(
|
||||
"python-docx",
|
||||
"python-pptx",
|
||||
"python-magic",
|
||||
"markdown",
|
||||
"requests",
|
||||
# NOTE(robinson) - The following dependencies are pinned
|
||||
# to address security scans
|
||||
@ -77,6 +78,11 @@ setup(
|
||||
"unstructured-inference>=0.2.4,<0.2.8",
|
||||
],
|
||||
"s3": ["boto3"],
|
||||
"github": [
|
||||
# NOTE - pygithub at 1.58.0 fails due to https://github.com/PyGithub/PyGithub/issues/2436
|
||||
# In the future, we can update this to pygithub>1.58.0
|
||||
"pygithub==1.57.0",
|
||||
],
|
||||
"reddit": ["praw"],
|
||||
},
|
||||
package_dir={"unstructured": "unstructured"},
|
||||
|
93
test_unstructured/partition/test_md.py
Normal file
93
test_unstructured/partition/test_md.py
Normal file
@ -0,0 +1,93 @@
|
||||
import os
|
||||
import pathlib
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from unstructured.documents.elements import PageBreak
|
||||
from unstructured.partition.md import partition_md
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
|
||||
|
||||
def test_partition_md_from_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
|
||||
elements = partition_md(filename=filename)
|
||||
assert PageBreak() not in elements
|
||||
assert len(elements) > 0
|
||||
|
||||
|
||||
def test_partition_md_from_file():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
|
||||
with open(filename) as f:
|
||||
elements = partition_md(file=f)
|
||||
assert len(elements) > 0
|
||||
|
||||
|
||||
def test_partition_md_from_text():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
|
||||
with open(filename) as f:
|
||||
text = f.read()
|
||||
elements = partition_md(text=text)
|
||||
assert len(elements) > 0
|
||||
|
||||
|
||||
class MockResponse:
|
||||
def __init__(self, text, status_code, headers={}):
|
||||
self.text = text
|
||||
self.status_code = status_code
|
||||
self.ok = status_code < 300
|
||||
self.headers = headers
|
||||
|
||||
|
||||
def test_partition_md_from_url():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
|
||||
with open(filename) as f:
|
||||
text = f.read()
|
||||
|
||||
response = MockResponse(text=text, status_code=200, headers={"Content-Type": "text/markdown"})
|
||||
with patch.object(requests, "get", return_value=response) as _:
|
||||
elements = partition_md(url="https://fake.url")
|
||||
|
||||
assert len(elements) > 0
|
||||
|
||||
|
||||
def test_partition_md_from_url_raises_with_bad_status_code():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
|
||||
with open(filename) as f:
|
||||
text = f.read()
|
||||
|
||||
response = MockResponse(text=text, status_code=500, headers={"Content-Type": "text/html"})
|
||||
with patch.object(requests, "get", return_value=response) as _:
|
||||
with pytest.raises(ValueError):
|
||||
partition_md(url="https://fake.url")
|
||||
|
||||
|
||||
def test_partition_md_from_url_raises_with_bad_content_type():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
|
||||
with open(filename) as f:
|
||||
text = f.read()
|
||||
|
||||
response = MockResponse(
|
||||
text=text,
|
||||
status_code=200,
|
||||
headers={"Content-Type": "application/json"},
|
||||
)
|
||||
with patch.object(requests, "get", return_value=response) as _:
|
||||
with pytest.raises(ValueError):
|
||||
partition_md(url="https://fake.url")
|
||||
|
||||
|
||||
def test_partition_md_raises_with_none_specified():
|
||||
with pytest.raises(ValueError):
|
||||
partition_md()
|
||||
|
||||
|
||||
def test_partition_md_raises_with_too_many_specified():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
|
||||
with open(filename) as f:
|
||||
text = f.read()
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
partition_md(filename=filename, text=text)
|
@ -0,0 +1,110 @@
|
||||
[
|
||||
{
|
||||
"element_id": "6f348994832b2ad6127af4f7f1736f67",
|
||||
"text": "Downloadify: Client Side File Creation",
|
||||
"type": "Title",
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"element_id": "074ac796e8f463c50a5d2ec4d047a5b7",
|
||||
"text": "JavaScript + Flash Library",
|
||||
"type": "Title",
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"element_id": "8dc8800e5660b2558bb7f5f5416ca498",
|
||||
"text": "Copyright (c) 2009 Douglas C. Neiner",
|
||||
"type": "Title",
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"element_id": "eb281d7b00a856779aaca7d1ec5197a7",
|
||||
"text": "Permission is hereby granted, free of charge, to any person obtaining a copy",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"element_id": "b41e880594419467436d152970f36710",
|
||||
"text": "of this software and associated documentation files (the \"Software\"), to deal",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"element_id": "a9578931575204db7971aa2e85137083",
|
||||
"text": "in the Software without restriction, including without limitation the rights",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"element_id": "7105a363bc50eba8e93f676dbb0bd145",
|
||||
"text": "to use, copy, modify, merge, publish, distribute, sublicense, and/or sell",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"element_id": "10e76e02d2ddc0fa91590e65249dbbb5",
|
||||
"text": "copies of the Software, and to permit persons to whom the Software is",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"element_id": "cb2b93515ca0dd50850fd3e1491bf06c",
|
||||
"text": "furnished to do so, subject to the following conditions:",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"element_id": "ace17038b2bfb49c3882a23be243c016",
|
||||
"text": "The above copyright notice and this permission notice shall be included in",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"element_id": "4d1f5dcef281e3f580a6c6156a298960",
|
||||
"text": "all copies or substantial portions of the Software.",
|
||||
"type": "Title",
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"element_id": "58dab889725677ddc5a270a07df8395e",
|
||||
"text": "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"element_id": "b47e700b9d4e04e4670448bb39067ed2",
|
||||
"text": "IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"element_id": "ff5d35f4e0324c8499b81980c7da4b7c",
|
||||
"text": "FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"element_id": "6dc498cbd6e27db10da2431cfcc32e90",
|
||||
"text": "AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"element_id": "cb64ba82bcfdc75c8d68da657159e00d",
|
||||
"text": "LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"element_id": "ed18e41c2aa38a20e0c256fdc28b7243",
|
||||
"text": "OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN",
|
||||
"type": "UncategorizedText",
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"element_id": "76d8377ccb0743b6c7de1f85b60f3955",
|
||||
"text": "THE SOFTWARE.",
|
||||
"type": "Title",
|
||||
"metadata": {}
|
||||
}
|
||||
]
|
@ -0,0 +1,50 @@
|
||||
[
|
||||
{
|
||||
"element_id": "56a9f768a0968be676f9addd5ec3032e",
|
||||
"text": "Downloadify Example",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "d551bbfc9477547e4dce6264d8196c7b",
|
||||
"text": "More info available at the Github Project Page",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "971b974235a86ca628dcc713d6e2e8d9",
|
||||
"text": "Filename",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "43f65b1c5bd47774b25c72e2f96de300",
|
||||
"text": "File Contents\n\nWhatever you put in this text box will be downloaded and saved in the file. If you leave it blank, no file will be downloaded",
|
||||
"type": "UncategorizedText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "53a4db70c6d40ed5206711ed8a255e03",
|
||||
"text": "You must have Flash 10 installed to download this file.",
|
||||
"type": "UncategorizedText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "839973fba0c850f1729fad098b031203",
|
||||
"text": "Downloadify Invoke Script For This Page",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
}
|
||||
]
|
18
test_unstructured_ingest/test-ingest-github.sh
Executable file
18
test_unstructured_ingest/test-ingest-github.sh
Executable file
@ -0,0 +1,18 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||
cd "$SCRIPT_DIR"/.. || exit 1
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py --github-url dcneiner/Downloadify --github-file-glob '*.html,*.txt' --structured-output-dir github-downloadify-output --verbose
|
||||
|
||||
if ! diff -ru github-downloadify-output test_unstructured_ingest/expected-structured-output/github-downloadify ; then
|
||||
echo
|
||||
echo "There are differences from the previously checked-in structured outputs."
|
||||
echo
|
||||
echo "If these differences are acceptable, copy the outputs from"
|
||||
echo "s3-small-batch-output/ to test_unstructured_ingest/expected-structured-output/s3-small-batch/ after running"
|
||||
echo
|
||||
echo " PYTHONPATH=. python examples/ingest/s3-small-batch/main.py --structured-output-dir s3-small-batch-output"
|
||||
echo
|
||||
exit 1
|
||||
fi
|
@ -6,3 +6,4 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||
cd "$SCRIPT_DIR"/.. || exit 1
|
||||
|
||||
./test_unstructured_ingest/test-ingest-s3.sh
|
||||
./test_unstructured_ingest/test-ingest-github.sh
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.4.16-dev4" # pragma: no cover
|
||||
__version__ = "0.4.16-dev5" # pragma: no cover
|
||||
|
@ -38,6 +38,11 @@ PPT_MIME_TYPES = [
|
||||
"application/vnd.ms-powerpoint",
|
||||
]
|
||||
|
||||
MD_MIME_TYPES = [
|
||||
"text/markdown",
|
||||
"text/x-markdown",
|
||||
]
|
||||
|
||||
# NOTE(robinson) - .docx.xlsx files are actually zip file with a .docx/.xslx extension.
|
||||
# If the MIME type is application/octet-stream, we check if it's a .docx/.xlsx file by
|
||||
# looking for expected filenames within the zip file.
|
||||
@ -83,6 +88,7 @@ class FileType(Enum):
|
||||
# Markup Types
|
||||
HTML = 50
|
||||
XML = 51
|
||||
MD = 52
|
||||
|
||||
# Compressed Types
|
||||
ZIP = 60
|
||||
@ -102,6 +108,7 @@ EXT_TO_FILETYPE = {
|
||||
".eml": FileType.EML,
|
||||
".xml": FileType.XML,
|
||||
".html": FileType.HTML,
|
||||
".md": FileType.MD,
|
||||
".xlsx": FileType.XLSX,
|
||||
".pptx": FileType.PPTX,
|
||||
".png": FileType.PNG,
|
||||
@ -160,16 +167,18 @@ def detect_filetype(
|
||||
elif mime_type == "image/png":
|
||||
return FileType.PNG
|
||||
|
||||
elif mime_type in MD_MIME_TYPES:
|
||||
# NOTE - I am not sure whether libmagic ever returns these mimetypes.
|
||||
return FileType.MD
|
||||
|
||||
elif mime_type == "text/plain":
|
||||
if extension and extension == ".eml":
|
||||
return FileType.EML
|
||||
if file and not extension:
|
||||
if _check_eml_from_buffer(file=file) is True:
|
||||
return FileType.EML
|
||||
else:
|
||||
return FileType.TXT
|
||||
else:
|
||||
return FileType.TXT
|
||||
if extension and extension == ".md":
|
||||
return FileType.MD
|
||||
if file and not extension and _check_eml_from_buffer(file=file) is True:
|
||||
return FileType.EML
|
||||
return FileType.TXT
|
||||
|
||||
elif mime_type.endswith("xml"):
|
||||
if extension and extension == ".html":
|
||||
|
201
unstructured/ingest/connector/github.py
Normal file
201
unstructured/ingest/connector/github.py
Normal file
@ -0,0 +1,201 @@
|
||||
import fnmatch
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
|
||||
from unstructured.ingest.interfaces import (
|
||||
BaseConnector,
|
||||
BaseConnectorConfig,
|
||||
BaseIngestDoc,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from github.Repository import Repository
|
||||
|
||||
|
||||
@dataclass
|
||||
class SimpleGitHubConfig(BaseConnectorConfig):
|
||||
github_url: str
|
||||
github_access_token: Optional[str]
|
||||
github_branch: Optional[str]
|
||||
github_file_glob: Optional[str]
|
||||
|
||||
# Standard Connector options
|
||||
download_dir: str
|
||||
# where to write structured data, with the directory structure matching the github repository
|
||||
output_dir: str
|
||||
preserve_downloads: bool = False
|
||||
re_download: bool = False
|
||||
verbose: bool = False
|
||||
|
||||
repo_owner: str = field(init=False, repr=False)
|
||||
repo_name: str = field(init=False, repr=False)
|
||||
|
||||
def __post_init__(self):
|
||||
parsed_gh_url = urlparse(self.github_url)
|
||||
path_fragments = [fragment for fragment in parsed_gh_url.path.split("/") if fragment]
|
||||
|
||||
# If a scheme and netloc are provided, ensure they are correct
|
||||
# Additionally, ensure that the path contains two fragments
|
||||
if (
|
||||
(parsed_gh_url.scheme and parsed_gh_url.scheme != "https")
|
||||
or (parsed_gh_url.netloc and parsed_gh_url.netloc != "github.com")
|
||||
or len(path_fragments) != 2
|
||||
):
|
||||
raise ValueError(
|
||||
'Please provide a valid URL, e.g. "https://github.com/Unstructured-IO/unstructured"'
|
||||
' or a repository owner/name pair, e.g. "Unstructured-IO/unstructured".',
|
||||
)
|
||||
|
||||
# If there's no issues, store the core repository info
|
||||
self.repo_owner = path_fragments[0]
|
||||
self.repo_name = path_fragments[1]
|
||||
|
||||
|
||||
@dataclass
|
||||
class GitHubIngestDoc(BaseIngestDoc):
|
||||
config: SimpleGitHubConfig = field(repr=False)
|
||||
repo: "Repository"
|
||||
path: str
|
||||
|
||||
@property
|
||||
def filename(self):
|
||||
return (Path(self.config.download_dir) / self.path).resolve()
|
||||
|
||||
def _output_filename(self):
|
||||
return Path(self.config.output_dir) / f"{self.path}.json"
|
||||
|
||||
def _create_full_tmp_dir_path(self):
|
||||
"""includes directories in in the github repository"""
|
||||
self.filename.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def cleanup_file(self):
|
||||
"""Removes the local copy the file (or anything else) after successful processing."""
|
||||
if not self.config.preserve_downloads:
|
||||
if self.config.verbose:
|
||||
print(f"cleaning up {self}")
|
||||
os.unlink(self.filename)
|
||||
|
||||
def get_file(self):
|
||||
"""Fetches the "remote" doc and stores it locally on the filesystem."""
|
||||
self._create_full_tmp_dir_path()
|
||||
if not self.config.re_download and self.filename.is_file() and self.filename.stat():
|
||||
if self.config.verbose:
|
||||
print(f"File exists: {self.filename}, skipping download")
|
||||
return
|
||||
|
||||
if self.config.verbose:
|
||||
print(f"fetching {self} - PID: {os.getpid()}")
|
||||
content_file = self.repo.get_contents(self.path)
|
||||
contents = b""
|
||||
if (
|
||||
not content_file.content # type: ignore
|
||||
and content_file.encoding == "none" # type: ignore
|
||||
and content_file.size # type: ignore
|
||||
):
|
||||
print("File too large for the GitHub API, using direct download link instead.")
|
||||
response = requests.get(content_file.download_url) # type: ignore
|
||||
if response.status_code != 200:
|
||||
print("Direct download link has failed... Skipping this file.")
|
||||
else:
|
||||
contents = response.content
|
||||
else:
|
||||
contents = content_file.decoded_content # type: ignore
|
||||
|
||||
with open(self.filename, "wb") as f:
|
||||
f.write(contents)
|
||||
|
||||
def has_output(self):
|
||||
"""Determine if structured output for this doc already exists."""
|
||||
output_filename = self._output_filename()
|
||||
return output_filename.is_file() and output_filename.stat()
|
||||
|
||||
def write_result(self):
|
||||
"""Write the structured json result for this doc. result must be json serializable."""
|
||||
output_filename = self._output_filename()
|
||||
output_filename.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(output_filename, "w", encoding="utf8") as output_f:
|
||||
json.dump(self.isd_elems_no_filename, output_f, ensure_ascii=False, indent=2)
|
||||
print(f"Wrote {output_filename}")
|
||||
|
||||
|
||||
class GitHubConnector(BaseConnector):
|
||||
def __init__(self, config: SimpleGitHubConfig):
|
||||
from github import Github
|
||||
|
||||
self.config = config
|
||||
self.github = Github(self.config.github_access_token)
|
||||
self.cleanup_files = not config.preserve_downloads
|
||||
|
||||
def cleanup(self, cur_dir=None):
|
||||
if not self.cleanup_files:
|
||||
return
|
||||
|
||||
if cur_dir is None:
|
||||
cur_dir = self.config.download_dir
|
||||
sub_dirs = os.listdir(cur_dir)
|
||||
os.chdir(cur_dir)
|
||||
for sub_dir in sub_dirs:
|
||||
# don't traverse symlinks, not that there every should be any
|
||||
if os.path.isdir(sub_dir) and not os.path.islink(sub_dir):
|
||||
self.cleanup(sub_dir)
|
||||
os.chdir("..")
|
||||
if len(os.listdir(cur_dir)) == 0:
|
||||
os.rmdir(cur_dir)
|
||||
|
||||
def initialize(self):
|
||||
pass
|
||||
|
||||
def is_file_type_supported(self, path: str) -> bool:
|
||||
# Workaround to ensure that auto.partition isn't fed with .yaml, .py, etc. files
|
||||
# TODO: What to do with no filenames? e.g. LICENSE, Makefile, etc.
|
||||
supported = path.endswith(
|
||||
(
|
||||
".md",
|
||||
".txt",
|
||||
".pdf",
|
||||
".doc",
|
||||
".docx",
|
||||
".eml",
|
||||
".html",
|
||||
".png",
|
||||
".jpg",
|
||||
".ppt",
|
||||
".pptx",
|
||||
".xml",
|
||||
),
|
||||
)
|
||||
if not supported and self.config.verbose:
|
||||
print(f"The file {path!r} is discarded as it does not contain a supported filetype.")
|
||||
return supported
|
||||
|
||||
def does_path_match_glob(self, path: str) -> bool:
|
||||
if not self.config.github_file_glob:
|
||||
return True
|
||||
patterns = self.config.github_file_glob.split(",")
|
||||
for pattern in patterns:
|
||||
if fnmatch.filter([path], pattern):
|
||||
return True
|
||||
if self.config.verbose:
|
||||
print(f"The file {path!r} is discarded as it does not match any given glob.")
|
||||
return False
|
||||
|
||||
def get_ingest_docs(self):
|
||||
repo = self.github.get_repo(f"{self.config.repo_owner}/{self.config.repo_name}")
|
||||
|
||||
# Load the Git tree with all files, and then create Ingest docs
|
||||
# for all blobs, i.e. all files, ignoring directories
|
||||
sha = self.config.github_branch or repo.default_branch
|
||||
git_tree = repo.get_git_tree(sha, recursive=True)
|
||||
return [
|
||||
GitHubIngestDoc(self.config, repo, element.path)
|
||||
for element in git_tree.tree
|
||||
if element.type == "blob"
|
||||
and self.is_file_type_supported(element.path)
|
||||
and (not self.config.github_file_glob or self.does_path_match_glob(element.path))
|
||||
]
|
@ -24,8 +24,6 @@ class SimpleS3Config(BaseConnectorConfig):
|
||||
output_dir: str
|
||||
re_download: bool = False
|
||||
preserve_downloads: bool = False
|
||||
# if a structured output .json file already exists, do not reprocess an s3 file to overwrite it
|
||||
reprocess: bool = False
|
||||
verbose: bool = False
|
||||
|
||||
# S3 Specific (optional)
|
||||
|
@ -46,8 +46,6 @@ class BaseConnectorConfig(ABC):
|
||||
# where to write structured data outputs
|
||||
output_dir: str
|
||||
re_download: bool = False
|
||||
# if a structured output .json file already exists for a given doc, do not reprocess
|
||||
reprocess: bool = False
|
||||
verbose: bool = False
|
||||
|
||||
|
||||
@ -96,7 +94,7 @@ class BaseIngestDoc(ABC):
|
||||
self.isd_elems_no_filename = []
|
||||
for elem in isd_elems:
|
||||
# type: ignore
|
||||
elem["metadata"].pop("filename") # type: ignore[attr-defined]
|
||||
elem["metadata"].pop("filename", None) # type: ignore[attr-defined]
|
||||
elem.pop("coordinates") # type: ignore[attr-defined]
|
||||
self.isd_elems_no_filename.append(elem)
|
||||
|
||||
|
@ -6,6 +6,7 @@ import sys
|
||||
|
||||
import click
|
||||
|
||||
from unstructured.ingest.connector.github import GitHubConnector, SimpleGitHubConfig
|
||||
from unstructured.ingest.connector.reddit import RedditConnector, SimpleRedditConfig
|
||||
from unstructured.ingest.connector.s3_connector import S3Connector, SimpleS3Config
|
||||
from unstructured.ingest.doc_processor.generalized import initialize, process_document
|
||||
@ -79,6 +80,29 @@ class MainProcess:
|
||||
default=False,
|
||||
help="Connect to s3 without local AWS credentials.",
|
||||
)
|
||||
@click.option(
|
||||
"--github-url",
|
||||
default=None,
|
||||
help='URL to GitHub repository, e.g. "https://github.com/Unstructured-IO/unstructured",'
|
||||
' or a repository owner/name pair, e.g. "Unstructured-IO/unstructured"',
|
||||
)
|
||||
@click.option(
|
||||
"--github-access-token",
|
||||
default=None,
|
||||
help="A GitHub access token, see https://docs.github.com/en/authentication",
|
||||
)
|
||||
@click.option(
|
||||
"--github-branch",
|
||||
default=None,
|
||||
help="The branch for which to fetch files from. If not given,"
|
||||
" the default repository branch is used.",
|
||||
)
|
||||
@click.option(
|
||||
"--github-file-glob",
|
||||
default=None,
|
||||
help="A comma-separated list of file globs to limit which types of files are accepted,"
|
||||
" e.g. '*.html,*.txt'",
|
||||
)
|
||||
@click.option(
|
||||
"--subreddit-name",
|
||||
default=None,
|
||||
@ -148,6 +172,10 @@ class MainProcess:
|
||||
@click.option("-v", "--verbose", is_flag=True, default=False)
|
||||
def main(
|
||||
s3_url,
|
||||
github_url,
|
||||
github_access_token,
|
||||
github_branch,
|
||||
github_file_glob,
|
||||
subreddit_name,
|
||||
reddit_client_id,
|
||||
reddit_client_secret,
|
||||
@ -182,6 +210,21 @@ def main(
|
||||
verbose=verbose,
|
||||
),
|
||||
)
|
||||
elif github_url:
|
||||
doc_connector = GitHubConnector( # type: ignore
|
||||
config=SimpleGitHubConfig(
|
||||
github_url=github_url,
|
||||
github_access_token=github_access_token,
|
||||
github_branch=github_branch,
|
||||
github_file_glob=github_file_glob,
|
||||
# defaults params:
|
||||
download_dir=download_dir,
|
||||
preserve_downloads=preserve_downloads,
|
||||
output_dir=structured_output_dir,
|
||||
re_download=re_download,
|
||||
verbose=verbose,
|
||||
),
|
||||
)
|
||||
elif subreddit_name:
|
||||
doc_connector = RedditConnector( # type: ignore
|
||||
config=SimpleRedditConfig(
|
||||
|
@ -6,6 +6,7 @@ from unstructured.partition.docx import partition_docx
|
||||
from unstructured.partition.email import partition_email
|
||||
from unstructured.partition.html import partition_html
|
||||
from unstructured.partition.image import partition_image
|
||||
from unstructured.partition.md import partition_md
|
||||
from unstructured.partition.pdf import partition_pdf
|
||||
from unstructured.partition.ppt import partition_ppt
|
||||
from unstructured.partition.pptx import partition_pptx
|
||||
@ -44,6 +45,8 @@ def partition(
|
||||
return partition_email(filename=filename, file=file)
|
||||
elif filetype == FileType.HTML:
|
||||
return partition_html(filename=filename, file=file, include_page_breaks=include_page_breaks)
|
||||
elif filetype == FileType.MD:
|
||||
return partition_md(filename=filename, file=file, include_page_breaks=include_page_breaks)
|
||||
elif filetype == FileType.PDF:
|
||||
return partition_pdf(
|
||||
filename=filename, # type: ignore
|
||||
|
60
unstructured/partition/md.py
Normal file
60
unstructured/partition/md.py
Normal file
@ -0,0 +1,60 @@
|
||||
from typing import IO, List, Optional, Union
|
||||
|
||||
import markdown
|
||||
import requests
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.documents.xml import VALID_PARSERS
|
||||
from unstructured.partition.html import partition_html
|
||||
|
||||
|
||||
def optional_decode(contents: Union[str, bytes]) -> str:
    """Return *contents* as str, decoding from UTF-8 when it arrives as bytes."""
    return contents.decode("utf-8") if isinstance(contents, bytes) else contents
def partition_md(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    text: Optional[str] = None,
    url: Optional[str] = None,
    include_page_breaks: bool = False,
    include_metadata: bool = True,
    parser: VALID_PARSERS = None,
) -> List[Element]:
    """Partition a markdown document into a list of document ``Element``s.

    Exactly one source must be provided:

    Parameters
    ----------
    filename: path to a markdown file on disk.
    file: a file-like object opened for reading markdown content.
    text: markdown content as a string.
    url: URL to fetch; the response must have a text/markdown content type.
    include_page_breaks: forwarded to partition_html.
    include_metadata: forwarded to partition_html.
    parser: optional XML/HTML parser choice, forwarded to partition_html.

    Raises
    ------
    ValueError: if zero or more than one source is given, if the URL fetch
        fails, or if the URL response is not text/markdown.
    """
    if not any([filename, file, text, url]):
        # NOTE: url is a valid sole argument, so it belongs in the message.
        raise ValueError("One of filename, file, text, or url must be specified.")

    if filename is not None and not file and not text and not url:
        with open(filename, encoding="utf8") as f:
            text = optional_decode(f.read())

    elif file is not None and not filename and not text and not url:
        text = optional_decode(file.read())

    elif text is not None and not filename and not file and not url:
        pass

    elif url is not None and not filename and not file and not text:
        response = requests.get(url)
        if not response.ok:
            raise ValueError(f"URL returned an error: {response.status_code}")

        content_type = response.headers.get("Content-Type", "")
        if not content_type.startswith("text/markdown"):
            raise ValueError(f"Expected content type text/markdown. Got {content_type}.")

        text = response.text

    else:
        raise ValueError("Only one of filename, file, text, or url can be specified.")

    # Convert markdown to HTML, then reuse the HTML partitioner for element
    # extraction so markdown gets the same element taxonomy as HTML input.
    html = markdown.markdown(text)

    return partition_html(
        text=html,
        include_page_breaks=include_page_breaks,
        include_metadata=include_metadata,
        parser=parser,
    )
|
Loading…
x
Reference in New Issue
Block a user