feat: Add GitHub data connector; add Markdown partitioner (#284)

This commit is contained in:
Tom Aarsen 2023-02-27 23:36:44 +01:00 committed by GitHub
parent c89bba100f
commit ded60afda9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
27 changed files with 872 additions and 24 deletions

View File

@ -108,6 +108,7 @@ jobs:
make test
make check-coverage
make install-ingest-s3
make install-ingest-github
./test_unstructured_ingest/test-ingest.sh
changelog:

View File

@ -1,4 +1,4 @@
## 0.4.16-dev4
## 0.4.16-dev5
### Enhancements
@ -7,6 +7,8 @@
### Features
* Added setup script for Ubuntu
* Added GitHub connector for ingest cli.
* Added `partition_md` partitioner.
* Added Reddit connector for ingest cli.
### Fixes

View File

@ -54,6 +54,10 @@ install-build:
install-ingest-s3:
pip install -r requirements/ingest-s3.txt
.PHONY: install-ingest-github
install-ingest-github:
pip install -r requirements/ingest-github.txt
.PHONY: install-ingest-reddit
install-ingest-reddit:
pip install -r requirements/ingest-reddit.txt
@ -88,6 +92,7 @@ pip-compile:
cp requirements/build.txt docs/requirements.txt
pip-compile --upgrade --extra=s3 --output-file=requirements/ingest-s3.txt requirements/base.txt setup.py
pip-compile --upgrade --extra=reddit --output-file=requirements/ingest-reddit.txt requirements/base.txt setup.py
pip-compile --upgrade --extra=github --output-file=requirements/ingest-github.txt requirements/base.txt setup.py
## install-project-local: install unstructured into your local python environment
.PHONY: install-project-local

View File

@ -0,0 +1,19 @@
#!/usr/bin/env bash

# Processes the Unstructured-IO/unstructured repository
# through Unstructured's library in 2 processes.

# Structured outputs are stored in github-ingest-output/

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Run from the repository root so relative paths below resolve.
cd "$SCRIPT_DIR"/../../.. || exit 1

# PYTHONPATH=. lets main.py import the local package without installing it.
PYTHONPATH=. ./unstructured/ingest/main.py \
    --github-url Unstructured-IO/unstructured \
    --github-branch main \
    --structured-output-dir github-ingest-output \
    --num-processes 2 \
    --verbose

# Alternatively, you can call it using:
# unstructured-ingest --github-url ...

View File

@ -20,6 +20,10 @@ charset-normalizer==3.0.1
# via requests
click==8.1.3
# via nltk
colorama==0.4.6
# via
# click
# tqdm
deprecated==1.2.13
# via argilla
et-xmlfile==1.1.0
@ -35,6 +39,8 @@ idna==3.4
# anyio
# requests
# rfc3986
importlib-metadata==6.0.0
# via markdown
joblib==1.2.0
# via nltk
lxml==4.9.2
@ -42,6 +48,8 @@ lxml==4.9.2
# python-docx
# python-pptx
# unstructured (setup.py)
markdown==3.4.1
# via unstructured (setup.py)
monotonic==1.6
# via argilla
nltk==3.8.1
@ -101,3 +109,5 @@ wrapt==1.14.1
# deprecated
xlsxwriter==3.0.8
# via python-pptx
zipp==3.15.0
# via importlib-metadata

View File

@ -16,6 +16,8 @@ certifi==2022.12.7
# requests
charset-normalizer==3.0.1
# via requests
colorama==0.4.6
# via sphinx
docutils==0.18.1
# via
# sphinx

View File

@ -6,10 +6,6 @@
#
anyio==3.6.2
# via jupyter-server
appnope==0.1.3
# via
# ipykernel
# ipython
argon2-cffi==21.3.0
# via
# jupyter-server
@ -35,6 +31,11 @@ cffi==1.15.1
# via argon2-cffi-bindings
click==8.1.3
# via pip-tools
colorama==0.4.6
# via
# build
# click
# ipython
comm==0.1.2
# via ipykernel
debugpy==1.6.6
@ -181,8 +182,6 @@ pandocfilters==1.5.0
# via nbconvert
parso==0.8.3
# via jedi
pexpect==4.8.0
# via ipython
pickleshare==0.7.5
# via ipython
pip-tools==6.12.2
@ -202,10 +201,6 @@ prompt-toolkit==3.0.37
# jupyter-console
psutil==5.9.4
# via ipykernel
ptyprocess==0.7.0
# via
# pexpect
# terminado
pure-eval==0.2.2
# via stack-data
pycparser==2.21

View File

@ -22,6 +22,10 @@ click==8.1.3
# via
# nltk
# sacremoses
colorama==0.4.6
# via
# click
# tqdm
deprecated==1.2.13
# via argilla
et-xmlfile==1.1.0
@ -43,6 +47,8 @@ idna==3.4
# anyio
# requests
# rfc3986
importlib-metadata==6.0.0
# via markdown
joblib==1.2.0
# via
# nltk
@ -54,6 +60,8 @@ lxml==4.9.2
# python-docx
# python-pptx
# unstructured (setup.py)
markdown==3.4.1
# via unstructured (setup.py)
monotonic==1.6
# via argilla
nltk==3.8.1
@ -146,3 +154,5 @@ wrapt==1.14.1
# deprecated
xlsxwriter==3.0.8
# via python-pptx
zipp==3.15.0
# via importlib-metadata

View File

@ -0,0 +1,184 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile --extra=github --output-file=requirements/ingest-github.txt requirements/base.txt setup.py
#
anyio==3.6.2
# via
# -r requirements/base.txt
# httpcore
argilla==1.3.0
# via
# -r requirements/base.txt
# unstructured (setup.py)
backoff==2.2.1
# via
# -r requirements/base.txt
# argilla
certifi==2022.12.7
# via
# -r requirements/base.txt
# httpcore
# httpx
# requests
# unstructured (setup.py)
cffi==1.15.1
# via pynacl
charset-normalizer==3.0.1
# via
# -r requirements/base.txt
# requests
click==8.1.3
# via
# -r requirements/base.txt
# nltk
colorama==0.4.6
# via
# click
# tqdm
deprecated==1.2.13
# via
# -r requirements/base.txt
# argilla
# pygithub
et-xmlfile==1.1.0
# via
# -r requirements/base.txt
# openpyxl
h11==0.14.0
# via
# -r requirements/base.txt
# httpcore
httpcore==0.16.3
# via
# -r requirements/base.txt
# httpx
httpx==0.23.3
# via
# -r requirements/base.txt
# argilla
idna==3.4
# via
# -r requirements/base.txt
# anyio
# requests
# rfc3986
joblib==1.2.0
# via
# -r requirements/base.txt
# nltk
lxml==4.9.2
# via
# -r requirements/base.txt
# python-docx
# python-pptx
# unstructured (setup.py)
monotonic==1.6
# via
# -r requirements/base.txt
# argilla
nltk==3.8.1
# via
# -r requirements/base.txt
# unstructured (setup.py)
numpy==1.23.5
# via
# -r requirements/base.txt
# argilla
# pandas
openpyxl==3.1.1
# via
# -r requirements/base.txt
# unstructured (setup.py)
packaging==23.0
# via
# -r requirements/base.txt
# argilla
pandas==1.5.3
# via
# -r requirements/base.txt
# argilla
# unstructured (setup.py)
pillow==9.4.0
# via
# -r requirements/base.txt
# python-pptx
# unstructured (setup.py)
pycparser==2.21
# via cffi
pydantic==1.10.4
# via
# -r requirements/base.txt
# argilla
pygithub==1.57.0
# via unstructured (setup.py)
pyjwt==2.6.0
# via pygithub
pynacl==1.5.0
# via pygithub
python-dateutil==2.8.2
# via
# -r requirements/base.txt
# pandas
python-docx==0.8.11
# via
# -r requirements/base.txt
# unstructured (setup.py)
python-magic==0.4.27
# via
# -r requirements/base.txt
# unstructured (setup.py)
python-pptx==0.6.21
# via
# -r requirements/base.txt
# unstructured (setup.py)
pytz==2022.7.1
# via
# -r requirements/base.txt
# pandas
regex==2022.10.31
# via
# -r requirements/base.txt
# nltk
requests==2.28.2
# via
# -r requirements/base.txt
# pygithub
# unstructured (setup.py)
rfc3986[idna2008]==1.5.0
# via
# -r requirements/base.txt
# httpx
six==1.16.0
# via
# -r requirements/base.txt
# python-dateutil
sniffio==1.3.0
# via
# -r requirements/base.txt
# anyio
# httpcore
# httpx
tqdm==4.64.1
# via
# -r requirements/base.txt
# argilla
# nltk
typing-extensions==4.4.0
# via
# -r requirements/base.txt
# pydantic
urllib3==1.26.14
# via
# -r requirements/base.txt
# requests
wrapt==1.14.1
# via
# -r requirements/base.txt
# argilla
# deprecated
xlsxwriter==3.0.8
# via
# -r requirements/base.txt
# python-pptx

View File

@ -37,6 +37,11 @@ click==8.1.3
# via
# -r requirements/base.txt
# nltk
colorama==0.4.6
# via
# -r requirements/base.txt
# click
# tqdm
deprecated==1.2.13
# via
# -r requirements/base.txt
@ -63,6 +68,10 @@ idna==3.4
# anyio
# requests
# rfc3986
importlib-metadata==6.0.0
# via
# -r requirements/base.txt
# markdown
jmespath==1.0.1
# via
# boto3
@ -77,6 +86,10 @@ lxml==4.9.2
# python-docx
# python-pptx
# unstructured (setup.py)
markdown==3.4.1
# via
# -r requirements/base.txt
# unstructured (setup.py)
monotonic==1.6
# via
# -r requirements/base.txt
@ -180,3 +193,7 @@ xlsxwriter==3.0.8
# via
# -r requirements/base.txt
# python-pptx
zipp==3.15.0
# via
# -r requirements/base.txt
# importlib-metadata

View File

@ -30,6 +30,10 @@ click==8.1.3
# via
# nltk
# uvicorn
colorama==0.4.6
# via
# click
# tqdm
coloredlogs==15.0.1
# via onnxruntime
contourpy==1.0.7
@ -74,6 +78,8 @@ idna==3.4
# anyio
# requests
# rfc3986
importlib-metadata==6.0.0
# via markdown
importlib-resources==5.12.0
# via matplotlib
iopath==0.1.10
@ -91,6 +97,8 @@ lxml==4.9.2
# python-docx
# python-pptx
# unstructured (setup.py)
markdown==3.4.1
# via unstructured (setup.py)
matplotlib==3.7.0
# via pycocotools
monotonic==1.6
@ -165,6 +173,8 @@ pydantic==1.10.5
# fastapi
pyparsing==3.0.9
# via matplotlib
pyreadline3==3.4.1
# via humanfriendly
pytesseract==0.3.10
# via layoutparser
python-dateutil==2.8.2

View File

@ -6,6 +6,7 @@ coverage
click>=8.1
flake8
mypy
types-Markdown
pytest-cov
# NOTE(robinson) - Currently tests do not pass with 0.0.18. Added the following
# issue to address

View File

@ -80,6 +80,8 @@ tomli==2.0.1
# coverage
# mypy
# pytest
types-markdown==3.4.2.5
# via -r requirements/test.in
types-requests==2.28.11.15
# via -r requirements/test.in
types-urllib3==1.26.25.8

View File

@ -59,6 +59,7 @@ setup(
"python-docx",
"python-pptx",
"python-magic",
"markdown",
"requests",
# NOTE(robinson) - The following dependencies are pinned
# to address security scans
@ -77,6 +78,11 @@ setup(
"unstructured-inference>=0.2.4,<0.2.8",
],
"s3": ["boto3"],
"github": [
# NOTE - pygithub at 1.58.0 fails due to https://github.com/PyGithub/PyGithub/issues/2436
# In the future, we can update this to pygithub>1.58.0
"pygithub==1.57.0",
],
"reddit": ["praw"],
},
package_dir={"unstructured": "unstructured"},

View File

@ -0,0 +1,93 @@
import os
import pathlib
from unittest.mock import patch
import pytest
import requests
from unstructured.documents.elements import PageBreak
from unstructured.partition.md import partition_md
DIRECTORY = pathlib.Path(__file__).parent.resolve()
def test_partition_md_from_filename():
    """A markdown file on disk partitions into elements without page breaks."""
    md_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
    elements = partition_md(filename=md_path)
    assert elements
    assert PageBreak() not in elements
def test_partition_md_from_file():
    """A markdown file object partitions into at least one element."""
    md_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
    with open(md_path) as md_file:
        elements = partition_md(file=md_file)
    assert elements
def test_partition_md_from_text():
    """A raw markdown string partitions into at least one element."""
    md_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
    with open(md_path) as md_file:
        md_text = md_file.read()
    elements = partition_md(text=md_text)
    assert elements
class MockResponse:
    """Minimal stand-in for ``requests.Response`` used to stub out HTTP calls.

    Exposes the attributes ``partition_md`` reads: ``text``, ``status_code``,
    ``ok`` (True for status codes below 300), and ``headers``.
    """

    def __init__(self, text, status_code, headers=None):
        # NOTE: a mutable default argument (headers={}) would be shared across
        # all instances; use a None sentinel and build a fresh dict instead.
        self.text = text
        self.status_code = status_code
        self.ok = status_code < 300
        self.headers = {} if headers is None else headers
def test_partition_md_from_url():
    """Fetching markdown over HTTP (mocked) partitions successfully."""
    md_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
    with open(md_path) as md_file:
        md_text = md_file.read()
    mocked = MockResponse(
        text=md_text,
        status_code=200,
        headers={"Content-Type": "text/markdown"},
    )
    with patch.object(requests, "get", return_value=mocked):
        elements = partition_md(url="https://fake.url")
    assert elements
def test_partition_md_from_url_raises_with_bad_status_code():
    """A non-2xx HTTP status (mocked) raises ValueError."""
    md_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
    with open(md_path) as md_file:
        md_text = md_file.read()
    mocked = MockResponse(
        text=md_text,
        status_code=500,
        headers={"Content-Type": "text/html"},
    )
    with patch.object(requests, "get", return_value=mocked):
        with pytest.raises(ValueError):
            partition_md(url="https://fake.url")
def test_partition_md_from_url_raises_with_bad_content_type():
    """A non-markdown Content-Type (mocked) raises ValueError."""
    md_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
    with open(md_path) as md_file:
        md_text = md_file.read()
    mocked = MockResponse(
        text=md_text,
        status_code=200,
        headers={"Content-Type": "application/json"},
    )
    with patch.object(requests, "get", return_value=mocked):
        with pytest.raises(ValueError):
            partition_md(url="https://fake.url")
def test_partition_md_raises_with_none_specified():
    # Calling partition_md with no source argument at all is an error.
    with pytest.raises(ValueError):
        partition_md()
def test_partition_md_raises_with_too_many_specified():
    """Supplying more than one source argument is an error."""
    md_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
    with open(md_path) as md_file:
        md_text = md_file.read()
    with pytest.raises(ValueError):
        partition_md(filename=md_path, text=md_text)

View File

@ -0,0 +1,110 @@
[
{
"element_id": "6f348994832b2ad6127af4f7f1736f67",
"text": "Downloadify: Client Side File Creation",
"type": "Title",
"metadata": {}
},
{
"element_id": "074ac796e8f463c50a5d2ec4d047a5b7",
"text": "JavaScript + Flash Library",
"type": "Title",
"metadata": {}
},
{
"element_id": "8dc8800e5660b2558bb7f5f5416ca498",
"text": "Copyright (c) 2009 Douglas C. Neiner",
"type": "Title",
"metadata": {}
},
{
"element_id": "eb281d7b00a856779aaca7d1ec5197a7",
"text": "Permission is hereby granted, free of charge, to any person obtaining a copy",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "b41e880594419467436d152970f36710",
"text": "of this software and associated documentation files (the \"Software\"), to deal",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "a9578931575204db7971aa2e85137083",
"text": "in the Software without restriction, including without limitation the rights",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "7105a363bc50eba8e93f676dbb0bd145",
"text": "to use, copy, modify, merge, publish, distribute, sublicense, and/or sell",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "10e76e02d2ddc0fa91590e65249dbbb5",
"text": "copies of the Software, and to permit persons to whom the Software is",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "cb2b93515ca0dd50850fd3e1491bf06c",
"text": "furnished to do so, subject to the following conditions:",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "ace17038b2bfb49c3882a23be243c016",
"text": "The above copyright notice and this permission notice shall be included in",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "4d1f5dcef281e3f580a6c6156a298960",
"text": "all copies or substantial portions of the Software.",
"type": "Title",
"metadata": {}
},
{
"element_id": "58dab889725677ddc5a270a07df8395e",
"text": "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "b47e700b9d4e04e4670448bb39067ed2",
"text": "IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "ff5d35f4e0324c8499b81980c7da4b7c",
"text": "FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "6dc498cbd6e27db10da2431cfcc32e90",
"text": "AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "cb64ba82bcfdc75c8d68da657159e00d",
"text": "LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,",
"type": "NarrativeText",
"metadata": {}
},
{
"element_id": "ed18e41c2aa38a20e0c256fdc28b7243",
"text": "OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN",
"type": "UncategorizedText",
"metadata": {}
},
{
"element_id": "76d8377ccb0743b6c7de1f85b60f3955",
"text": "THE SOFTWARE.",
"type": "Title",
"metadata": {}
}
]

View File

@ -0,0 +1,50 @@
[
{
"element_id": "56a9f768a0968be676f9addd5ec3032e",
"text": "Downloadify Example",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "d551bbfc9477547e4dce6264d8196c7b",
"text": "More info available at the Github Project Page",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "971b974235a86ca628dcc713d6e2e8d9",
"text": "Filename",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "43f65b1c5bd47774b25c72e2f96de300",
"text": "File Contents\n\nWhatever you put in this text box will be downloaded and saved in the file. If you leave it blank, no file will be downloaded",
"type": "UncategorizedText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "53a4db70c6d40ed5206711ed8a255e03",
"text": "You must have Flash 10 installed to download this file.",
"type": "UncategorizedText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "839973fba0c850f1729fad098b031203",
"text": "Downloadify Invoke Script For This Page",
"type": "Title",
"metadata": {
"page_number": 1
}
}
]

View File

@ -0,0 +1,18 @@
#!/usr/bin/env bash

# Ingests the dcneiner/Downloadify GitHub repository and diffs the structured
# outputs against the fixtures checked in under
# test_unstructured_ingest/expected-structured-output/github-downloadify/.

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/.. || exit 1

PYTHONPATH=. ./unstructured/ingest/main.py --github-url dcneiner/Downloadify --github-file-glob '*.html,*.txt' --structured-output-dir github-downloadify-output --verbose

if ! diff -ru github-downloadify-output test_unstructured_ingest/expected-structured-output/github-downloadify ; then
    echo
    echo "There are differences from the previously checked-in structured outputs."
    echo
    # FIX: the original message was copy-pasted from the s3 test and pointed at
    # s3-small-batch paths; it now references this test's actual directories.
    echo "If these differences are acceptable, copy the outputs from"
    echo "github-downloadify-output/ to test_unstructured_ingest/expected-structured-output/github-downloadify/ after running"
    echo
    echo "  PYTHONPATH=. ./unstructured/ingest/main.py --github-url dcneiner/Downloadify --github-file-glob '*.html,*.txt' --structured-output-dir github-downloadify-output"
    echo
    exit 1
fi

View File

@ -6,3 +6,4 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/.. || exit 1
./test_unstructured_ingest/test-ingest-s3.sh
./test_unstructured_ingest/test-ingest-github.sh

View File

@ -1 +1 @@
__version__ = "0.4.16-dev4" # pragma: no cover
__version__ = "0.4.16-dev5" # pragma: no cover

View File

@ -38,6 +38,11 @@ PPT_MIME_TYPES = [
"application/vnd.ms-powerpoint",
]
MD_MIME_TYPES = [
"text/markdown",
"text/x-markdown",
]
# NOTE(robinson) - .docx.xlsx files are actually zip file with a .docx/.xslx extension.
# If the MIME type is application/octet-stream, we check if it's a .docx/.xlsx file by
# looking for expected filenames within the zip file.
@ -83,6 +88,7 @@ class FileType(Enum):
# Markup Types
HTML = 50
XML = 51
MD = 52
# Compressed Types
ZIP = 60
@ -102,6 +108,7 @@ EXT_TO_FILETYPE = {
".eml": FileType.EML,
".xml": FileType.XML,
".html": FileType.HTML,
".md": FileType.MD,
".xlsx": FileType.XLSX,
".pptx": FileType.PPTX,
".png": FileType.PNG,
@ -160,16 +167,18 @@ def detect_filetype(
elif mime_type == "image/png":
return FileType.PNG
elif mime_type in MD_MIME_TYPES:
# NOTE - I am not sure whether libmagic ever returns these mimetypes.
return FileType.MD
elif mime_type == "text/plain":
if extension and extension == ".eml":
return FileType.EML
if file and not extension:
if _check_eml_from_buffer(file=file) is True:
return FileType.EML
else:
return FileType.TXT
else:
return FileType.TXT
if extension and extension == ".md":
return FileType.MD
if file and not extension and _check_eml_from_buffer(file=file) is True:
return FileType.EML
return FileType.TXT
elif mime_type.endswith("xml"):
if extension and extension == ".html":

View File

@ -0,0 +1,201 @@
import fnmatch
import json
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING, Optional
from urllib.parse import urlparse
import requests
from unstructured.ingest.interfaces import (
BaseConnector,
BaseConnectorConfig,
BaseIngestDoc,
)
if TYPE_CHECKING:
from github.Repository import Repository
@dataclass
class SimpleGitHubConfig(BaseConnectorConfig):
    """Connector config identifying the target GitHub repository plus the
    standard ingest options shared by all connectors."""

    github_url: str
    github_access_token: Optional[str]
    github_branch: Optional[str]
    github_file_glob: Optional[str]

    # Standard Connector options
    download_dir: str
    # where to write structured data, with the directory structure matching the github repository
    output_dir: str
    preserve_downloads: bool = False
    re_download: bool = False
    verbose: bool = False

    # Derived from github_url in __post_init__ below.
    repo_owner: str = field(init=False, repr=False)
    repo_name: str = field(init=False, repr=False)

    def __post_init__(self):
        parsed = urlparse(self.github_url)
        fragments = [piece for piece in parsed.path.split("/") if piece]

        # Accept either a full https://github.com URL or a bare "owner/name"
        # pair; anything with the wrong scheme, wrong host, or a path that is
        # not exactly two segments deep is rejected.
        scheme_is_valid = not parsed.scheme or parsed.scheme == "https"
        host_is_valid = not parsed.netloc or parsed.netloc == "github.com"
        if not (scheme_is_valid and host_is_valid and len(fragments) == 2):
            raise ValueError(
                'Please provide a valid URL, e.g. "https://github.com/Unstructured-IO/unstructured"'
                ' or a repository owner/name pair, e.g. "Unstructured-IO/unstructured".',
            )

        # If there's no issues, store the core repository info
        self.repo_owner, self.repo_name = fragments
@dataclass
class GitHubIngestDoc(BaseIngestDoc):
    """One file (git blob) from a GitHub repository.

    Knows how to download the file to the local filesystem, write the
    structured JSON result, and clean up the local copy afterwards.
    """

    config: SimpleGitHubConfig = field(repr=False)
    repo: "Repository"
    # Path of the file within the repository tree, e.g. "docs/README.md".
    path: str

    @property
    def filename(self):
        """Absolute local path the file is (or will be) downloaded to,
        mirroring the repository's directory layout under download_dir."""
        return (Path(self.config.download_dir) / self.path).resolve()

    def _output_filename(self):
        # Structured output mirrors the repository tree, with ".json" appended.
        return Path(self.config.output_dir) / f"{self.path}.json"

    def _create_full_tmp_dir_path(self):
        """Includes directories in the github repository"""
        self.filename.parent.mkdir(parents=True, exist_ok=True)

    def cleanup_file(self):
        """Removes the local copy the file (or anything else) after successful processing."""
        if not self.config.preserve_downloads:
            if self.config.verbose:
                print(f"cleaning up {self}")
            os.unlink(self.filename)

    def get_file(self):
        """Fetches the "remote" doc and stores it locally on the filesystem."""
        self._create_full_tmp_dir_path()
        # Skip the download when a local copy already exists, unless re-download
        # was explicitly requested.
        if not self.config.re_download and self.filename.is_file() and self.filename.stat():
            if self.config.verbose:
                print(f"File exists: {self.filename}, skipping download")
            return
        if self.config.verbose:
            print(f"fetching {self} - PID: {os.getpid()}")
        content_file = self.repo.get_contents(self.path)
        contents = b""
        # Empty content with encoding "none" but a nonzero size indicates the
        # contents API would not return the blob (see the print below); fall
        # back to fetching the raw download URL directly.
        if (
            not content_file.content  # type: ignore
            and content_file.encoding == "none"  # type: ignore
            and content_file.size  # type: ignore
        ):
            print("File too large for the GitHub API, using direct download link instead.")
            response = requests.get(content_file.download_url)  # type: ignore
            if response.status_code != 200:
                # Best-effort: leave contents empty and write an empty file.
                print("Direct download link has failed... Skipping this file.")
            else:
                contents = response.content
        else:
            contents = content_file.decoded_content  # type: ignore

        with open(self.filename, "wb") as f:
            f.write(contents)

    def has_output(self):
        """Determine if structured output for this doc already exists."""
        output_filename = self._output_filename()
        return output_filename.is_file() and output_filename.stat()

    def write_result(self):
        """Write the structured json result for this doc. result must be json serializable."""
        output_filename = self._output_filename()
        output_filename.parent.mkdir(parents=True, exist_ok=True)
        with open(output_filename, "w", encoding="utf8") as output_f:
            json.dump(self.isd_elems_no_filename, output_f, ensure_ascii=False, indent=2)
        print(f"Wrote {output_filename}")
class GitHubConnector(BaseConnector):
    """Connector that enumerates the files of a GitHub repository and yields
    one GitHubIngestDoc per supported file."""

    def __init__(self, config: SimpleGitHubConfig):
        # Imported here rather than at module level so the PyGithub dependency
        # is only required when this connector is actually used.
        from github import Github

        self.config = config
        self.github = Github(self.config.github_access_token)
        self.cleanup_files = not config.preserve_downloads

    def cleanup(self, cur_dir=None):
        """Recursively remove empty download directories, bottom-up.

        NOTE(review): this walks via os.chdir() with relative paths, which
        mutates the process-wide working directory — confirm it is only ever
        called from a single process/thread.
        """
        if not self.cleanup_files:
            return

        if cur_dir is None:
            cur_dir = self.config.download_dir
        sub_dirs = os.listdir(cur_dir)
        os.chdir(cur_dir)
        for sub_dir in sub_dirs:
            # don't traverse symlinks, not that there ever should be any
            if os.path.isdir(sub_dir) and not os.path.islink(sub_dir):
                self.cleanup(sub_dir)
        os.chdir("..")
        # Only remove the directory itself once everything beneath it is gone.
        if len(os.listdir(cur_dir)) == 0:
            os.rmdir(cur_dir)

    def initialize(self):
        # No setup needed beyond the client created in __init__.
        pass

    def is_file_type_supported(self, path: str) -> bool:
        # Workaround to ensure that auto.partition isn't fed with .yaml, .py, etc. files
        # TODO: What to do with no filenames? e.g. LICENSE, Makefile, etc.
        supported = path.endswith(
            (
                ".md",
                ".txt",
                ".pdf",
                ".doc",
                ".docx",
                ".eml",
                ".html",
                ".png",
                ".jpg",
                ".ppt",
                ".pptx",
                ".xml",
            ),
        )
        if not supported and self.config.verbose:
            print(f"The file {path!r} is discarded as it does not contain a supported filetype.")
        return supported

    def does_path_match_glob(self, path: str) -> bool:
        """Return True when path matches at least one of the comma-separated
        globs in github_file_glob, or when no globs are configured at all."""
        if not self.config.github_file_glob:
            return True
        patterns = self.config.github_file_glob.split(",")
        for pattern in patterns:
            # NOTE(review): fnmatch.fnmatch(path, pattern) would express this
            # single-path match more directly than filtering a one-item list.
            if fnmatch.filter([path], pattern):
                return True
        if self.config.verbose:
            print(f"The file {path!r} is discarded as it does not match any given glob.")
        return False

    def get_ingest_docs(self):
        repo = self.github.get_repo(f"{self.config.repo_owner}/{self.config.repo_name}")

        # Load the Git tree with all files, and then create Ingest docs
        # for all blobs, i.e. all files, ignoring directories
        sha = self.config.github_branch or repo.default_branch
        git_tree = repo.get_git_tree(sha, recursive=True)
        return [
            GitHubIngestDoc(self.config, repo, element.path)
            for element in git_tree.tree
            if element.type == "blob"
            and self.is_file_type_supported(element.path)
            and (not self.config.github_file_glob or self.does_path_match_glob(element.path))
        ]

View File

@ -24,8 +24,6 @@ class SimpleS3Config(BaseConnectorConfig):
output_dir: str
re_download: bool = False
preserve_downloads: bool = False
# if a structured output .json file already exists, do not reprocess an s3 file to overwrite it
reprocess: bool = False
verbose: bool = False
# S3 Specific (optional)

View File

@ -46,8 +46,6 @@ class BaseConnectorConfig(ABC):
# where to write structured data outputs
output_dir: str
re_download: bool = False
# if a structured output .json file already exists for a given doc, do not reprocess
reprocess: bool = False
verbose: bool = False
@ -96,7 +94,7 @@ class BaseIngestDoc(ABC):
self.isd_elems_no_filename = []
for elem in isd_elems:
# type: ignore
elem["metadata"].pop("filename") # type: ignore[attr-defined]
elem["metadata"].pop("filename", None) # type: ignore[attr-defined]
elem.pop("coordinates") # type: ignore[attr-defined]
self.isd_elems_no_filename.append(elem)

View File

@ -6,6 +6,7 @@ import sys
import click
from unstructured.ingest.connector.github import GitHubConnector, SimpleGitHubConfig
from unstructured.ingest.connector.reddit import RedditConnector, SimpleRedditConfig
from unstructured.ingest.connector.s3_connector import S3Connector, SimpleS3Config
from unstructured.ingest.doc_processor.generalized import initialize, process_document
@ -79,6 +80,29 @@ class MainProcess:
default=False,
help="Connect to s3 without local AWS credentials.",
)
@click.option(
"--github-url",
default=None,
help='URL to GitHub repository, e.g. "https://github.com/Unstructured-IO/unstructured",'
' or a repository owner/name pair, e.g. "Unstructured-IO/unstructured"',
)
@click.option(
"--github-access-token",
default=None,
help="A GitHub access token, see https://docs.github.com/en/authentication",
)
@click.option(
"--github-branch",
default=None,
help="The branch for which to fetch files from. If not given,"
" the default repository branch is used.",
)
@click.option(
"--github-file-glob",
default=None,
help="A comma-separated list of file globs to limit which types of files are accepted,"
" e.g. '*.html,*.txt'",
)
@click.option(
"--subreddit-name",
default=None,
@ -148,6 +172,10 @@ class MainProcess:
@click.option("-v", "--verbose", is_flag=True, default=False)
def main(
s3_url,
github_url,
github_access_token,
github_branch,
github_file_glob,
subreddit_name,
reddit_client_id,
reddit_client_secret,
@ -182,6 +210,21 @@ def main(
verbose=verbose,
),
)
elif github_url:
doc_connector = GitHubConnector( # type: ignore
config=SimpleGitHubConfig(
github_url=github_url,
github_access_token=github_access_token,
github_branch=github_branch,
github_file_glob=github_file_glob,
# defaults params:
download_dir=download_dir,
preserve_downloads=preserve_downloads,
output_dir=structured_output_dir,
re_download=re_download,
verbose=verbose,
),
)
elif subreddit_name:
doc_connector = RedditConnector( # type: ignore
config=SimpleRedditConfig(

View File

@ -6,6 +6,7 @@ from unstructured.partition.docx import partition_docx
from unstructured.partition.email import partition_email
from unstructured.partition.html import partition_html
from unstructured.partition.image import partition_image
from unstructured.partition.md import partition_md
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.ppt import partition_ppt
from unstructured.partition.pptx import partition_pptx
@ -44,6 +45,8 @@ def partition(
return partition_email(filename=filename, file=file)
elif filetype == FileType.HTML:
return partition_html(filename=filename, file=file, include_page_breaks=include_page_breaks)
elif filetype == FileType.MD:
return partition_md(filename=filename, file=file, include_page_breaks=include_page_breaks)
elif filetype == FileType.PDF:
return partition_pdf(
filename=filename, # type: ignore

View File

@ -0,0 +1,60 @@
from typing import IO, List, Optional, Union
import markdown
import requests
from unstructured.documents.elements import Element
from unstructured.documents.xml import VALID_PARSERS
from unstructured.partition.html import partition_html
def optional_decode(contents: Union[str, bytes]) -> str:
    """Return ``contents`` as ``str``, decoding UTF-8 bytes when necessary."""
    return contents.decode("utf-8") if isinstance(contents, bytes) else contents
def partition_md(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    text: Optional[str] = None,
    url: Optional[str] = None,
    include_page_breaks: bool = False,
    include_metadata: bool = True,
    parser: VALID_PARSERS = None,
) -> List[Element]:
    """Partition a markdown document into its document elements.

    Exactly one of ``filename``, ``file``, ``text``, or ``url`` must be given.
    The markdown is rendered to HTML and delegated to ``partition_html``.

    Parameters
    ----------
    filename: path to a markdown file on disk
    file: a file-like object opened for reading
    text: the markdown document as a string
    url: a URL serving markdown (``text/markdown`` or ``text/x-markdown``)
    include_page_breaks: forwarded to ``partition_html``
    include_metadata: forwarded to ``partition_html``
    parser: XML parser selection, forwarded to ``partition_html``

    Raises
    ------
    ValueError
        If zero or more than one source argument is supplied, or if the URL
        request fails or returns a non-markdown content type.
    """
    # NOTE: truthiness check — empty strings are treated as "not provided".
    if not any([filename, file, text, url]):
        # FIX: message previously omitted the url option.
        raise ValueError("One of filename, file, text, or url must be specified.")

    if filename is not None and not file and not text and not url:
        with open(filename, encoding="utf8") as f:
            text = optional_decode(f.read())
    elif file is not None and not filename and not text and not url:
        text = optional_decode(file.read())
    elif text is not None and not filename and not file and not url:
        pass
    elif url is not None and not filename and not file and not text:
        response = requests.get(url)
        if not response.ok:
            # FIX: message previously read "URL return an error".
            raise ValueError(f"URL returned an error: {response.status_code}")

        content_type = response.headers.get("Content-Type", "")
        # Accept both registered markdown content types, consistent with the
        # MD_MIME_TYPES list used by filetype detection.
        if not content_type.startswith(("text/markdown", "text/x-markdown")):
            raise ValueError(f"Expected content type text/markdown. Got {content_type}.")

        text = response.text
    else:
        # FIX: message previously omitted the url option.
        raise ValueError("Only one of filename, file, text, or url can be specified.")

    html = markdown.markdown(text)
    return partition_html(
        text=html,
        include_page_breaks=include_page_breaks,
        include_metadata=include_metadata,
        parser=parser,
    )