Mirror of https://github.com/Unstructured-IO/unstructured.git, synced 2025-12-24 13:44:05 +00:00
feat: Add Wikipedia ingest connector (#299)
The connector can process a Wikipedia page and output the HTML, the plain text contents, and the summary. No API key is required. Also adds a test case verifying that 3 files are indeed created (one for the HTML, one for the text, one for the summary).
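For reference, the connector leans on the pypi `wikipedia` package (pinned below as `wikipedia==1.4.0`); a minimal sketch of the three payloads it pulls per page (illustrative, not part of this diff):

import wikipedia  # pypi "wikipedia" package; no API key required

page = wikipedia.page("Open Source Software")
html = page.html()      # raw page HTML    -> <title>-<revision_id>.html
text = page.content     # plain-text body  -> <title>-<revision_id>.txt
summary = page.summary  # lead summary     -> <title>-<revision_id>-summary.txt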
This commit is contained in:
parent a74d389fa7
commit 54a6db1c2c
.github/workflows/ci.yml | 1
@@ -109,6 +109,7 @@ jobs:
 make check-coverage
 make install-ingest-s3
 make install-ingest-github
+make install-ingest-wikipedia
 ./test_unstructured_ingest/test-ingest.sh

 changelog:
CHANGELOG.md
@@ -1,9 +1,11 @@
-## 0.4.17-dev0
+## 0.4.17-dev1

 ### Enhancements

 ### Features

+* Added Wikipedia connector for ingest cli.
+
 ### Fixes

 * Fix `process_document` file cleaning on failure
Makefile | 11
@@ -62,6 +62,10 @@ install-ingest-github:
 install-ingest-reddit:
 	pip install -r requirements/ingest-reddit.txt

+.PHONY: install-ingest-wikipedia
+install-ingest-wikipedia:
+	pip install -r requirements/ingest-wikipedia.txt
+
 .PHONY: install-unstructured-inference
 install-unstructured-inference:
 	pip install -r requirements/local-inference.txt
@@ -90,9 +94,10 @@ pip-compile:
 # NOTE(robinson) - doc/requirements.txt is where the GitHub action for building
 # sphinx docs looks for additional requirements
 	cp requirements/build.txt docs/requirements.txt
 	pip-compile --upgrade --extra=s3 --output-file=requirements/ingest-s3.txt requirements/base.txt setup.py
 	pip-compile --upgrade --extra=reddit --output-file=requirements/ingest-reddit.txt requirements/base.txt setup.py
 	pip-compile --upgrade --extra=github --output-file=requirements/ingest-github.txt requirements/base.txt setup.py
+	pip-compile --upgrade --extra=wikipedia --output-file=requirements/ingest-wikipedia.txt requirements/base.txt setup.py

 ## install-project-local: install unstructured into your local python environment
 .PHONY: install-project-local
examples/ingest/wikipedia/ingest.sh | 18 (new executable file)
@@ -0,0 +1,18 @@
#!/usr/bin/env bash

# Processes the "Open Source Software" Wikipedia page
# through Unstructured's library in 2 processes.

# Structured outputs are stored in wikipedia-ingest-output/

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/../../.. || exit 1

PYTHONPATH=. ./unstructured/ingest/main.py \
    --wikipedia-page-title "Open Source Software" \
    --structured-output-dir wikipedia-ingest-output \
    --num-processes 2 \
    --verbose

# Alternatively, you can call it using:
# unstructured-ingest --wikipedia-page-title "..." ...
requirements/ingest-wikipedia.txt | 179 (new file)
@@ -0,0 +1,179 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
#    pip-compile --extra=wikipedia --output-file=requirements/ingest-wikipedia.txt requirements/base.txt setup.py
#
anyio==3.6.2
    # via
    #   -r requirements/base.txt
    #   httpcore
argilla==1.3.0
    # via
    #   -r requirements/base.txt
    #   unstructured (setup.py)
backoff==2.2.1
    # via
    #   -r requirements/base.txt
    #   argilla
beautifulsoup4==4.11.2
    # via wikipedia
certifi==2022.12.7
    # via
    #   -r requirements/base.txt
    #   httpcore
    #   httpx
    #   requests
    #   unstructured (setup.py)
charset-normalizer==3.0.1
    # via
    #   -r requirements/base.txt
    #   requests
click==8.1.3
    # via
    #   -r requirements/base.txt
    #   nltk
colorama==0.4.6
    # via
    #   click
    #   tqdm
deprecated==1.2.13
    # via
    #   -r requirements/base.txt
    #   argilla
et-xmlfile==1.1.0
    # via
    #   -r requirements/base.txt
    #   openpyxl
h11==0.14.0
    # via
    #   -r requirements/base.txt
    #   httpcore
httpcore==0.16.3
    # via
    #   -r requirements/base.txt
    #   httpx
httpx==0.23.3
    # via
    #   -r requirements/base.txt
    #   argilla
idna==3.4
    # via
    #   -r requirements/base.txt
    #   anyio
    #   requests
    #   rfc3986
joblib==1.2.0
    # via
    #   -r requirements/base.txt
    #   nltk
lxml==4.9.2
    # via
    #   -r requirements/base.txt
    #   python-docx
    #   python-pptx
    #   unstructured (setup.py)
monotonic==1.6
    # via
    #   -r requirements/base.txt
    #   argilla
nltk==3.8.1
    # via
    #   -r requirements/base.txt
    #   unstructured (setup.py)
numpy==1.23.5
    # via
    #   -r requirements/base.txt
    #   argilla
    #   pandas
openpyxl==3.1.1
    # via
    #   -r requirements/base.txt
    #   unstructured (setup.py)
packaging==23.0
    # via
    #   -r requirements/base.txt
    #   argilla
pandas==1.5.3
    # via
    #   -r requirements/base.txt
    #   argilla
    #   unstructured (setup.py)
pillow==9.4.0
    # via
    #   -r requirements/base.txt
    #   python-pptx
    #   unstructured (setup.py)
pydantic==1.10.4
    # via
    #   -r requirements/base.txt
    #   argilla
python-dateutil==2.8.2
    # via
    #   -r requirements/base.txt
    #   pandas
python-docx==0.8.11
    # via
    #   -r requirements/base.txt
    #   unstructured (setup.py)
python-magic==0.4.27
    # via
    #   -r requirements/base.txt
    #   unstructured (setup.py)
python-pptx==0.6.21
    # via
    #   -r requirements/base.txt
    #   unstructured (setup.py)
pytz==2022.7.1
    # via
    #   -r requirements/base.txt
    #   pandas
regex==2022.10.31
    # via
    #   -r requirements/base.txt
    #   nltk
requests==2.28.2
    # via
    #   -r requirements/base.txt
    #   unstructured (setup.py)
    #   wikipedia
rfc3986[idna2008]==1.5.0
    # via
    #   -r requirements/base.txt
    #   httpx
six==1.16.0
    # via
    #   -r requirements/base.txt
    #   python-dateutil
sniffio==1.3.0
    # via
    #   -r requirements/base.txt
    #   anyio
    #   httpcore
    #   httpx
soupsieve==2.4
    # via beautifulsoup4
tqdm==4.64.1
    # via
    #   -r requirements/base.txt
    #   argilla
    #   nltk
typing-extensions==4.4.0
    # via
    #   -r requirements/base.txt
    #   pydantic
urllib3==1.26.14
    # via
    #   -r requirements/base.txt
    #   requests
wikipedia==1.4.0
    # via unstructured (setup.py)
wrapt==1.14.1
    # via
    #   -r requirements/base.txt
    #   argilla
    #   deprecated
xlsxwriter==3.0.8
    # via
    #   -r requirements/base.txt
    #   python-pptx
setup.py | 1
@@ -84,6 +84,7 @@ setup(
             "pygithub==1.57.0",
         ],
         "reddit": ["praw"],
+        "wikipedia": ["wikipedia"],
     },
     package_dir={"unstructured": "unstructured"},
     package_data={"unstructured": ["nlp/*.txt"]},
test_unstructured_ingest/test-ingest-wikipedia.sh | 16 (new executable file)
@@ -0,0 +1,16 @@
#!/usr/bin/env bash

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/.. || exit 1

PYTHONPATH=. ./unstructured/ingest/main.py \
    --wikipedia-page-title "Open Source Software" \
    --structured-output-dir wikipedia-ingest-output \
    --num-processes 2 \
    --verbose

if [ "$(find 'wikipedia-ingest-output' -type f -printf '.' | wc -c)" != 3 ]; then
   echo
   echo "3 files should have been created."
   exit 1
fi
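The shell test above counts output files with `find … | wc -c`; an equivalent check in Python (a sketch, not part of this diff, assuming it runs from the repo root after ingest completes):

from pathlib import Path

# One structured JSON per ingest doc: -html.json, -txt.json, -summary.json
outputs = [p for p in Path("wikipedia-ingest-output").rglob("*") if p.is_file()]
assert len(outputs) == 3, f"3 files should have been created, found {len(outputs)}"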
test_unstructured_ingest/test-ingest.sh
@@ -7,3 +7,4 @@ cd "$SCRIPT_DIR"/.. || exit 1

 ./test_unstructured_ingest/test-ingest-s3.sh
 ./test_unstructured_ingest/test-ingest-github.sh
+./test_unstructured_ingest/test-ingest-wikipedia.sh
unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.4.17-dev0"  # pragma: no cover
+__version__ = "0.4.17-dev1"  # pragma: no cover
unstructured/documents/xml.py
@@ -84,6 +84,6 @@ class XMLDocument(Document):

     @classmethod
     def from_file(cls, filename, parser: VALID_PARSERS = None, stylesheet: Optional[str] = None):
-        with open(filename, "r+") as f:
+        with open(filename, "r+", encoding="utf8") as f:
             content = f.read()
         return cls.from_string(content, parser=parser, stylesheet=stylesheet)
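Context for the encoding="utf8" change: without an explicit encoding, open() falls back to the platform's locale encoding, so UTF-8 documents can raise UnicodeDecodeError on non-UTF-8 locales (e.g. Windows cp1252). A minimal sketch of the failure mode this guards against (illustrative, not part of this diff):

from pathlib import Path

Path("page.html").write_text("naïve café", encoding="utf8")

# open("page.html").read() can fail or produce mojibake when the
# locale default is not UTF-8; the explicit argument is portable:
with open("page.html", encoding="utf8") as f:
    content = f.read()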
unstructured/ingest/connector/wikipedia.py | 163 (new file)
@@ -0,0 +1,163 @@
import json
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING

from unstructured.ingest.interfaces import (
    BaseConnector,
    BaseConnectorConfig,
    BaseIngestDoc,
)

if TYPE_CHECKING:
    from wikipedia import WikipediaPage


@dataclass
class SimpleWikipediaConfig(BaseConnectorConfig):
    title: str

    # Standard Connector options
    download_dir: str
    # where to write structured data
    output_dir: str
    preserve_downloads: bool = False
    re_download: bool = False
    verbose: bool = False


@dataclass
class WikipediaIngestDoc(BaseIngestDoc):
    config: SimpleWikipediaConfig = field(repr=False)
    page: "WikipediaPage"

    @property
    def filename(self) -> Path:
        raise NotImplementedError()

    @property
    def text(self) -> str:
        raise NotImplementedError()

    def _output_filename(self):
        raise NotImplementedError()

    def _create_full_tmp_dir_path(self):
        self.filename.parent.mkdir(parents=True, exist_ok=True)

    def cleanup_file(self):
        """Removes the local copy of the file (or anything else) after successful processing."""
        if not self.config.preserve_downloads:
            if self.config.verbose:
                print(f"cleaning up {self}")
            os.unlink(self.filename)

    def get_file(self):
        """Fetches the "remote" doc and stores it locally on the filesystem."""
        self._create_full_tmp_dir_path()
        if not self.config.re_download and self.filename.is_file() and self.filename.stat():
            if self.config.verbose:
                print(f"File exists: {self.filename}, skipping download")
            return

        if self.config.verbose:
            print(f"fetching {self} - PID: {os.getpid()}")
        with open(self.filename, "w", encoding="utf8") as f:
            f.write(self.text)

    def has_output(self):
        """Determine if structured output for this doc already exists."""
        output_filename = self._output_filename()
        return output_filename.is_file() and output_filename.stat()

    def write_result(self):
        """Write the structured json result for this doc. result must be json serializable."""
        output_filename = self._output_filename()
        output_filename.parent.mkdir(parents=True, exist_ok=True)
        with open(output_filename, "w", encoding="utf8") as output_f:
            json.dump(self.isd_elems_no_filename, output_f, ensure_ascii=False, indent=2)
        print(f"Wrote {output_filename}")


class WikipediaIngestHTMLDoc(WikipediaIngestDoc):
    @property
    def filename(self) -> Path:
        return (
            Path(self.config.download_dir) / f"{self.page.title}-{self.page.revision_id}.html"
        ).resolve()

    @property
    def text(self):
        return self.page.html()

    def _output_filename(self):
        return Path(self.config.output_dir) / f"{self.page.title}-{self.page.revision_id}-html.json"


class WikipediaIngestTextDoc(WikipediaIngestDoc):
    @property
    def filename(self) -> Path:
        return (
            Path(self.config.download_dir) / f"{self.page.title}-{self.page.revision_id}.txt"
        ).resolve()

    @property
    def text(self):
        return self.page.content

    def _output_filename(self):
        return Path(self.config.output_dir) / f"{self.page.title}-{self.page.revision_id}-txt.json"


class WikipediaIngestSummaryDoc(WikipediaIngestDoc):
    @property
    def filename(self) -> Path:
        return (
            Path(self.config.download_dir)
            / f"{self.page.title}-{self.page.revision_id}-summary.txt"
        ).resolve()

    @property
    def text(self):
        return self.page.summary

    def _output_filename(self):
        return (
            Path(self.config.output_dir) / f"{self.page.title}-{self.page.revision_id}-summary.json"
        )


class WikipediaConnector(BaseConnector):
    def __init__(self, config: SimpleWikipediaConfig):
        self.config = config
        self.cleanup_files = not config.preserve_downloads

    def cleanup(self, cur_dir=None):
        if not self.cleanup_files:
            return

        if cur_dir is None:
            cur_dir = self.config.download_dir
        sub_dirs = os.listdir(cur_dir)
        os.chdir(cur_dir)
        for sub_dir in sub_dirs:
            # don't traverse symlinks, not that there ever should be any
            if os.path.isdir(sub_dir) and not os.path.islink(sub_dir):
                self.cleanup(sub_dir)
        os.chdir("..")
        if len(os.listdir(cur_dir)) == 0:
            os.rmdir(cur_dir)

    def initialize(self):
        pass

    def get_ingest_docs(self):
        import wikipedia

        page = wikipedia.page(self.config.title)
        return [
            WikipediaIngestTextDoc(self.config, page),
            WikipediaIngestHTMLDoc(self.config, page),
            WikipediaIngestSummaryDoc(self.config, page),
        ]
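For orientation, a sketch of how the pieces above fit together outside the CLI (hypothetical driver code, not part of this diff; tmp-downloads/ and wikipedia-output/ are illustrative paths, and partitioning plus write_result() normally happen in the doc-processor pipeline):

from unstructured.ingest.connector.wikipedia import (
    SimpleWikipediaConfig,
    WikipediaConnector,
)

connector = WikipediaConnector(
    config=SimpleWikipediaConfig(
        title="Open Source Software",
        download_dir="tmp-downloads",   # illustrative
        output_dir="wikipedia-output",  # illustrative
        preserve_downloads=False,
        re_download=False,
        verbose=True,
    ),
)
connector.initialize()
for doc in connector.get_ingest_docs():  # text, HTML, and summary docs
    doc.get_file()       # writes the page payload under download_dir
    # ...partitioning populates doc.isd_elems_no_filename, then doc.write_result()...
    doc.cleanup_file()   # no-op when preserve_downloads=True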
unstructured/ingest/main.py
@@ -9,6 +9,10 @@ import click
 from unstructured.ingest.connector.github import GitHubConnector, SimpleGitHubConfig
 from unstructured.ingest.connector.reddit import RedditConnector, SimpleRedditConfig
 from unstructured.ingest.connector.s3_connector import S3Connector, SimpleS3Config
+from unstructured.ingest.connector.wikipedia import (
+    SimpleWikipediaConfig,
+    WikipediaConnector,
+)
 from unstructured.ingest.doc_processor.generalized import initialize, process_document

@@ -80,6 +84,11 @@ class MainProcess:
     default=False,
     help="Connect to s3 without local AWS credentials.",
 )
+@click.option(
+    "--wikipedia-page-title",
+    default=None,
+    help='Title of a Wikipedia page, e.g. "Open source software".',
+)
 @click.option(
     "--github-url",
     default=None,
@@ -172,6 +181,7 @@ class MainProcess:
 @click.option("-v", "--verbose", is_flag=True, default=False)
 def main(
     s3_url,
+    wikipedia_page_title,
     github_url,
     github_access_token,
     github_branch,
@@ -242,6 +252,18 @@ def main(
                 verbose=verbose,
             ),
         )
+    elif wikipedia_page_title:
+        doc_connector = WikipediaConnector(  # type: ignore
+            config=SimpleWikipediaConfig(
+                title=wikipedia_page_title,
+                # default params:
+                download_dir=download_dir,
+                preserve_downloads=preserve_downloads,
+                output_dir=structured_output_dir,
+                re_download=re_download,
+                verbose=verbose,
+            ),
+        )
     # Check for other connector-specific options here and define the doc_connector object
     # e.g. "elif azure_container: ..."