feat: Add Wikipedia ingest connector (#299)

The connector can process a Wikipedia page and output the HTML, the
plain-text contents, and the summary. No API key is required.

Also adds a test case verifying that three files are indeed created
(one for HTML, one for text, one for the summary).
Tom Aarsen 2023-02-28 09:25:11 +01:00 committed by GitHub
parent a74d389fa7
commit 54a6db1c2c
12 changed files with 414 additions and 6 deletions
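For context, the connector builds on the `wikipedia` PyPI package (pinned below as wikipedia==1.4.0). A minimal sketch of the three payloads it downloads per page, using only the `WikipediaPage` attributes that appear in the connector diff:

```python
# Minimal sketch of what the connector fetches; attribute names match
# the WikipediaPage usage in the connector code further down.
import wikipedia

page = wikipedia.page("Open Source Software")
print(page.revision_id)   # revision id is baked into the output filenames
html = page.html()        # saved as <title>-<revision_id>.html
text = page.content      # saved as <title>-<revision_id>.txt
summary = page.summary   # saved as <title>-<revision_id>-summary.txt
```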

@@ -109,6 +109,7 @@ jobs:
          make check-coverage
          make install-ingest-s3
          make install-ingest-github
          make install-ingest-wikipedia
          ./test_unstructured_ingest/test-ingest.sh
  changelog:

@@ -1,9 +1,11 @@
## 0.4.17-dev0
## 0.4.17-dev1
### Enhancements
### Features
* Added Wikipedia connector for ingest cli.
### Fixes
* Fix `process_document` file cleaning on failure

@@ -62,6 +62,10 @@ install-ingest-github:
install-ingest-reddit:
	pip install -r requirements/ingest-reddit.txt

.PHONY: install-ingest-wikipedia
install-ingest-wikipedia:
	pip install -r requirements/ingest-wikipedia.txt

.PHONY: install-unstructured-inference
install-unstructured-inference:
	pip install -r requirements/local-inference.txt
@@ -90,9 +94,10 @@ pip-compile:
	# NOTE(robinson) - doc/requirements.txt is where the GitHub action for building
	# sphinx docs looks for additional requirements
	cp requirements/build.txt docs/requirements.txt
	pip-compile --upgrade --extra=s3 --output-file=requirements/ingest-s3.txt requirements/base.txt setup.py
	pip-compile --upgrade --extra=reddit --output-file=requirements/ingest-reddit.txt requirements/base.txt setup.py
	pip-compile --upgrade --extra=github --output-file=requirements/ingest-github.txt requirements/base.txt setup.py
	pip-compile --upgrade --extra=wikipedia --output-file=requirements/ingest-wikipedia.txt requirements/base.txt setup.py

## install-project-local: install unstructured into your local python environment
.PHONY: install-project-local

@@ -0,0 +1,18 @@
#!/usr/bin/env bash

# Processes the "Open Source Software" Wikipedia page
# through Unstructured's library in 2 processes.
# Structured outputs are stored in wikipedia-ingest-output/

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/../../.. || exit 1

PYTHONPATH=. ./unstructured/ingest/main.py \
    --wikipedia-page-title "Open Source Software" \
    --structured-output-dir wikipedia-ingest-output \
    --num-processes 2 \
    --verbose

# Alternatively, you can call it using:
# unstructured-ingest --wikipedia-page-title "..." ...
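For readers who prefer to skip the CLI, here is a hedged sketch of the programmatic equivalent, wiring the classes this commit adds. It assumes `BaseConnectorConfig` adds no further required fields; the download path is hypothetical.

```python
# Sketch only: drives the new connector classes directly instead of the CLI.
from unstructured.ingest.connector.wikipedia import (
    SimpleWikipediaConfig,
    WikipediaConnector,
)

connector = WikipediaConnector(
    config=SimpleWikipediaConfig(
        title="Open Source Software",
        download_dir="/tmp/wikipedia-downloads",  # hypothetical location
        output_dir="wikipedia-ingest-output",
        preserve_downloads=True,
        re_download=False,
        verbose=True,
    ),
)
connector.initialize()
for doc in connector.get_ingest_docs():
    doc.get_file()  # writes the HTML, text, and summary files locally
```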

@@ -0,0 +1,179 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
#    pip-compile --extra=wikipedia --output-file=requirements/ingest-wikipedia.txt requirements/base.txt setup.py
#
anyio==3.6.2
    # via
    #   -r requirements/base.txt
    #   httpcore
argilla==1.3.0
    # via
    #   -r requirements/base.txt
    #   unstructured (setup.py)
backoff==2.2.1
    # via
    #   -r requirements/base.txt
    #   argilla
beautifulsoup4==4.11.2
    # via wikipedia
certifi==2022.12.7
    # via
    #   -r requirements/base.txt
    #   httpcore
    #   httpx
    #   requests
    #   unstructured (setup.py)
charset-normalizer==3.0.1
    # via
    #   -r requirements/base.txt
    #   requests
click==8.1.3
    # via
    #   -r requirements/base.txt
    #   nltk
colorama==0.4.6
    # via
    #   click
    #   tqdm
deprecated==1.2.13
    # via
    #   -r requirements/base.txt
    #   argilla
et-xmlfile==1.1.0
    # via
    #   -r requirements/base.txt
    #   openpyxl
h11==0.14.0
    # via
    #   -r requirements/base.txt
    #   httpcore
httpcore==0.16.3
    # via
    #   -r requirements/base.txt
    #   httpx
httpx==0.23.3
    # via
    #   -r requirements/base.txt
    #   argilla
idna==3.4
    # via
    #   -r requirements/base.txt
    #   anyio
    #   requests
    #   rfc3986
joblib==1.2.0
    # via
    #   -r requirements/base.txt
    #   nltk
lxml==4.9.2
    # via
    #   -r requirements/base.txt
    #   python-docx
    #   python-pptx
    #   unstructured (setup.py)
monotonic==1.6
    # via
    #   -r requirements/base.txt
    #   argilla
nltk==3.8.1
    # via
    #   -r requirements/base.txt
    #   unstructured (setup.py)
numpy==1.23.5
    # via
    #   -r requirements/base.txt
    #   argilla
    #   pandas
openpyxl==3.1.1
    # via
    #   -r requirements/base.txt
    #   unstructured (setup.py)
packaging==23.0
    # via
    #   -r requirements/base.txt
    #   argilla
pandas==1.5.3
    # via
    #   -r requirements/base.txt
    #   argilla
    #   unstructured (setup.py)
pillow==9.4.0
    # via
    #   -r requirements/base.txt
    #   python-pptx
    #   unstructured (setup.py)
pydantic==1.10.4
    # via
    #   -r requirements/base.txt
    #   argilla
python-dateutil==2.8.2
    # via
    #   -r requirements/base.txt
    #   pandas
python-docx==0.8.11
    # via
    #   -r requirements/base.txt
    #   unstructured (setup.py)
python-magic==0.4.27
    # via
    #   -r requirements/base.txt
    #   unstructured (setup.py)
python-pptx==0.6.21
    # via
    #   -r requirements/base.txt
    #   unstructured (setup.py)
pytz==2022.7.1
    # via
    #   -r requirements/base.txt
    #   pandas
regex==2022.10.31
    # via
    #   -r requirements/base.txt
    #   nltk
requests==2.28.2
    # via
    #   -r requirements/base.txt
    #   unstructured (setup.py)
    #   wikipedia
rfc3986[idna2008]==1.5.0
    # via
    #   -r requirements/base.txt
    #   httpx
six==1.16.0
    # via
    #   -r requirements/base.txt
    #   python-dateutil
sniffio==1.3.0
    # via
    #   -r requirements/base.txt
    #   anyio
    #   httpcore
    #   httpx
soupsieve==2.4
    # via beautifulsoup4
tqdm==4.64.1
    # via
    #   -r requirements/base.txt
    #   argilla
    #   nltk
typing-extensions==4.4.0
    # via
    #   -r requirements/base.txt
    #   pydantic
urllib3==1.26.14
    # via
    #   -r requirements/base.txt
    #   requests
wikipedia==1.4.0
    # via unstructured (setup.py)
wrapt==1.14.1
    # via
    #   -r requirements/base.txt
    #   argilla
    #   deprecated
xlsxwriter==3.0.8
    # via
    #   -r requirements/base.txt
    #   python-pptx

@@ -84,6 +84,7 @@ setup(
            "pygithub==1.57.0",
        ],
        "reddit": ["praw"],
        "wikipedia": ["wikipedia"],
    },
    package_dir={"unstructured": "unstructured"},
    package_data={"unstructured": ["nlp/*.txt"]},
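With the extra declared, the optional dependency installs via `pip install "unstructured[wikipedia]"`. A small sketch (not part of this commit) for sanity-checking the optional import before running the connector:

```python
# Checks that the optional `wikipedia` dependency from the extra is importable.
import importlib.util

if importlib.util.find_spec("wikipedia") is None:
    raise SystemExit('wikipedia extra missing; run: pip install "unstructured[wikipedia]"')
print("wikipedia extra is installed")
```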

@@ -0,0 +1,16 @@
#!/usr/bin/env bash

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/.. || exit 1

PYTHONPATH=. ./unstructured/ingest/main.py \
    --wikipedia-page-title "Open Source Software" \
    --structured-output-dir wikipedia-ingest-output \
    --num-processes 2 \
    --verbose

if [ "$(find 'wikipedia-ingest-output' -type f -printf '.' | wc -c)" != 3 ]; then
    echo
    echo "3 files should have been created."
    exit 1
fi
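The count of 3 corresponds to one structured JSON per ingest doc. A Python rendering of the same check, with the filename suffixes taken from the `_output_filename()` implementations in the connector diff below:

```python
# Equivalent assertion in Python: the connector emits one JSON per doc,
# named <title>-<revision_id>-{html,txt,summary}.json.
from pathlib import Path

output_dir = Path("wikipedia-ingest-output")
json_files = sorted(output_dir.glob("*.json"))
assert len(json_files) == 3, f"expected 3 structured outputs, got {len(json_files)}"
```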

@@ -7,3 +7,4 @@ cd "$SCRIPT_DIR"/.. || exit 1
./test_unstructured_ingest/test-ingest-s3.sh
./test_unstructured_ingest/test-ingest-github.sh
./test_unstructured_ingest/test-ingest-wikipedia.sh

@@ -1 +1 @@
__version__ = "0.4.17-dev0" # pragma: no cover
__version__ = "0.4.17-dev1" # pragma: no cover

@@ -84,6 +84,6 @@ class XMLDocument(Document):
    @classmethod
    def from_file(cls, filename, parser: VALID_PARSERS = None, stylesheet: Optional[str] = None):
        with open(filename, "r+") as f:
        with open(filename, "r+", encoding="utf8") as f:
            content = f.read()
        return cls.from_string(content, parser=parser, stylesheet=stylesheet)
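The `encoding="utf8"` fix matters because `open()` otherwise falls back to the platform's locale encoding, which can break on non-ASCII page content. A quick illustration (the filename here is hypothetical):

```python
# Without an explicit encoding, open() uses locale.getpreferredencoding(False),
# e.g. cp1252 on many Windows machines, which can garble Wikipedia HTML.
import locale

print(locale.getpreferredencoding(False))  # platform-dependent default
with open("page.html", encoding="utf8") as f:  # hypothetical file; explicit utf8
    content = f.read()
```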

@@ -0,0 +1,163 @@
import json
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING

from unstructured.ingest.interfaces import (
    BaseConnector,
    BaseConnectorConfig,
    BaseIngestDoc,
)

if TYPE_CHECKING:
    from wikipedia import WikipediaPage


@dataclass
class SimpleWikipediaConfig(BaseConnectorConfig):
    title: str

    # Standard Connector options
    download_dir: str
    # where to write structured data
    output_dir: str
    preserve_downloads: bool = False
    re_download: bool = False
    verbose: bool = False


@dataclass
class WikipediaIngestDoc(BaseIngestDoc):
    config: SimpleWikipediaConfig = field(repr=False)
    page: "WikipediaPage"

    @property
    def filename(self) -> Path:
        raise NotImplementedError()

    @property
    def text(self) -> str:
        raise NotImplementedError()

    def _output_filename(self):
        raise NotImplementedError()

    def _create_full_tmp_dir_path(self):
        self.filename.parent.mkdir(parents=True, exist_ok=True)

    def cleanup_file(self):
        """Removes the local copy of the file (or anything else) after successful processing."""
        if not self.config.preserve_downloads:
            if self.config.verbose:
                print(f"cleaning up {self}")
            os.unlink(self.filename)

    def get_file(self):
        """Fetches the "remote" doc and stores it locally on the filesystem."""
        self._create_full_tmp_dir_path()
        if not self.config.re_download and self.filename.is_file() and self.filename.stat():
            if self.config.verbose:
                print(f"File exists: {self.filename}, skipping download")
            return
        if self.config.verbose:
            print(f"fetching {self} - PID: {os.getpid()}")
        with open(self.filename, "w", encoding="utf8") as f:
            f.write(self.text)

    def has_output(self):
        """Determine if structured output for this doc already exists."""
        output_filename = self._output_filename()
        return output_filename.is_file() and output_filename.stat()

    def write_result(self):
        """Write the structured json result for this doc. result must be json serializable."""
        output_filename = self._output_filename()
        output_filename.parent.mkdir(parents=True, exist_ok=True)
        with open(output_filename, "w", encoding="utf8") as output_f:
            json.dump(self.isd_elems_no_filename, output_f, ensure_ascii=False, indent=2)
        print(f"Wrote {output_filename}")


class WikipediaIngestHTMLDoc(WikipediaIngestDoc):
    @property
    def filename(self) -> Path:
        return (
            Path(self.config.download_dir) / f"{self.page.title}-{self.page.revision_id}.html"
        ).resolve()

    @property
    def text(self):
        return self.page.html()

    def _output_filename(self):
        return Path(self.config.output_dir) / f"{self.page.title}-{self.page.revision_id}-html.json"


class WikipediaIngestTextDoc(WikipediaIngestDoc):
    @property
    def filename(self) -> Path:
        return (
            Path(self.config.download_dir) / f"{self.page.title}-{self.page.revision_id}.txt"
        ).resolve()

    @property
    def text(self):
        return self.page.content

    def _output_filename(self):
        return Path(self.config.output_dir) / f"{self.page.title}-{self.page.revision_id}-txt.json"


class WikipediaIngestSummaryDoc(WikipediaIngestDoc):
    @property
    def filename(self) -> Path:
        return (
            Path(self.config.download_dir)
            / f"{self.page.title}-{self.page.revision_id}-summary.txt"
        ).resolve()

    @property
    def text(self):
        return self.page.summary

    def _output_filename(self):
        return (
            Path(self.config.output_dir) / f"{self.page.title}-{self.page.revision_id}-summary.json"
        )


class WikipediaConnector(BaseConnector):
    def __init__(self, config: SimpleWikipediaConfig):
        self.config = config
        self.cleanup_files = not config.preserve_downloads

    def cleanup(self, cur_dir=None):
        if not self.cleanup_files:
            return
        if cur_dir is None:
            cur_dir = self.config.download_dir
        sub_dirs = os.listdir(cur_dir)
        os.chdir(cur_dir)
        for sub_dir in sub_dirs:
            # don't traverse symlinks, not that there ever should be any
            if os.path.isdir(sub_dir) and not os.path.islink(sub_dir):
                self.cleanup(sub_dir)
        os.chdir("..")
        if len(os.listdir(cur_dir)) == 0:
            os.rmdir(cur_dir)

    def initialize(self):
        pass

    def get_ingest_docs(self):
        import wikipedia

        page = wikipedia.page(self.config.title)
        return [
            WikipediaIngestTextDoc(self.config, page),
            WikipediaIngestHTMLDoc(self.config, page),
            WikipediaIngestSummaryDoc(self.config, page),
        ]
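One design choice worth noting: `get_ingest_docs` imports `wikipedia` lazily, so the base install keeps working without the extra. A hedged variant that fails with a friendlier message (the try/except wrapper and error text are illustrative, not part of this commit):

```python
# Sketch of the lazy-import pattern with an explicit error message;
# the commit itself imports without the try/except wrapper.
def get_ingest_docs(self):
    try:
        import wikipedia
    except ImportError as e:
        raise ImportError(
            'The wikipedia extra is required: pip install "unstructured[wikipedia]"'
        ) from e
    page = wikipedia.page(self.config.title)
    return [
        WikipediaIngestTextDoc(self.config, page),
        WikipediaIngestHTMLDoc(self.config, page),
        WikipediaIngestSummaryDoc(self.config, page),
    ]
```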

@@ -9,6 +9,10 @@ import click
from unstructured.ingest.connector.github import GitHubConnector, SimpleGitHubConfig
from unstructured.ingest.connector.reddit import RedditConnector, SimpleRedditConfig
from unstructured.ingest.connector.s3_connector import S3Connector, SimpleS3Config
from unstructured.ingest.connector.wikipedia import (
    SimpleWikipediaConfig,
    WikipediaConnector,
)
from unstructured.ingest.doc_processor.generalized import initialize, process_document
@@ -80,6 +84,11 @@ class MainProcess:
    default=False,
    help="Connect to s3 without local AWS credentials.",
)
@click.option(
    "--wikipedia-page-title",
    default=None,
    help='Title of a Wikipedia page, e.g. "Open source software".',
)
@click.option(
    "--github-url",
    default=None,
@@ -172,6 +181,7 @@ class MainProcess:
@click.option("-v", "--verbose", is_flag=True, default=False)
def main(
    s3_url,
    wikipedia_page_title,
    github_url,
    github_access_token,
    github_branch,
@@ -242,6 +252,18 @@ def main(
                verbose=verbose,
            ),
        )
    elif wikipedia_page_title:
        doc_connector = WikipediaConnector(  # type: ignore
            config=SimpleWikipediaConfig(
                title=wikipedia_page_title,
                # default params:
                download_dir=download_dir,
                preserve_downloads=preserve_downloads,
                output_dir=structured_output_dir,
                re_download=re_download,
                verbose=verbose,
            ),
        )
    # Check for other connector-specific options here and define the doc_connector object
    # e.g. "elif azure_container: ..."
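The CLI wiring follows the existing connector pattern: a dashed click option surfaces as an underscored function parameter, and each connector claims its own `elif` branch. A self-contained miniature of that pattern (the names here are illustrative, not the project's actual module):

```python
# Miniature of the click option -> parameter -> connector dispatch pattern.
import click

@click.command()
@click.option("--wikipedia-page-title", default=None, help="Title of a Wikipedia page.")
def main(wikipedia_page_title):
    if wikipedia_page_title:
        click.echo(f"would build WikipediaConnector for: {wikipedia_page_title}")
    else:
        click.echo("no connector selected")

if __name__ == "__main__":
    main()
```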