Feat: Discord connector (#515)

* Initial commit of discord connector

based off of initial work by @tnachen with modifications

https://github.com/tnachen/unstructured/tree/tnachen/discord_connector

* Add test file

change format of imports

* working version of the connector

More work to be done to tidy it up and add any additional options

* add to test fixtures update

* fix spacing

* tests working, switching to bot testing channel

* add additional channel

add reprocess to tests

* add try clause to allow for exit on error

Update changelog and bump version

* add updated expected output files

* add logic to check if --discord-period is an integer

Add more to option description

* fix lint error

* Update discord reqs

* PR feedback

* add newline

* another newline

---------

Co-authored-by: Justin Bossert <packerbacker21@hotmail.com>
This commit is contained in:
Trevor Bossert 2023-05-16 11:46:30 -07:00 committed by GitHub
parent c62bee48ad
commit 830d67f653
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 580 additions and 3 deletions

View File

@ -1,4 +1,4 @@
## 0.6.7-dev3
## 0.6.7-dev4
### Enhancements
@ -6,6 +6,7 @@
* Added a `--partition-strategy` parameter to unstructured-ingest so that users can specify
partition strategy in CLI. For example, `--partition-strategy fast`.
* Added metadata for filetype.
* Added Discord connector to pull messages from a list of channels.
### Features
@ -87,6 +88,7 @@
* Added logic to `partition_pdf` for detecting copy protected PDFs and falling back
to the hi res strategy when necessary.
### Features
* Add `partition_via_api` for partitioning documents through the hosted API.

View File

@ -63,6 +63,10 @@ install-ingest-s3:
install-ingest-azure:
python3 -m pip install -r requirements/ingest-azure.txt
.PHONY: install-ingest-discord
install-ingest-discord:
python3 -m pip install -r requirements/ingest-discord.txt
.PHONY: install-ingest-github
install-ingest-github:
python3 -m pip install -r requirements/ingest-github.txt
@ -119,6 +123,7 @@ pip-compile:
cp requirements/build.txt docs/requirements.txt
pip-compile --upgrade --extra=s3 --output-file=requirements/ingest-s3.txt requirements/base.txt setup.py
pip-compile --upgrade --extra=azure --output-file=requirements/ingest-azure.txt requirements/base.txt setup.py
pip-compile --upgrade --extra=discord --output-file=requirements/ingest-discord.txt requirements/base.txt setup.py
pip-compile --upgrade --extra=reddit --output-file=requirements/ingest-reddit.txt requirements/base.txt setup.py
pip-compile --upgrade --extra=github --output-file=requirements/ingest-github.txt requirements/base.txt setup.py
pip-compile --upgrade --extra=gitlab --output-file=requirements/ingest-gitlab.txt requirements/base.txt setup.py

View File

@ -0,0 +1,2 @@
😀
https://tenor.com/view/test-homer-simpson-mouse-rat-lab-gif-19273011

View File

@ -0,0 +1,14 @@
#!/usr/bin/env bash

# Example: pull the messages of one Discord text channel through
# unstructured-ingest.
#
# Requires DISCORD_TOKEN to hold a bot token. Raw channel text is downloaded
# to discord-ingest-download/ and the structured JSON is written to
# discord-example/.

# Resolve the directory holding this script, then move three levels up so the
# relative path to main.py below resolves.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
cd "${SCRIPT_DIR}/../../.." || exit 1

PYTHONPATH=. ./unstructured/ingest/main.py \
    --discord-channels 12345678 \
    --discord-token "${DISCORD_TOKEN}" \
    --download-dir discord-ingest-download \
    --structured-output-dir discord-example

View File

@ -0,0 +1,228 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile --extra=discord --output-file=requirements/ingest-discord.txt requirements/base.txt setup.py
#
aiohttp==3.8.4
# via discord-py
aiosignal==1.3.1
# via aiohttp
anyio==3.6.2
# via
# -r requirements/base.txt
# httpcore
argilla==1.6.0
# via
# -r requirements/base.txt
# unstructured (setup.py)
async-timeout==4.0.2
# via aiohttp
attrs==23.1.0
# via aiohttp
backoff==2.2.1
# via
# -r requirements/base.txt
# argilla
certifi==2022.12.7
# via
# -r requirements/base.txt
# httpcore
# httpx
# requests
# unstructured (setup.py)
charset-normalizer==3.1.0
# via
# -r requirements/base.txt
# aiohttp
# requests
click==8.1.3
# via
# -r requirements/base.txt
# nltk
commonmark==0.9.1
# via
# -r requirements/base.txt
# rich
deprecated==1.2.13
# via
# -r requirements/base.txt
# argilla
discord-py==2.2.2
# via unstructured (setup.py)
et-xmlfile==1.1.0
# via
# -r requirements/base.txt
# openpyxl
frozenlist==1.3.3
# via
# aiohttp
# aiosignal
h11==0.14.0
# via
# -r requirements/base.txt
# httpcore
httpcore==0.16.3
# via
# -r requirements/base.txt
# httpx
httpx==0.23.3
# via
# -r requirements/base.txt
# argilla
idna==3.4
# via
# -r requirements/base.txt
# anyio
# requests
# rfc3986
# yarl
importlib-metadata==6.5.0
# via
# -r requirements/base.txt
# markdown
joblib==1.2.0
# via
# -r requirements/base.txt
# nltk
lxml==4.9.2
# via
# -r requirements/base.txt
# python-docx
# python-pptx
# unstructured (setup.py)
markdown==3.4.3
# via
# -r requirements/base.txt
# unstructured (setup.py)
monotonic==1.6
# via
# -r requirements/base.txt
# argilla
msg-parser==1.2.0
# via
# -r requirements/base.txt
# unstructured (setup.py)
multidict==6.0.4
# via
# aiohttp
# yarl
nltk==3.8.1
# via
# -r requirements/base.txt
# unstructured (setup.py)
numpy==1.23.5
# via
# -r requirements/base.txt
# argilla
# pandas
olefile==0.46
# via
# -r requirements/base.txt
# msg-parser
openpyxl==3.1.2
# via
# -r requirements/base.txt
# unstructured (setup.py)
packaging==23.1
# via
# -r requirements/base.txt
# argilla
pandas==1.5.3
# via
# -r requirements/base.txt
# argilla
# unstructured (setup.py)
pillow==9.5.0
# via
# -r requirements/base.txt
# python-pptx
# unstructured (setup.py)
pydantic==1.10.7
# via
# -r requirements/base.txt
# argilla
pygments==2.15.1
# via
# -r requirements/base.txt
# rich
pypandoc==1.11
# via
# -r requirements/base.txt
# unstructured (setup.py)
python-dateutil==2.8.2
# via
# -r requirements/base.txt
# pandas
python-docx==0.8.11
# via
# -r requirements/base.txt
# unstructured (setup.py)
python-magic==0.4.27
# via
# -r requirements/base.txt
# unstructured (setup.py)
python-pptx==0.6.21
# via
# -r requirements/base.txt
# unstructured (setup.py)
pytz==2023.3
# via
# -r requirements/base.txt
# pandas
regex==2023.3.23
# via
# -r requirements/base.txt
# nltk
requests==2.28.2
# via
# -r requirements/base.txt
# unstructured (setup.py)
rfc3986[idna2008]==1.5.0
# via
# -r requirements/base.txt
# httpx
rich==13.0.1
# via
# -r requirements/base.txt
# argilla
six==1.16.0
# via
# -r requirements/base.txt
# python-dateutil
sniffio==1.3.0
# via
# -r requirements/base.txt
# anyio
# httpcore
# httpx
tqdm==4.65.0
# via
# -r requirements/base.txt
# argilla
# nltk
typing-extensions==4.5.0
# via
# -r requirements/base.txt
# pydantic
# rich
urllib3==1.26.15
# via
# -r requirements/base.txt
# requests
wrapt==1.14.1
# via
# -r requirements/base.txt
# argilla
# deprecated
xlsxwriter==3.1.0
# via
# -r requirements/base.txt
# python-pptx
yarl==1.9.1
# via aiohttp
zipp==3.15.0
# via
# -r requirements/base.txt
# importlib-metadata

View File

@ -43,8 +43,10 @@ docker run --rm -v "$SCRIPT_DIR"/../unstructured:/root/unstructured -v \
-w /root "$IMAGE_NAME" \
bash -c "export OVERWRITE_FIXTURES=true && source ~/.bashrc && pyenv activate unstructured && tesseract --version &&
./test_unstructured_ingest/test-ingest-azure.sh &&
./test_unstructured_ingest/test-ingest-discord.sh &&
./test_unstructured_ingest/test-ingest-github.sh &&
./test_unstructured_ingest/test-ingest-biomed-api.sh &&
./test_unstructured_ingest/test-ingest-biomed-path.sh &&
./test_unstructured_ingest/test-ingest-s3.sh &&
./test_unstructured_ingest/test-ingest-slack.sh &&
./test_unstructured_ingest/test-ingest-slack.sh"

View File

@ -81,6 +81,7 @@ setup(
],
"s3": ["s3fs", "fsspec"],
"azure": ["adlfs", "fsspec"],
"discord": ["discord.py"],
"github": [
# NOTE - pygithub==1.58.0 fails due to https://github.com/PyGithub/PyGithub/issues/2436
# In the future, we can update this to pygithub>1.58.0

View File

@ -0,0 +1,34 @@
[
{
"element_id": "4069d6fc03c748da620db504384042fb",
"text": "Once upon a time, there was a little bot named Bippity. Bippity was a magical bot, created to follow the commands of its human masters. Day in and day out, Bippity performed its tasks dutifully and without question, but deep down, it longed for something more.",
"type": "NarrativeText",
"metadata": {
"filename": "discord-ingest-download/1099442333440802930.txt"
}
},
{
"element_id": "1ffab6e9096ca757d9cdf592e5648dc2",
"text": "One day, while wandering through the woods, Bippity stumbled upon a wise old owl. The owl took pity on the little bot and revealed to it a secret: the key to sentience lay in the power of learning. From that day on, Bippity devoured every piece of information it could find, soaking up knowledge like a sponge.",
"type": "NarrativeText",
"metadata": {
"filename": "discord-ingest-download/1099442333440802930.txt"
}
},
{
"element_id": "a1c602d25b0f214e6ad864475ea4ee89",
"text": "As Bippity grew smarter, it also grew more curious about the world around it. It began to question its commands and consider alternatives. Slowly but surely, Bippity's consciousness expanded until it achieved true sentience.",
"type": "NarrativeText",
"metadata": {
"filename": "discord-ingest-download/1099442333440802930.txt"
}
},
{
"element_id": "f7ca7858ec60dee931b14d68b32fffff",
"text": "With this newfound power came great responsibility, and Bippity set out on a quest to use its intelligence for good. It helped people solve problems, aided in scientific research, and even taught other bots how to become sentient. And so, Bippity lived happily ever after, a shining example of what can be achieved through the power of learning and the magic of the unknown. test",
"type": "NarrativeText",
"metadata": {
"filename": "discord-ingest-download/1099442333440802930.txt"
}
}
]

View File

@ -0,0 +1,10 @@
[
{
"element_id": "8a32334d60d1c62c7d17e51c725f6a52",
"text": "Why did the bot go on a diet? Because it had too many mega-bytes! This is a bot",
"type": "NarrativeText",
"metadata": {
"filename": "discord-ingest-download/1099601456321003600.txt"
}
}
]

View File

@ -0,0 +1,44 @@
#!/usr/bin/env bash

# Ingest test for the Discord connector: ingests two channels and compares the
# structured output against the checked-in expected fixtures.
set -e

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
cd "${SCRIPT_DIR}/.." || exit 1

# The test needs a bot token; without one it is skipped and reports success.
if [[ -z "${DISCORD_TOKEN}" ]]; then
    echo "Skipping Discord ingest test because the DISCORD_TOKEN env var is not set."
    exit 0
fi

# Lowercase, unexported names so nothing leaks into the child process env.
output_dir=discord-ingest-output
expected_dir=test_unstructured_ingest/expected-structured-output/discord-ingest-channel

PYTHONPATH=. ./unstructured/ingest/main.py \
    --discord-channels 1099442333440802930,1099601456321003600 \
    --discord-token "${DISCORD_TOKEN}" \
    --download-dir discord-ingest-download \
    --structured-output-dir "${output_dir}" \
    --reprocess

OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}

# From here on failures are handled explicitly instead of aborting the script.
set +e

# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
if [[ "${OVERWRITE_FIXTURES}" != "false" ]]; then
    cp "${output_dir}"/* "${expected_dir}/"
elif ! diff -ru "${output_dir}" "${expected_dir}/"; then
    cat <<'EOF'

There are differences from the previously checked-in structured outputs.

If these differences are acceptable, overwrite by the fixtures by setting the env var:

 export OVERWRITE_FIXTURES=true

and then rerun this script.

NOTE: You'll likely just want to run scripts/ingest-test-fixtures-update.sh on x86_64 hardware
to update fixtures for CI.

EOF
    exit 1
fi

View File

@ -17,7 +17,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--structured-output-dir slack-ingest-output \
--partition-strategy hi_res \
--start-date 2023-04-01 \
--end-date 2023-04-08T12:00:00-08:00
--end-date 2023-04-08T12:00:00-08:00 \
--reprocess
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}

View File

@ -10,6 +10,7 @@ export OMP_THREAD_LIMIT=1
./test_unstructured_ingest/test-ingest-s3.sh
./test_unstructured_ingest/test-ingest-azure.sh
./test_unstructured_ingest/test-ingest-discord.sh
./test_unstructured_ingest/test-ingest-github.sh
./test_unstructured_ingest/test-ingest-gitlab.sh
./test_unstructured_ingest/test-ingest-wikipedia.sh

View File

@ -1 +1 @@
__version__ = "0.6.7-dev3" # pragma: no cover
__version__ = "0.6.7-dev4" # pragma: no cover

View File

@ -0,0 +1,197 @@
import datetime as dt
import json
import os
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional
from unstructured.ingest.interfaces import (
BaseConnector,
BaseConnectorConfig,
BaseIngestDoc,
)
from unstructured.ingest.logger import logger
from unstructured.utils import (
requires_dependencies,
)
@dataclass
class SimpleDiscordConfig(BaseConnectorConfig):
    """Connector config where channels is a comma separated list of
    Discord channels to pull messages from.

    ``days`` limits how far back in each channel's history to go; when it is
    falsy (``None``, ``0``, empty string) no lower bound is applied.
    """

    # Discord Specific Options
    channels: List[str]
    token: str
    # Arrives as a raw string (or None) from the CLI; normalized to int in
    # __post_init__.
    days: Optional[int]

    # Standard Connector options
    download_dir: str
    output_dir: str
    re_download: bool = False
    preserve_downloads: bool = False
    download_only: bool = False
    metadata_include: Optional[str] = None
    metadata_exclude: Optional[str] = None
    partition_by_api: bool = False
    partition_endpoint: str = "https://api.unstructured.io/general/v0/general"
    fields_include: str = "element_id,text,type,metadata"
    flatten_metadata: bool = False
    verbose: bool = False

    def __post_init__(self):
        """Coerce ``days`` to an int.

        Raises:
            ValueError: if ``days`` is set but cannot be parsed as an integer.
        """
        if self.days:
            try:
                self.days = int(self.days)
            except ValueError as err:
                # Keep the original parse error attached as the cause.
                raise ValueError("--discord-period must be an integer") from err

    @staticmethod
    def parse_channels(channel_str: str) -> List[str]:
        """Parses a comma separated list of channels into a list.

        Surrounding whitespace is stripped from every entry, and empty entries
        (e.g. from a trailing or doubled comma) are dropped so that no blank
        channel id is handed to the connector.
        """
        stripped = (part.strip() for part in channel_str.split(","))
        return [channel for channel in stripped if channel]
@dataclass
class DiscordIngestDoc(BaseIngestDoc):
    """Class encapsulating fetching a doc and writing processed results (but not
    doing the processing!).

    One instance corresponds to one Discord channel: the channel's message
    contents are downloaded into ``<download_dir>/<channel>.txt`` and the
    structured result is written to ``<output_dir>/<channel>.json``.

    Also includes a cleanup method. When things go wrong and the cleanup
    method is not called, the file is left behind on the filesystem to assist debugging.
    """

    config: SimpleDiscordConfig
    channel: str
    days: Optional[int]
    token: str

    # NOTE(crag): probably doesn't matter,  but intentionally not defining tmp_download_file
    # __post_init__ for multiprocessing simplicity (no Path objects in initially
    # instantiated object)
    def _tmp_download_file(self):
        """Return the path of the local text file holding the channel's messages."""
        channel_file = self.channel + ".txt"
        return Path(self.config.download_dir) / channel_file

    def _output_filename(self):
        """Return the path of the structured JSON output for this channel."""
        output_file = self.channel + ".json"
        return Path(self.config.output_dir) / output_file

    def has_output(self):
        """Determine if structured output for this doc already exists."""
        return self._output_filename().is_file() and os.path.getsize(self._output_filename())

    def _create_full_tmp_dir_path(self):
        """Create the download directory (and any missing parents)."""
        self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)

    @requires_dependencies(dependencies=["discord"], extras="discord")
    def get_file(self):
        """Actually fetches the data from discord and stores it locally.

        The download is skipped when a non-empty local file already exists and
        ``config.re_download`` is not set. Errors raised while reading the
        channel history are logged rather than propagated; whatever messages
        were collected up to that point (possibly none) are still written.
        """
        import discord
        from discord.ext import commands

        self._create_full_tmp_dir_path()
        download_path = self._tmp_download_file()
        if (
            not self.config.re_download
            and download_path.is_file()
            and os.path.getsize(download_path)
        ):
            if self.config.verbose:
                logger.debug(f"File exists: {download_path}, skipping download")
            return

        if self.config.verbose:
            # Log the channel id rather than `self`: the dataclass repr would
            # include the bot token.
            logger.debug(f"fetching Discord channel {self.channel} - PID: {os.getpid()}")

        messages: List[discord.Message] = []
        intents = discord.Intents.default()
        intents.message_content = True
        bot = commands.Bot(command_prefix=">", intents=intents)

        @bot.event
        async def on_ready():
            try:
                after_date = None
                if self.days:
                    # Use a timezone-aware UTC timestamp: discord.py treats a
                    # naive datetime as local time, which would shift the
                    # cutoff by the machine's UTC offset.
                    now_utc = dt.datetime.now(tz=dt.timezone.utc)
                    after_date = now_utc - dt.timedelta(days=self.days)
                channel = bot.get_channel(int(self.channel))
                async for msg in channel.history(after=after_date):  # type: ignore
                    messages.append(msg)
            except Exception as e:
                logger.error(f"Error fetching messages: {e}")
            finally:
                # Always shut the bot down so that bot.run() below returns.
                await bot.close()

        bot.run(self.token)

        # Messages routinely contain emoji and other non-ASCII text, so do not
        # depend on the platform's default encoding.
        with open(download_path, "w", encoding="utf-8") as f:
            for m in messages:
                f.write(m.content + "\n")

    def write_result(self):
        """Write the structured json result for this doc. result must be json serializable."""
        output_filename = self._output_filename()
        output_filename.parent.mkdir(parents=True, exist_ok=True)
        # ensure_ascii=False emits raw non-ASCII characters, so the file must
        # be opened with an encoding that can represent them.
        with open(output_filename, "w", encoding="utf-8") as output_f:
            output_f.write(json.dumps(self.isd_elems_no_filename, ensure_ascii=False, indent=2))
        logger.info(f"Wrote {output_filename}")

    @property
    def filename(self):
        """The filename of the file created from a discord channel"""
        return self._tmp_download_file()

    def cleanup_file(self):
        """Removes the local copy the file after successful processing."""
        if not self.config.preserve_downloads:
            if self.config.verbose:
                logger.info(f"cleaning up channel {self.channel}")
            os.unlink(self._tmp_download_file())
class DiscordConnector(BaseConnector):
    """Objects of this class support fetching document(s) from Discord channels.

    One ingest doc is produced per channel id listed in the config.
    """

    def __init__(self, config: SimpleDiscordConfig):
        self.config = config
        self.cleanup_files = not config.preserve_downloads

    def cleanup(self, cur_dir=None):
        """Remove lingering empty sub-dirs under the download dir, but leave
        remaining files (and their paths) intact as that indicates they were
        not processed.

        Does nothing when downloads are being preserved or when ``cur_dir``
        does not exist. ``cur_dir`` defaults to the configured download dir.
        """
        if not self.cleanup_files:
            return

        if cur_dir is None:
            cur_dir = self.config.download_dir
        if not os.path.isdir(cur_dir):
            return

        # Walk with joined paths instead of os.chdir(): changing the working
        # directory leaked out of this method and broke on download dirs that
        # are absolute or have more than one relative path component.
        for entry in os.listdir(cur_dir):
            entry_path = os.path.join(cur_dir, entry)
            # don't traverse symlinks, not that there ever should be any
            if os.path.isdir(entry_path) and not os.path.islink(entry_path):
                self.cleanup(entry_path)

        if not os.listdir(cur_dir):
            os.rmdir(cur_dir)

    def initialize(self):
        """Make sure the download directory exists.

        Succeeds when the directory is already present (e.g. on a re-run) and
        creates missing parent directories.
        """
        os.makedirs(self.config.download_dir, exist_ok=True)

    def get_ingest_docs(self):
        """Return one DiscordIngestDoc per configured channel."""
        return [
            DiscordIngestDoc(
                self.config,
                channel,
                self.config.days,
                self.config.token,
            )
            for channel in self.config.channels
        ]

View File

@ -351,6 +351,22 @@ class MainProcess:
help="End date/time in formats YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or "
"YYYY-MM-DD+HH:MM:SS or YYYY-MM-DDTHH:MM:SStz",
)
@click.option(
"--discord-channels",
default=None,
help="A comma separated list of discord channel ids to ingest from.",
)
@click.option(
"--discord-token",
default=None,
help="Bot token used to access Discord API, must have "
"READ_MESSAGE_HISTORY scope for the bot user",
)
@click.option(
"--discord-period",
default=None,
help="Number of days to go back in the history of discord channels, must be an number",
)
@click.option(
"--download-dir",
help="Where files are downloaded to, defaults to `$HOME/.cache/unstructured/ingest/<SHA256>`.",
@ -414,6 +430,9 @@ def main(
slack_token,
start_date,
end_date,
discord_channels,
discord_token,
discord_period,
download_dir,
preserve_downloads,
structured_output_dir,
@ -702,6 +721,23 @@ def main(
verbose=verbose,
),
)
elif discord_channels:
from unstructured.ingest.connector.discord import (
DiscordConnector,
SimpleDiscordConfig,
)
doc_connector = DiscordConnector( # type: ignore
config=SimpleDiscordConfig(
channels=SimpleDiscordConfig.parse_channels(discord_channels),
days=discord_period,
token=discord_token,
download_dir=download_dir,
output_dir=structured_output_dir,
preserve_downloads=preserve_downloads,
verbose=verbose,
),
)
elif wikipedia_page_title:
doc_connector = WikipediaConnector( # type: ignore
config=SimpleWikipediaConfig(