mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
Feat: Discord connector (#515)
* Initial commit of discord connector based off of initial work by @tnachen with modifications https://github.com/tnachen/unstructured/tree/tnachen/discord_connector * Add test file change format of imports * working version of the connector More work to be done to tidy it up and add any additional options * add to test fixtures update * fix spacing * tests working, switching to bot testing channel * add additional channel add reprocess to tests * add try clause to allow for exit on error Update changelog and bump version * add updated expected output filtes * add logic to check if —discord-period is an integer Add more to option description * fix lint error * Update discord reqs * PR feedback * add newline * another newline --------- Co-authored-by: Justin Bossert <packerbacker21@hotmail.com>
This commit is contained in:
parent
c62bee48ad
commit
830d67f653
@ -1,4 +1,4 @@
|
||||
## 0.6.7-dev3
|
||||
## 0.6.7-dev4
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -6,6 +6,7 @@
|
||||
* Added a `--partition-strategy` parameter to unstructured-ingest so that users can specify
|
||||
partition strategy in CLI. For example, `--partition-strategy fast`.
|
||||
* Added metadata for filetype.
|
||||
* Add Discord connector to pull messages from a list of channels
|
||||
|
||||
### Features
|
||||
|
||||
@ -87,6 +88,7 @@
|
||||
* Added logic to `partition_pdf` for detecting copy protected PDFs and falling back
|
||||
to the hi res strategy when necessary.
|
||||
|
||||
|
||||
### Features
|
||||
|
||||
* Add `partition_via_api` for partitioning documents through the hosted API.
|
||||
|
5
Makefile
5
Makefile
@ -63,6 +63,10 @@ install-ingest-s3:
|
||||
install-ingest-azure:
|
||||
python3 -m pip install -r requirements/ingest-azure.txt
|
||||
|
||||
## install-ingest-discord: installs the pinned requirements for the Discord ingest connector
.PHONY: install-ingest-discord
install-ingest-discord:
	python3 -m pip install -r requirements/ingest-discord.txt
|
||||
|
||||
.PHONY: install-ingest-github
|
||||
install-ingest-github:
|
||||
python3 -m pip install -r requirements/ingest-github.txt
|
||||
@ -119,6 +123,7 @@ pip-compile:
|
||||
cp requirements/build.txt docs/requirements.txt
|
||||
pip-compile --upgrade --extra=s3 --output-file=requirements/ingest-s3.txt requirements/base.txt setup.py
|
||||
pip-compile --upgrade --extra=azure --output-file=requirements/ingest-azure.txt requirements/base.txt setup.py
|
||||
pip-compile --upgrade --extra=discord --output-file=requirements/ingest-discord.txt requirements/base.txt setup.py
|
||||
pip-compile --upgrade --extra=reddit --output-file=requirements/ingest-reddit.txt requirements/base.txt setup.py
|
||||
pip-compile --upgrade --extra=github --output-file=requirements/ingest-github.txt requirements/base.txt setup.py
|
||||
pip-compile --upgrade --extra=gitlab --output-file=requirements/ingest-gitlab.txt requirements/base.txt setup.py
|
||||
|
2
discord-test/1100149908494876775.txt
Normal file
2
discord-test/1100149908494876775.txt
Normal file
@ -0,0 +1,2 @@
|
||||
😀
|
||||
https://tenor.com/view/test-homer-simpson-mouse-rat-lab-gif-19273011
|
14
examples/ingest/discord/ingest.sh
Normal file
14
examples/ingest/discord/ingest.sh
Normal file
@ -0,0 +1,14 @@
|
||||
#!/usr/bin/env bash

# Ingests the messages of a Discord text channel.
#
# The channel is downloaded to discord-ingest-download/ and the
# structured outputs are stored in discord-example/
#
# Requires the DISCORD_TOKEN env var to hold a bot token for the channel's server.

# Fail fast with a clear message instead of passing an empty token to the CLI.
if [ -z "$DISCORD_TOKEN" ]; then
    echo "The DISCORD_TOKEN env var must be set to a Discord bot token." >&2
    exit 1
fi

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/../../.. || exit 1

PYTHONPATH=. ./unstructured/ingest/main.py \
    --discord-channels 12345678 \
    --discord-token "$DISCORD_TOKEN" \
    --download-dir discord-ingest-download \
    --structured-output-dir discord-example
|
228
requirements/ingest-discord.txt
Normal file
228
requirements/ingest-discord.txt
Normal file
@ -0,0 +1,228 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile --extra=discord --output-file=requirements/ingest-discord.txt requirements/base.txt setup.py
|
||||
#
|
||||
|
||||
aiohttp==3.8.4
|
||||
# via discord-py
|
||||
aiosignal==1.3.1
|
||||
# via aiohttp
|
||||
anyio==3.6.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# httpcore
|
||||
argilla==1.6.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
async-timeout==4.0.2
|
||||
# via aiohttp
|
||||
attrs==23.1.0
|
||||
# via aiohttp
|
||||
backoff==2.2.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
certifi==2022.12.7
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# httpcore
|
||||
# httpx
|
||||
# requests
|
||||
# unstructured (setup.py)
|
||||
charset-normalizer==3.1.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# aiohttp
|
||||
# requests
|
||||
click==8.1.3
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# nltk
|
||||
commonmark==0.9.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# rich
|
||||
deprecated==1.2.13
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
discord-py==2.2.2
|
||||
# via unstructured (setup.py)
|
||||
et-xmlfile==1.1.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# openpyxl
|
||||
frozenlist==1.3.3
|
||||
# via
|
||||
# aiohttp
|
||||
# aiosignal
|
||||
h11==0.14.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# httpcore
|
||||
httpcore==0.16.3
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# httpx
|
||||
httpx==0.23.3
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
idna==3.4
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# anyio
|
||||
# requests
|
||||
# rfc3986
|
||||
# yarl
|
||||
importlib-metadata==6.5.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# markdown
|
||||
joblib==1.2.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# nltk
|
||||
lxml==4.9.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# python-docx
|
||||
# python-pptx
|
||||
# unstructured (setup.py)
|
||||
markdown==3.4.3
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
monotonic==1.6
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
msg-parser==1.2.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
multidict==6.0.4
|
||||
# via
|
||||
# aiohttp
|
||||
# yarl
|
||||
nltk==3.8.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
numpy==1.23.5
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
# pandas
|
||||
olefile==0.46
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# msg-parser
|
||||
openpyxl==3.1.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
packaging==23.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
pandas==1.5.3
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
# unstructured (setup.py)
|
||||
pillow==9.5.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# python-pptx
|
||||
# unstructured (setup.py)
|
||||
pydantic==1.10.7
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
pygments==2.15.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# rich
|
||||
pypandoc==1.11
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
python-dateutil==2.8.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# pandas
|
||||
python-docx==0.8.11
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
python-magic==0.4.27
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
python-pptx==0.6.21
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
pytz==2023.3
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# pandas
|
||||
regex==2023.3.23
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# nltk
|
||||
requests==2.28.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
rfc3986[idna2008]==1.5.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# httpx
|
||||
rich==13.0.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
six==1.16.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# python-dateutil
|
||||
sniffio==1.3.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# anyio
|
||||
# httpcore
|
||||
# httpx
|
||||
tqdm==4.65.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
# nltk
|
||||
typing-extensions==4.5.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# pydantic
|
||||
# rich
|
||||
urllib3==1.26.15
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# requests
|
||||
wrapt==1.14.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
# deprecated
|
||||
xlsxwriter==3.1.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# python-pptx
|
||||
yarl==1.9.1
|
||||
# via aiohttp
|
||||
zipp==3.15.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# importlib-metadata
|
@ -43,8 +43,10 @@ docker run --rm -v "$SCRIPT_DIR"/../unstructured:/root/unstructured -v \
|
||||
-w /root "$IMAGE_NAME" \
|
||||
bash -c "export OVERWRITE_FIXTURES=true && source ~/.bashrc && pyenv activate unstructured && tesseract --version &&
|
||||
./test_unstructured_ingest/test-ingest-azure.sh &&
|
||||
./test_unstructured_ingest/test-ingest-discord.sh &&
|
||||
./test_unstructured_ingest/test-ingest-github.sh &&
|
||||
./test_unstructured_ingest/test-ingest-biomed-api.sh &&
|
||||
./test_unstructured_ingest/test-ingest-biomed-path.sh &&
|
||||
./test_unstructured_ingest/test-ingest-s3.sh &&
|
||||
./test_unstructured_ingest/test-ingest-slack.sh &&
|
||||
./test_unstructured_ingest/test-ingest-slack.sh"
|
||||
|
1
setup.py
1
setup.py
@ -81,6 +81,7 @@ setup(
|
||||
],
|
||||
"s3": ["s3fs", "fsspec"],
|
||||
"azure": ["adlfs", "fsspec"],
|
||||
"discord": ["discord.py"],
|
||||
"github": [
|
||||
# NOTE - pygithub==1.58.0 fails due to https://github.com/PyGithub/PyGithub/issues/2436
|
||||
# In the future, we can update this to pygithub>1.58.0
|
||||
|
@ -0,0 +1,34 @@
|
||||
[
|
||||
{
|
||||
"element_id": "4069d6fc03c748da620db504384042fb",
|
||||
"text": "Once upon a time, there was a little bot named Bippity. Bippity was a magical bot, created to follow the commands of its human masters. Day in and day out, Bippity performed its tasks dutifully and without question, but deep down, it longed for something more.",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"filename": "discord-ingest-download/1099442333440802930.txt"
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "1ffab6e9096ca757d9cdf592e5648dc2",
|
||||
"text": "One day, while wandering through the woods, Bippity stumbled upon a wise old owl. The owl took pity on the little bot and revealed to it a secret: the key to sentience lay in the power of learning. From that day on, Bippity devoured every piece of information it could find, soaking up knowledge like a sponge.",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"filename": "discord-ingest-download/1099442333440802930.txt"
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "a1c602d25b0f214e6ad864475ea4ee89",
|
||||
"text": "As Bippity grew smarter, it also grew more curious about the world around it. It began to question its commands and consider alternatives. Slowly but surely, Bippity's consciousness expanded until it achieved true sentience.",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"filename": "discord-ingest-download/1099442333440802930.txt"
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "f7ca7858ec60dee931b14d68b32fffff",
|
||||
"text": "With this newfound power came great responsibility, and Bippity set out on a quest to use its intelligence for good. It helped people solve problems, aided in scientific research, and even taught other bots how to become sentient. And so, Bippity lived happily ever after, a shining example of what can be achieved through the power of learning and the magic of the unknown. test",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"filename": "discord-ingest-download/1099442333440802930.txt"
|
||||
}
|
||||
}
|
||||
]
|
@ -0,0 +1,10 @@
|
||||
[
|
||||
{
|
||||
"element_id": "8a32334d60d1c62c7d17e51c725f6a52",
|
||||
"text": "Why did the bot go on a diet? Because it had too many mega-bytes! This is a bot",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"filename": "discord-ingest-download/1099601456321003600.txt"
|
||||
}
|
||||
}
|
||||
]
|
44
test_unstructured_ingest/test-ingest-discord.sh
Executable file
44
test_unstructured_ingest/test-ingest-discord.sh
Executable file
@ -0,0 +1,44 @@
|
||||
#!/usr/bin/env bash

# Ingest test for the Discord connector: ingests two channels and compares the
# structured output against the checked-in fixtures (or overwrites the fixtures
# when OVERWRITE_FIXTURES is set to anything other than "false").

set -e

SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
cd "$SCRIPT_DIR"/.. || exit 1

EXPECTED_OUTPUT_DIR=test_unstructured_ingest/expected-structured-output/discord-ingest-channel

if [ -z "$DISCORD_TOKEN" ]; then
    echo "Skipping Discord ingest test because the DISCORD_TOKEN env var is not set."
    exit 0
fi

PYTHONPATH=. ./unstructured/ingest/main.py \
    --discord-channels 1099442333440802930,1099601456321003600 \
    --discord-token "$DISCORD_TOKEN" \
    --download-dir discord-ingest-download \
    --structured-output-dir discord-ingest-output \
    --reprocess

OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}

# errexit is disabled so that a non-zero diff can be reported below instead of
# aborting the script; failures in this section must therefore be checked explicitly.
set +e

# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
if [[ "$OVERWRITE_FIXTURES" != "false" ]]; then

    # The fixture directory may not exist yet, and with errexit off a failed
    # copy would otherwise go unnoticed.
    mkdir -p "$EXPECTED_OUTPUT_DIR" || exit 1
    cp discord-ingest-output/* "$EXPECTED_OUTPUT_DIR"/ || exit 1

elif ! diff -ru discord-ingest-output "$EXPECTED_OUTPUT_DIR"/; then
    echo
    echo "There are differences from the previously checked-in structured outputs."
    echo
    echo "If these differences are acceptable, overwrite by the fixtures by setting the env var:"
    echo
    echo " export OVERWRITE_FIXTURES=true"
    echo
    echo "and then rerun this script."
    echo
    echo "NOTE: You'll likely just want to run scripts/ingest-test-fixtures-update.sh on x86_64 hardware"
    echo "to update fixtures for CI."
    echo
    exit 1
fi
|
@ -17,7 +17,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--structured-output-dir slack-ingest-output \
|
||||
--partition-strategy hi_res \
|
||||
--start-date 2023-04-01 \
|
||||
--end-date 2023-04-08T12:00:00-08:00
|
||||
--end-date 2023-04-08T12:00:00-08:00 \
|
||||
--reprocess
|
||||
|
||||
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
|
||||
|
||||
|
@ -10,6 +10,7 @@ export OMP_THREAD_LIMIT=1
|
||||
|
||||
./test_unstructured_ingest/test-ingest-s3.sh
|
||||
./test_unstructured_ingest/test-ingest-azure.sh
|
||||
./test_unstructured_ingest/test-ingest-discord.sh
|
||||
./test_unstructured_ingest/test-ingest-github.sh
|
||||
./test_unstructured_ingest/test-ingest-gitlab.sh
|
||||
./test_unstructured_ingest/test-ingest-wikipedia.sh
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.6.7-dev3" # pragma: no cover
|
||||
__version__ = "0.6.7-dev4" # pragma: no cover
|
||||
|
197
unstructured/ingest/connector/discord.py
Normal file
197
unstructured/ingest/connector/discord.py
Normal file
@ -0,0 +1,197 @@
|
||||
import datetime as dt
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
from unstructured.ingest.interfaces import (
|
||||
BaseConnector,
|
||||
BaseConnectorConfig,
|
||||
BaseIngestDoc,
|
||||
)
|
||||
from unstructured.ingest.logger import logger
|
||||
from unstructured.utils import (
|
||||
requires_dependencies,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
class SimpleDiscordConfig(BaseConnectorConfig):
    """Connector config for pulling messages from a list of Discord channels.

    ``channels`` is the already-parsed list of channel ids (see
    ``parse_channels`` for turning the comma separated CLI value into a list).
    """

    # Discord Specific Options
    channels: List[str]
    token: str
    # Number of days of history to fetch; a falsy value (None, 0, "") disables
    # the date filter. May arrive as a string from the CLI and is coerced to
    # int in __post_init__.
    days: Optional[int]

    # Standard Connector options
    download_dir: str
    output_dir: str
    re_download: bool = False
    preserve_downloads: bool = False
    download_only: bool = False
    metadata_include: Optional[str] = None
    metadata_exclude: Optional[str] = None
    partition_by_api: bool = False
    partition_endpoint: str = "https://api.unstructured.io/general/v0/general"
    fields_include: str = "element_id,text,type,metadata"
    flatten_metadata: bool = False
    verbose: bool = False

    def __post_init__(self):
        """Coerce ``days`` to an int when it was provided.

        Raises:
            ValueError: if ``days`` is set but cannot be converted to an integer.
        """
        if self.days:
            try:
                self.days = int(self.days)
            except (TypeError, ValueError) as exc:
                raise ValueError("--discord-period must be an integer") from exc

    @staticmethod
    def parse_channels(channel_str: str) -> List[str]:
        """Parses a comma separated list of channels into a list.

        Surrounding whitespace is stripped and empty entries (e.g. produced by
        a trailing comma) are dropped, since an empty channel id cannot be
        converted to an integer when the channel is fetched.
        """
        stripped = (part.strip() for part in channel_str.split(","))
        return [channel for channel in stripped if channel]
|
||||
|
||||
|
||||
@dataclass
class DiscordIngestDoc(BaseIngestDoc):
    """Class encapsulating fetching a doc and writing processed results (but not
    doing the processing!).

    One doc corresponds to one Discord channel: its messages are downloaded to
    ``<download_dir>/<channel>.txt`` and the structured output is written to
    ``<output_dir>/<channel>.json``.

    Also includes a cleanup method. When things go wrong and the cleanup
    method is not called, the file is left behind on the filesystem to assist debugging.
    """

    config: SimpleDiscordConfig
    channel: str
    # Number of days of history to fetch; a falsy value disables the date filter.
    days: Optional[int]
    token: str

    # NOTE(crag): probably doesn't matter, but intentionally not defining tmp_download_file
    # __post_init__ for multiprocessing simplicity (no Path objects in initially
    # instantiated object)
    def _tmp_download_file(self):
        """Path of the local text file holding the channel's messages."""
        return Path(self.config.download_dir) / (self.channel + ".txt")

    def _output_filename(self):
        """Path of the structured json output for the channel."""
        return Path(self.config.output_dir) / (self.channel + ".json")

    def has_output(self):
        """Determine if structured output for this doc already exists (and is non-empty)."""
        output_path = self._output_filename()
        return output_path.is_file() and os.path.getsize(output_path) > 0

    def _create_full_tmp_dir_path(self):
        """Create the download directory (and any missing parents)."""
        self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True)

    @requires_dependencies(dependencies=["discord"], extras="discord")
    def get_file(self):
        """Actually fetches the data from discord and stores it locally.

        The download is skipped when a non-empty local copy already exists and
        ``config.re_download`` is not set. Errors raised while fetching are
        logged rather than propagated; whatever was collected up to that point
        (possibly nothing) is still written to the download file.
        """

        import discord
        from discord.ext import commands

        self._create_full_tmp_dir_path()
        download_path = self._tmp_download_file()
        if (
            not self.config.re_download
            and download_path.is_file()
            and os.path.getsize(download_path)
        ):
            if self.config.verbose:
                logger.debug(f"File exists: {download_path}, skipping download")
            return

        if self.config.verbose:
            # Do not interpolate `self` here: the generated dataclass repr
            # contains the bot token, which must not end up in the logs.
            logger.debug(f"fetching discord channel {self.channel} - PID: {os.getpid()}")

        messages: List[discord.Message] = []

        intents = discord.Intents.default()
        intents.message_content = True
        bot = commands.Bot(command_prefix=">", intents=intents)

        @bot.event
        async def on_ready():
            # The bot must always be closed, otherwise bot.run() below never returns.
            try:
                after_date = None
                if self.days:
                    # Use a timezone-aware UTC datetime: discord.py interprets
                    # naive datetimes as local time.
                    after_date = dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=self.days)

                channel = bot.get_channel(int(self.channel))
                if channel is None:
                    logger.error(
                        f"Error fetching messages: channel {self.channel} was not found "
                        "or is not visible to the bot",
                    )
                    return
                # NOTE(review): history() is called with its default limit - confirm
                # whether fetching the channel's complete history is intended.
                async for msg in channel.history(after=after_date):  # type: ignore
                    messages.append(msg)
            except Exception as e:
                logger.error(f"Error fetching messages: {e}")
            finally:
                await bot.close()

        bot.run(self.token)

        # Messages may contain arbitrary unicode (e.g. emoji), so do not rely
        # on the platform default encoding.
        with open(download_path, "w", encoding="utf-8") as f:
            for m in messages:
                f.write(m.content + "\n")

    def write_result(self):
        """Write the structured json result for this doc. result must be json serializable."""
        output_filename = self._output_filename()
        output_filename.parent.mkdir(parents=True, exist_ok=True)
        # ensure_ascii=False emits raw non-ASCII characters, so the file
        # encoding has to be explicit.
        with open(output_filename, "w", encoding="utf-8") as output_f:
            output_f.write(json.dumps(self.isd_elems_no_filename, ensure_ascii=False, indent=2))
        logger.info(f"Wrote {output_filename}")

    @property
    def filename(self):
        """The filename of the file created from a discord channel"""
        return self._tmp_download_file()

    def cleanup_file(self):
        """Removes the local copy of the file after successful processing."""
        if not self.config.preserve_downloads:
            if self.config.verbose:
                logger.info(f"cleaning up channel {self.channel}")
            os.unlink(self._tmp_download_file())
|
||||
|
||||
|
||||
class DiscordConnector(BaseConnector):
    """Objects of this class support fetching document(s) from Discord channels."""

    def __init__(self, config: SimpleDiscordConfig):
        self.config = config
        # Downloaded files (and emptied directories) are removed unless the
        # user asked to preserve downloads.
        self.cleanup_files = not config.preserve_downloads

    def cleanup(self, cur_dir=None):
        """Remove lingering empty sub-dirs of the download dir, but leave remaining
        files (and their paths) intact as that indicates they were not processed.

        Args:
            cur_dir: directory to clean; defaults to ``config.download_dir``.
        """
        if not self.cleanup_files:
            return

        if cur_dir is None:
            cur_dir = self.config.download_dir

        # Build full paths instead of changing the working directory, so that
        # nested or relative download dirs resolve correctly and the process
        # cwd is left untouched.
        for entry in os.listdir(cur_dir):
            entry_path = os.path.join(cur_dir, entry)
            # don't traverse symlinks, not that there ever should be any
            if os.path.isdir(entry_path) and not os.path.islink(entry_path):
                self.cleanup(entry_path)

        if not os.listdir(cur_dir):
            os.rmdir(cur_dir)

    def initialize(self):
        """Create the download directory, including parents, if it does not exist yet."""
        os.makedirs(self.config.download_dir, exist_ok=True)

    def get_ingest_docs(self):
        """Return one ``DiscordIngestDoc`` per configured channel."""
        return [
            DiscordIngestDoc(
                self.config,
                channel,
                self.config.days,
                self.config.token,
            )
            for channel in self.config.channels
        ]
|
@ -351,6 +351,22 @@ class MainProcess:
|
||||
help="End date/time in formats YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or "
|
||||
"YYYY-MM-DD+HH:MM:SS or YYYY-MM-DDTHH:MM:SStz",
|
||||
)
|
||||
@click.option(
|
||||
"--discord-channels",
|
||||
default=None,
|
||||
help="A comma separated list of discord channel ids to ingest from.",
|
||||
)
|
||||
@click.option(
|
||||
"--discord-token",
|
||||
default=None,
|
||||
help="Bot token used to access Discord API, must have "
|
||||
"READ_MESSAGE_HISTORY scope for the bot user",
|
||||
)
|
||||
@click.option(
|
||||
"--discord-period",
|
||||
default=None,
|
||||
help="Number of days to go back in the history of discord channels, must be an number",
|
||||
)
|
||||
@click.option(
|
||||
"--download-dir",
|
||||
help="Where files are downloaded to, defaults to `$HOME/.cache/unstructured/ingest/<SHA256>`.",
|
||||
@ -414,6 +430,9 @@ def main(
|
||||
slack_token,
|
||||
start_date,
|
||||
end_date,
|
||||
discord_channels,
|
||||
discord_token,
|
||||
discord_period,
|
||||
download_dir,
|
||||
preserve_downloads,
|
||||
structured_output_dir,
|
||||
@ -702,6 +721,23 @@ def main(
|
||||
verbose=verbose,
|
||||
),
|
||||
)
|
||||
elif discord_channels:
|
||||
from unstructured.ingest.connector.discord import (
|
||||
DiscordConnector,
|
||||
SimpleDiscordConfig,
|
||||
)
|
||||
|
||||
doc_connector = DiscordConnector( # type: ignore
|
||||
config=SimpleDiscordConfig(
|
||||
channels=SimpleDiscordConfig.parse_channels(discord_channels),
|
||||
days=discord_period,
|
||||
token=discord_token,
|
||||
download_dir=download_dir,
|
||||
output_dir=structured_output_dir,
|
||||
preserve_downloads=preserve_downloads,
|
||||
verbose=verbose,
|
||||
),
|
||||
)
|
||||
elif wikipedia_page_title:
|
||||
doc_connector = WikipediaConnector( # type: ignore
|
||||
config=SimpleWikipediaConfig(
|
||||
|
Loading…
x
Reference in New Issue
Block a user