mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-07 17:12:48 +00:00

Given the tendency for shell scripts to easily enter into a few levels of indentation and long line lengths, update the default to 2 spaces.
31 lines
1.3 KiB
Bash
Executable File
31 lines
1.3 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
|
|
# Processes all the documents in all spaces within a confluence domain, using the `unstructured` library.
|
|
|
|
# Structured outputs are stored in confluence-ingest-output
|
|
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
|
cd "$SCRIPT_DIR"/../../.. || exit 1
|
|
|
|
# Obtain your authentication variables, save/source them from another file, for security reasons:
|
|
# source "./../../secrets/confluence.txt"
|
|
# ...
|
|
# --user-email "$CONFLUENCE_USER_EMAIL"
|
|
# --api-token "$CONFLUENCE_API_TOKEN"
|
|
|
|
# Other arguments that you can use:
|
|
# --max-num-of-spaces 10
|
|
# --> The maximum number of spaces to be ingested. Set as 10 in the example.
|
|
# --list-of-spaces testteamsp1,testteamsp2
|
|
# --> A comma separated list of space ids for the spaces to be ingested.
|
|
# --> Avoid using --confluence-list-of-spaces and --confluence-max-num-of-spaces at the same time.
|
|
# --max-num-of-docs-from-each-space 250 \
|
|
# --> The maximum number of documents to be ingested from each space. Set as 250 in the example.
|
|
PYTHONPATH=. ./unstructured/ingest/main.py \
|
|
confluence \
|
|
--metadata-exclude filename,file_directory,metadata.data_source.date_processed \
|
|
--url https://unstructured-ingest-test.atlassian.net \
|
|
--user-email 12345678@unstructured.io \
|
|
--api-token ABCDE1234ABDE1234ABCDE1234 \
|
|
--output-dir confluence-ingest-output \
|
|
--num-processes 2
|