mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 01:54:25 +00:00 
			
		
		
		
	 28214a6cc3
			
		
	
	
		28214a6cc3
		
			
		
	
	
	
	
		
			
			* Pull out s3 code as subcommand * Pull out dropbox code as subcommand * Pull out azure code as subcommand * Pull out fsspec code as subcommand * Pull out github code as subcommand * Pull out gitlab code as subcommand * Pull out reddit code as subcommand * Pull out slack code as subcommand * Pull out discord code as subcommand * Pull out wikipedia code as subcommand * Pull out gdrive code as subcommand * Pull out biomed code as subcommand * rename parameters * Pull out onedrive code as subcommand * Pull out outlook code as subcommand * Pull out local code as subcommand * Pull out elasticsearch code as subcommand * Pull out confluence code as subcommand * Drop previous main file * update changelog * Add back in mp.Pool * Fix mypy issues with click * Make sure all tests run with verbose flag * refactor approach to dynamically add common options to each subcommand, scrub logging of options for sensitive data * Pull out some more shared options * Support running code via python as well as cli * update ingest readme and move it to the ingest folder * update usage in connector docs * move local command arg in test * Separate out cli code from logic running unstructured * Make some cli fields required rather than optional * rename process -> processor * Improve logger to avoid duplicate handlers --------- Co-authored-by: Ryan Nikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
		
			
				
	
	
		
			31 lines
		
	
	
		
			1.4 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			31 lines
		
	
	
		
			1.4 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
#!/usr/bin/env bash

# Processes all the documents in all spaces within a Confluence domain, using
# the `unstructured` library.
#
# Structured outputs are stored in confluence-ingest-output.

# Move to the directory three levels above this script; the relative path to
# ./unstructured/ingest/main.py and PYTHONPATH=. below are resolved from there.
SCRIPT_DIR=$(dirname "$(realpath "$0")")
cd "$SCRIPT_DIR"/../../.. || exit 1

# Authentication:
#   Obtain your authentication variables and save/source them from another
#   file, for security reasons, e.g.:
#     source "./../../secrets/confluence.txt"
#   When CONFLUENCE_USER_EMAIL / CONFLUENCE_API_TOKEN are set in the
#   environment they are passed as --user-email / --api-token; when they are
#   unset or empty, the placeholder values below are passed instead.

# Other arguments that you can use:
# --max-num-of-spaces 10
#     --> The maximum number of spaces to be ingested. Set as 10 in the example.
# --list-of-spaces testteamsp1,testteamsp2
#     --> A comma separated list of space ids for the spaces to be ingested.
#     --> Avoid using --list-of-spaces and --max-num-of-spaces at the same time.
# --max-num-of-docs-from-each-space 250
#     --> The maximum number of documents to be ingested from each space. Set as 250 in the example.
PYTHONPATH=. ./unstructured/ingest/main.py \
        confluence \
        --metadata-exclude filename,file_directory,metadata.data_source.date_processed \
        --url https://unstructured-ingest-test.atlassian.net \
        --user-email "${CONFLUENCE_USER_EMAIL:-12345678@unstructured.io}" \
        --api-token "${CONFLUENCE_API_TOKEN:-ABCDE1234ABDE1234ABCDE1234}" \
        --structured-output-dir confluence-ingest-output \
        --num-processes 2