#!/usr/bin/env bash

# Processes all the documents in all spaces within a confluence domain, using the `unstructured` library.

# Structured outputs are stored in confluence-ingest-output

# Move three directories up from this script's location so that the relative
# paths used by the command below (PYTHONPATH=. and ./unstructured/ingest/main.py)
# resolve. Abort if the directory change fails rather than running from the wrong place.
SCRIPT_DIR=$(dirname "$(realpath "$0")")
cd "$SCRIPT_DIR"/../../.. || exit 1

# Obtain your authentication variables, save/source them from another file, for security reasons:
# source "./../../secrets/confluence.txt"
# ...
# --user-email "$CONFLUENCE_USER_EMAIL"
# --api-token "$CONFLUENCE_API_TOKEN"

# Other arguments that you can use:
# --max-num-of-spaces 10
#     --> The maximum number of spaces to be ingested. Set as 10 in the example.
# --list-of-spaces testteamsp1,testteamsp2
#     --> A comma separated list of space ids for the spaces to be ingested.
#     --> Avoid using --list-of-spaces and --max-num-of-spaces at the same time.
# --max-num-of-docs-from-each-space 250
#     --> The maximum number of documents to be ingested from each space. Set as 250 in the example.

# NOTE(review): the --user-email and --api-token values below look like
# placeholders; swap in the sourced variables shown above before real use.
PYTHONPATH=. ./unstructured/ingest/main.py \
        confluence \
        --metadata-exclude filename,file_directory,metadata.data_source.date_processed \
        --url https://unstructured-ingest-test.atlassian.net \
        --user-email 12345678@unstructured.io \
        --api-token ABCDE1234ABDE1234ABCDE1234 \
        --output-dir confluence-ingest-output \
        --num-processes 2