#!/usr/bin/env bash

# Processes the Wikipedia page titled "Open Source Software"
# through Unstructured's library in 2 processes.

# Structured outputs are stored in wikipedia-ingest-output/

# Resolve the directory that contains this script. Abort if that fails:
# an empty SCRIPT_DIR would turn the cd below into `cd /../../..`, which
# succeeds and silently runs the ingest from the filesystem root.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) || exit 1

# Run from three levels above this script's directory; the relative
# ./unstructured/ingest/main.py path and PYTHONPATH=. below depend on it.
cd "$SCRIPT_DIR"/../../.. || exit 1

PYTHONPATH=. ./unstructured/ingest/main.py \
  wikipedia \
  --page-title "Open Source Software" \
  --output-dir wikipedia-ingest-output \
  --num-processes 2 \
  --verbose

# Alternatively, you can call it using:
# unstructured-ingest wikipedia --page-title "..." ...