| 
									
										
										
										
											2023-12-11 16:29:41 -05:00
										 |  |  | import os | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from unstructured.ingest.connector.local import SimpleLocalConfig | 
					
						
							| 
									
										
										
										
											2024-01-16 12:56:29 -08:00
										 |  |  | from unstructured.ingest.connector.mongodb import SimpleMongoDBConfig | 
					
						
							| 
									
										
										
										
											2023-12-11 16:29:41 -05:00
										 |  |  | from unstructured.ingest.interfaces import ( | 
					
						
							|  |  |  |     ChunkingConfig, | 
					
						
							|  |  |  |     EmbeddingConfig, | 
					
						
							|  |  |  |     PartitionConfig, | 
					
						
							|  |  |  |     ProcessorConfig, | 
					
						
							|  |  |  |     ReadConfig, | 
					
						
							| 
									
										
										
										
											2024-01-16 12:56:29 -08:00
										 |  |  |     WriteConfig, | 
					
						
							| 
									
										
										
										
											2023-12-11 16:29:41 -05:00
										 |  |  | ) | 
					
						
							|  |  |  | from unstructured.ingest.runner import LocalRunner | 
					
						
							|  |  |  | from unstructured.ingest.runner.writers.base_writer import Writer | 
					
						
							|  |  |  | from unstructured.ingest.runner.writers.mongodb import ( | 
					
						
							|  |  |  |     MongodbWriter, | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def get_writer() -> Writer: | 
					
						
							|  |  |  |     return MongodbWriter( | 
					
						
							| 
									
										
										
										
											2024-01-16 12:56:29 -08:00
										 |  |  |         connector_config=SimpleMongoDBConfig( | 
					
						
							| 
									
										
										
										
											2023-12-11 16:29:41 -05:00
										 |  |  |             uri=os.getenv("MONGODB_URI"), | 
					
						
							|  |  |  |             database=os.getenv("MONGODB_DATABASE_NAME"), | 
					
						
							|  |  |  |             collection=os.getenv("DESTINATION_MONGO_COLLECTION"), | 
					
						
							|  |  |  |         ), | 
					
						
							| 
									
										
										
										
											2024-01-16 12:56:29 -08:00
										 |  |  |         write_config=WriteConfig(), | 
					
						
							| 
									
										
										
										
											2023-12-11 16:29:41 -05:00
										 |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | if __name__ == "__main__": | 
					
						
							|  |  |  |     writer = get_writer() | 
					
						
							|  |  |  |     runner = LocalRunner( | 
					
						
							|  |  |  |         processor_config=ProcessorConfig( | 
					
						
							|  |  |  |             verbose=True, | 
					
						
							|  |  |  |             output_dir="local-output-to-mongodb", | 
					
						
							|  |  |  |             num_processes=2, | 
					
						
							|  |  |  |         ), | 
					
						
							|  |  |  |         connector_config=SimpleLocalConfig( | 
					
						
							|  |  |  |             input_path="example-docs/book-war-and-peace-1225p.txt", | 
					
						
							|  |  |  |         ), | 
					
						
							|  |  |  |         read_config=ReadConfig(), | 
					
						
							|  |  |  |         partition_config=PartitionConfig(), | 
					
						
							|  |  |  |         chunking_config=ChunkingConfig(chunk_elements=True), | 
					
						
							|  |  |  |         embedding_config=EmbeddingConfig( | 
					
						
							|  |  |  |             provider="langchain-huggingface", | 
					
						
							|  |  |  |         ), | 
					
						
							|  |  |  |         writer=writer, | 
					
						
							|  |  |  |         writer_kwargs={}, | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     runner.run() |