mirror of
				https://github.com/datahub-project/datahub.git
				synced 2025-10-26 00:14:53 +00:00 
			
		
		
		
	feat(ingest): add non-random sampling for mongo (#2778)
This commit is contained in:
		
							parent
							
								
									c05459b446
								
							
						
					
					
						commit
						09bbcea0a8
					
				| @ -574,6 +574,7 @@ Extracts: | |||||||
| - List of collections in each database and infers schemas for each collection | - List of collections in each database and infers schemas for each collection | ||||||
| 
 | 
 | ||||||
| By default, schema inference samples 1,000 documents from each collection. Setting `schemaSamplingSize: null` will scan the entire collection. | By default, schema inference samples 1,000 documents from each collection. Setting `schemaSamplingSize: null` will scan the entire collection. | ||||||
|  | Moreover, setting `useRandomSampling: False` will sample the first documents found without random selection, which may be faster for large collections. | ||||||
| 
 | 
 | ||||||
| Note that `schemaSamplingSize` has no effect if `enableSchemaInference: False` is set. | Note that `schemaSamplingSize` has no effect if `enableSchemaInference: False` is set. | ||||||
| 
 | 
 | ||||||
| @ -593,6 +594,7 @@ source: | |||||||
|     collection_pattern: {} |     collection_pattern: {} | ||||||
|     enableSchemaInference: True |     enableSchemaInference: True | ||||||
|     schemaSamplingSize: 1000 |     schemaSamplingSize: 1000 | ||||||
|  |     useRandomSampling: True # whether to randomly sample docs for schema or just use the first ones, True by default | ||||||
|     # database_pattern/collection_pattern are similar to schema_pattern/table_pattern from above |     # database_pattern/collection_pattern are similar to schema_pattern/table_pattern from above | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -54,6 +54,7 @@ class MongoDBConfig(ConfigModel): | |||||||
|     options: dict = {} |     options: dict = {} | ||||||
|     enableSchemaInference: bool = True |     enableSchemaInference: bool = True | ||||||
|     schemaSamplingSize: Optional[PositiveInt] = 1000 |     schemaSamplingSize: Optional[PositiveInt] = 1000 | ||||||
|  |     useRandomSampling: bool = True | ||||||
|     env: str = DEFAULT_ENV |     env: str = DEFAULT_ENV | ||||||
| 
 | 
 | ||||||
|     database_pattern: AllowDenyPattern = AllowDenyPattern.allow_all() |     database_pattern: AllowDenyPattern = AllowDenyPattern.allow_all() | ||||||
| @ -282,6 +283,7 @@ def construct_schema( | |||||||
| def construct_schema_pymongo( | def construct_schema_pymongo( | ||||||
|     collection: pymongo.collection.Collection, |     collection: pymongo.collection.Collection, | ||||||
|     delimiter: str, |     delimiter: str, | ||||||
|  |     use_random_sampling: bool, | ||||||
|     sample_size: Optional[int] = None, |     sample_size: Optional[int] = None, | ||||||
| ) -> Dict[Tuple[str, ...], SchemaDescription]: | ) -> Dict[Tuple[str, ...], SchemaDescription]: | ||||||
|     """ |     """ | ||||||
| @ -302,10 +304,15 @@ def construct_schema_pymongo( | |||||||
|     """ |     """ | ||||||
| 
 | 
 | ||||||
|     if sample_size: |     if sample_size: | ||||||
|  |         if use_random_sampling: | ||||||
|             # get sample documents in collection |             # get sample documents in collection | ||||||
|             documents = collection.aggregate( |             documents = collection.aggregate( | ||||||
|                 [{"$sample": {"size": sample_size}}], allowDiskUse=True |                 [{"$sample": {"size": sample_size}}], allowDiskUse=True | ||||||
|             ) |             ) | ||||||
|  |         else: | ||||||
|  |             documents = collection.aggregate( | ||||||
|  |                 [{"$limit": sample_size}], allowDiskUse=True | ||||||
|  |             ) | ||||||
|     else: |     else: | ||||||
|         # if sample_size is not provided, just take all items in the collection |         # if sample_size is not provided, just take all items in the collection | ||||||
|         documents = collection.find({}) |         documents = collection.find({}) | ||||||
| @ -434,6 +441,7 @@ class MongoDBSource(Source): | |||||||
|                     collection_schema = construct_schema_pymongo( |                     collection_schema = construct_schema_pymongo( | ||||||
|                         database[collection_name], |                         database[collection_name], | ||||||
|                         delimiter=".", |                         delimiter=".", | ||||||
|  |                         use_random_sampling=self.config.useRandomSampling, | ||||||
|                         sample_size=self.config.schemaSamplingSize, |                         sample_size=self.config.schemaSamplingSize, | ||||||
|                     ) |                     ) | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Kevin Hu
						Kevin Hu