mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 01:54:25 +00:00 
			
		
		
		
	build: downgrade nltk version (#3527)
				
					
				
			This PR rolls back `nltk` to `3.8.1`. It was bumped to `3.8.2` in https://github.com/Unstructured-IO/unstructured/pull/3512, but `3.8.2` is no longer available on PyPI due to some issues (https://github.com/nltk/nltk/issues/3301).
This commit is contained in:
		
							parent
							
								
									9b778e270d
								
							
						
					
					
						commit
						d0211cc41f
					
				
							
								
								
									
										11
									
								
								CHANGELOG.md
									
									
									
									
									
								
							
							
						
						
									
										11
									
								
								CHANGELOG.md
									
									
									
									
									
								
							| @ -1,3 +1,14 @@ | |||||||
|  | ## 0.15.5-dev0 | ||||||
|  | 
 | ||||||
|  | ### Enhancements | ||||||
|  | 
 | ||||||
|  | ### Features | ||||||
|  | 
 | ||||||
|  | ### Fixes | ||||||
|  | 
 | ||||||
|  | * **Downgrade NLTK dependency version for compatibility**. Due to the unavailability of `nltk==3.8.2` on PyPI, the NLTK dependency has been downgraded to `<3.8.2`. This change ensures continued functionality and compatibility. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| ## 0.15.4 | ## 0.15.4 | ||||||
| 
 | 
 | ||||||
| ### Enhancements | ### Enhancements | ||||||
|  | |||||||
| @ -26,8 +26,7 @@ RUN python3.10 -m pip install pip==${PIP_VERSION} && \ | |||||||
|   dnf -y groupremove "Development Tools" && \ |   dnf -y groupremove "Development Tools" && \ | ||||||
|   dnf clean all |   dnf clean all | ||||||
| 
 | 
 | ||||||
| RUN python3.10 -c "import nltk; nltk.download('punkt')" && \ | RUN python3.10 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" | ||||||
|   python3.10 -c "import nltk; nltk.download('averaged_perceptron_tagger')" |  | ||||||
| 
 | 
 | ||||||
| FROM deps as code | FROM deps as code | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -69,7 +69,7 @@ mypy-extensions==1.0.0 | |||||||
|     #   unstructured-client |     #   unstructured-client | ||||||
| nest-asyncio==1.6.0 | nest-asyncio==1.6.0 | ||||||
|     # via unstructured-client |     # via unstructured-client | ||||||
| nltk==3.8.2 | nltk==3.8.1 | ||||||
|     # via -r ./base.in |     # via -r ./base.in | ||||||
| numpy==1.26.4 | numpy==1.26.4 | ||||||
|     # via -r ./base.in |     # via -r ./base.in | ||||||
|  | |||||||
| @ -1 +1 @@ | |||||||
| __version__ = "0.15.4"  # pragma: no cover | __version__ = "0.15.5-dev0"  # pragma: no cover | ||||||
|  | |||||||
| @ -16,9 +16,9 @@ from nltk import word_tokenize as _word_tokenize | |||||||
| 
 | 
 | ||||||
| CACHE_MAX_SIZE: Final[int] = 128 | CACHE_MAX_SIZE: Final[int] = 128 | ||||||
| 
 | 
 | ||||||
| NLTK_DATA_FILENAME = "nltk_data_3.8.2.tar.gz" | NLTK_DATA_FILENAME = "nltk_data.tgz" | ||||||
| NLTK_DATA_URL = f"https://utic-public-cf.s3.amazonaws.com/{NLTK_DATA_FILENAME}" | NLTK_DATA_URL = f"https://utic-public-cf.s3.amazonaws.com/{NLTK_DATA_FILENAME}" | ||||||
| NLTK_DATA_SHA256 = "ba2ca627c8fb1f1458c15d5a476377a5b664c19deeb99fd088ebf83e140c1663" | NLTK_DATA_SHA256 = "126faf671cd255a062c436b3d0f2d311dfeefcd92ffa43f7c3ab677309404d61" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # NOTE(robinson) - mimic default dir logic from NLTK | # NOTE(robinson) - mimic default dir logic from NLTK | ||||||
| @ -114,10 +114,10 @@ def _download_nltk_packages_if_not_present(): | |||||||
| 
 | 
 | ||||||
|     tagger_available = check_for_nltk_package( |     tagger_available = check_for_nltk_package( | ||||||
|         package_category="taggers", |         package_category="taggers", | ||||||
|         package_name="averaged_perceptron_tagger_eng", |         package_name="averaged_perceptron_tagger", | ||||||
|     ) |     ) | ||||||
|     tokenizer_available = check_for_nltk_package( |     tokenizer_available = check_for_nltk_package( | ||||||
|         package_category="tokenizers", package_name="punkt_tab" |         package_category="tokenizers", package_name="punkt" | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|     if not (tokenizer_available and tagger_available): |     if not (tokenizer_available and tagger_available): | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Christine Straub
						Christine Straub