mirror of
				https://github.com/infiniflow/ragflow.git
				synced 2025-10-25 06:48:54 +00:00 
			
		
		
		
	Fix: Embedding error when docx contains unsupported images (#1720)
### What problem does this PR solve? Fix the failure to generate embeddings when a docx document contains unsupported images. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --------- Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
		
							parent
							
								
									5e19423d82
								
							
						
					
					
						commit
						a973b9e01f
					
				| @ -23,7 +23,7 @@ from rag.utils import num_tokens_from_string | ||||
| from PIL import Image | ||||
| from functools import reduce | ||||
| from markdown import markdown | ||||
| 
 | ||||
| from docx.image.exceptions import UnrecognizedImageError | ||||
| 
 | ||||
| class Docx(DocxParser): | ||||
|     def __init__(self): | ||||
| @ -36,9 +36,16 @@ class Docx(DocxParser): | ||||
|         img = img[0] | ||||
|         embed = img.xpath('.//a:blip/@r:embed')[0] | ||||
|         related_part = document.part.related_parts[embed] | ||||
|         image = related_part.image | ||||
|         image = Image.open(BytesIO(image.blob)).convert('RGB') | ||||
|         return image | ||||
|         try: | ||||
|             image_blob = related_part.image.blob | ||||
|         except UnrecognizedImageError: | ||||
|             print("Unrecognized image format. Skipping image.") | ||||
|             return None | ||||
|         try: | ||||
|             image = Image.open(BytesIO(image_blob)).convert('RGB') | ||||
|             return image | ||||
|         except Exception as e: | ||||
|             return None | ||||
| 
 | ||||
|     def __clean(self, line): | ||||
|         line = re.sub(r"\u3000", " ", line).strip() | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Yuhao Tsui
						Yuhao Tsui