	Feat: Markdown add image (#7124)
### What problem does this PR solve?

https://github.com/infiniflow/ragflow/issues/6984

1. The Markdown parser now supports extracting pictures.
2. For the Naive chunking method, images are handled when parsing Markdown.
3. Improve merge.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
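Before the diff, a minimal, self-contained illustration of the extraction idea (the function name and sample URL are made up, not part of the PR): render the Markdown to HTML with the `markdown` package, then collect the `<img src>` attributes with BeautifulSoup, which is the same approach the new `Markdown.get_picture_urls` uses below.

```python
# Illustrative sketch of the URL-extraction step; mirrors the approach of the
# new Markdown.get_picture_urls, but the names here are hypothetical.
import markdown
from bs4 import BeautifulSoup

def extract_image_urls(md_text: str) -> list[str]:
    html = markdown.Markdown().convert(md_text)  # Markdown -> HTML
    soup = BeautifulSoup(html, "html.parser")
    return [img.get("src") for img in soup.find_all("img") if img.get("src")]

print(extract_image_urls("Intro\n\n![diagram](https://example.com/diagram.png)"))
# ['https://example.com/diagram.png']
```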
This commit is contained in:
parent fef44a71c5
commit 1662c7eda3
@@ -22,7 +22,7 @@ from timeit import default_timer as timer
 
 from docx import Document
 from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
-from markdown import markdown
+import markdown
 from PIL import Image
 from tika import parser
 
@@ -31,7 +31,7 @@ from api.db.services.llm_service import LLMBundle
 from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser
 from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wraper
 from deepdoc.parser.pdf_parser import PlainParser, VisionParser
-from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_docx, tokenize_table
+from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
 from rag.utils import num_tokens_from_string
 
 
@@ -287,6 +287,41 @@ class Pdf(PdfParser):
 
 
 class Markdown(MarkdownParser):
+    def get_picture_urls(self, sections):
+        if not sections:
+            return []
+        if isinstance(sections, type("")):
+            text = sections
+        elif isinstance(sections[0], type("")):
+            text = sections[0]
+        else:
+            return []
+
+        from bs4 import BeautifulSoup
+        md = markdown.Markdown()
+        html_content = md.convert(text)
+        soup = BeautifulSoup(html_content, 'html.parser')
+        html_images = [img.get('src') for img in soup.find_all('img') if img.get('src')]
+        return html_images
+
+    def get_pictures(self, text):
+        """Download and open all images from markdown text."""
+        import requests
+        image_urls = self.get_picture_urls(text)
+        images = []
+        # Find all image URLs in text
+        for url in image_urls:
+            try:
+                response = requests.get(url, stream=True, timeout=30)
+                if response.status_code == 200 and response.headers['Content-Type'].startswith('image/'):
+                    img = Image.open(BytesIO(response.content)).convert('RGB')
+                    images.append(img)
+            except Exception as e:
+                logging.error(f"Failed to download/open image from {url}: {e}")
+                continue
+
+        return images if images else None
+
     def __call__(self, filename, binary=None):
         if binary:
             encoding = find_codec(binary)
@@ -335,6 +370,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     res = []
     pdf_parser = None
+    section_images = None
     if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
 
@@ -368,7 +404,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         if kwargs.get("section_only", False):
             return chunks
 
-        res.extend(tokenize_chunks_docx(chunks, doc, is_english, images))
+        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
         logging.info("naive_merge({}): {}".format(filename, timer() - st))
         return res
 
@@ -432,7 +468,20 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        sections, tables = Markdown(int(parser_config.get("chunk_token_num", 128)))(filename, binary)
+        markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
+        sections, tables = markdown_parser(filename, binary)
+
+        # Process images for each section
+        section_images = []
+        for section_text, _ in sections:
+            images = markdown_parser.get_pictures(section_text) if section_text else None
+            if images:
+                # If multiple images found, combine them using concat_img
+                combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
+                section_images.append(combined_image)
+            else:
+                section_images.append(None)
+
         res = tokenize_table(tables, doc, is_english)
         callback(0.8, "Finish parsing.")
 
@@ -467,14 +516,30 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
 
     st = timer()
-    chunks = naive_merge(
-        sections, int(parser_config.get(
-            "chunk_token_num", 128)), parser_config.get(
-            "delimiter", "\n!?。;!?"))
-    if kwargs.get("section_only", False):
-        return chunks
+    if section_images:
+        # if all images are None, set section_images to None
+        if all(image is None for image in section_images):
+            section_images = None
 
-    res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
+    if section_images:
+        chunks, images = naive_merge_with_images(sections, section_images,
+                                        int(parser_config.get(
+                                            "chunk_token_num", 128)), parser_config.get(
+                                            "delimiter", "\n!?。;!?"))
+        if kwargs.get("section_only", False):
+            return chunks
+
+        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+    else:
+        chunks = naive_merge(
+            sections, int(parser_config.get(
+                "chunk_token_num", 128)), parser_config.get(
+                "delimiter", "\n!?。;!?"))
+        if kwargs.get("section_only", False):
+            return chunks
+
+        res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
+
     logging.info("naive_merge({}): {}".format(filename, timer() - st))
     return res
 
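Taken together, the chunk() changes above make the Markdown branch behave roughly like the condensed sketch below. This is a paraphrase of the diff, not code from the PR; the import paths are assumptions (the merge helpers come from rag.nlp per the import line above, and the Markdown class is assumed to live in the naive chunker module).

```python
# Condensed paraphrase of the new Markdown branch in chunk(); assumes the
# ragflow modules are importable, and omits callbacks and table handling.
from functools import reduce

from rag.app.naive import Markdown  # module path assumed
from rag.nlp import concat_img, naive_merge, naive_merge_with_images

def markdown_sections_with_images(filename, binary, chunk_token_num=128, delimiter="\n!?。;!?"):
    parser = Markdown(chunk_token_num)
    sections, tables = parser(filename, binary)

    # One (possibly None) picture per section; several pictures are stitched together.
    section_images = []
    for section_text, _ in sections:
        imgs = parser.get_pictures(section_text) if section_text else None
        section_images.append(reduce(concat_img, imgs) if imgs else None)

    if any(img is not None for img in section_images):
        # Merge text and pictures in lockstep so each chunk keeps its images.
        return naive_merge_with_images(sections, section_images, chunk_token_num, delimiter)
    # No pictures found: fall back to the original text-only merge.
    return naive_merge(sections, chunk_token_num, delimiter), None
```

The chunks (and their paired images, when present) are then indexed with tokenize_chunks_with_images or tokenize_chunks, as the diff above shows.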
@@ -276,8 +276,7 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
         res.append(d)
     return res
 
-
-def tokenize_chunks_docx(chunks, doc, eng, images):
+def tokenize_chunks_with_images(chunks, doc, eng, images):
     res = []
     # wrap up as es documents
     for ck, image in zip(chunks, images):
@@ -290,7 +289,6 @@ def tokenize_chunks_docx(chunks, doc, eng, images):
         res.append(d)
     return res
 
-
 def tokenize_table(tbls, doc, eng, batch_size=10):
     res = []
     # add tables
@@ -539,7 +537,46 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
         add_chunk(sec, pos)
 
     return cks
 
+
+def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。;!?"):
+    if not texts or len(texts) != len(images):
+        return [], []
+    # Ensure texts is str, not tuple; if it is a tuple, convert to str (get the first item)
+    if isinstance(texts[0], tuple):
+        texts = [t[0] for t in texts]
+    cks = [""]
+    result_images = [None]
+    tk_nums = [0]
+
+    def add_chunk(t, image, pos=""):
+        nonlocal cks, result_images, tk_nums, delimiter
+        tnum = num_tokens_from_string(t)
+        if not pos:
+            pos = ""
+        if tnum < 8:
+            pos = ""
+        # Ensure that the length of the merged chunk does not exceed chunk_token_num
+        if tk_nums[-1] > chunk_token_num:
+            if t.find(pos) < 0:
+                t += pos
+            cks.append(t)
+            result_images.append(image)
+            tk_nums.append(tnum)
+        else:
+            if cks[-1].find(pos) < 0:
+                t += pos
+            cks[-1] += t
+            if result_images[-1] is None:
+                result_images[-1] = image
+            else:
+                result_images[-1] = concat_img(result_images[-1], image)
+            tk_nums[-1] += tnum
+
+    for text, image in zip(texts, images):
+        add_chunk(text, image)
+
+    return cks, result_images
+
 
 def docx_question_level(p, bull=-1):
     txt = re.sub(r"\u3000", " ", p.text).strip()
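A small usage sketch for the new helper (assuming the rag package is importable; the section texts and the Pillow image below are toy data, not from the PR): texts and pictures are merged in lockstep, so every returned chunk is paired with one, possibly None, image.

```python
# Hypothetical standalone use of naive_merge_with_images with toy data.
from PIL import Image
from rag.nlp import naive_merge_with_images

texts = ["# Intro\nA short opening section.", "A section that carried a figure.", "Closing notes."]
images = [None, Image.new("RGB", (64, 64), "white"), None]

chunks, chunk_images = naive_merge_with_images(texts, images, chunk_token_num=128)
assert len(chunks) == len(chunk_images)  # one (possibly None) image per merged chunk
```

Since all three toy sections fit under the 128-token budget, they should fold into a single chunk that carries the one picture found (adjacent images would be combined via concat_img).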
@@ -103,7 +103,7 @@ export interface IChunk {
   content_with_weight: string;
   doc_id: string;
   doc_name: string;
-  img_id: string;
+  image_id: string;
   important_kwd?: string[];
   question_kwd?: string[]; // keywords
   tag_kwd?: string[];
@@ -64,14 +64,14 @@ const ChunkCard = ({
     >
       <Flex gap={'middle'} justify={'space-between'}>
         <Checkbox onChange={handleCheck} checked={checked}></Checkbox>
-        {item.img_id && (
+        {item.image_id && (
           <Popover
             placement="right"
             content={
-              <Image id={item.img_id} className={styles.imagePreview}></Image>
+              <Image id={item.image_id} className={styles.imagePreview}></Image>
             }
           >
-            <Image id={item.img_id} className={styles.image}></Image>
+            <Image id={item.image_id} className={styles.image}></Image>
           </Popover>
         )}
 
Stephen Hu