mirror of
				https://github.com/infiniflow/ragflow.git
				synced 2025-10-26 07:19:23 +00:00 
			
		
		
		
	Feat: Markdown add image (#7124)
### What problem does this PR solve? https://github.com/infiniflow/ragflow/issues/6984 1. The Markdown parser now supports extracting pictures. 2. For Native, images are now handled when processing Markdown. 3. Improve merging. ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
		
							parent
							
								
									fef44a71c5
								
							
						
					
					
						commit
						1662c7eda3
					
				| @ -22,7 +22,7 @@ from timeit import default_timer as timer | |||||||
| 
 | 
 | ||||||
| from docx import Document | from docx import Document | ||||||
| from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError | from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError | ||||||
| from markdown import markdown | import markdown  | ||||||
| from PIL import Image | from PIL import Image | ||||||
| from tika import parser | from tika import parser | ||||||
| 
 | 
 | ||||||
| @ -31,7 +31,7 @@ from api.db.services.llm_service import LLMBundle | |||||||
| from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser | from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser | ||||||
| from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wraper | from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wraper | ||||||
| from deepdoc.parser.pdf_parser import PlainParser, VisionParser | from deepdoc.parser.pdf_parser import PlainParser, VisionParser | ||||||
| from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_docx, tokenize_table | from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table | ||||||
| from rag.utils import num_tokens_from_string | from rag.utils import num_tokens_from_string | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @ -287,6 +287,41 @@ class Pdf(PdfParser): | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Markdown(MarkdownParser): | class Markdown(MarkdownParser): | ||||||
|  |     def get_picture_urls(self, sections): | ||||||
|  |         if not sections: | ||||||
|  |             return [] | ||||||
|  |         if isinstance(sections, type("")): | ||||||
|  |             text = sections | ||||||
|  |         elif isinstance(sections[0], type("")): | ||||||
|  |             text = sections[0] | ||||||
|  |         else: | ||||||
|  |             return [] | ||||||
|  |          | ||||||
|  |         from bs4 import BeautifulSoup | ||||||
|  |         md = markdown.Markdown() | ||||||
|  |         html_content = md.convert(text) | ||||||
|  |         soup = BeautifulSoup(html_content, 'html.parser') | ||||||
|  |         html_images = [img.get('src') for img in soup.find_all('img') if img.get('src')] | ||||||
|  |         return html_images | ||||||
|  |      | ||||||
|  |     def get_pictures(self, text): | ||||||
|  |         """Download and open all images from markdown text.""" | ||||||
|  |         import requests | ||||||
|  |         image_urls = self.get_picture_urls(text) | ||||||
|  |         images = [] | ||||||
|  |         # Find all image URLs in text | ||||||
|  |         for url in image_urls: | ||||||
|  |             try: | ||||||
|  |                 response = requests.get(url, stream=True, timeout=30) | ||||||
|  |                 if response.status_code == 200 and response.headers['Content-Type'].startswith('image/'): | ||||||
|  |                     img = Image.open(BytesIO(response.content)).convert('RGB') | ||||||
|  |                     images.append(img) | ||||||
|  |             except Exception as e: | ||||||
|  |                 logging.error(f"Failed to download/open image from {url}: {e}") | ||||||
|  |                 continue | ||||||
|  |                      | ||||||
|  |         return images if images else None | ||||||
|  | 
 | ||||||
|     def __call__(self, filename, binary=None): |     def __call__(self, filename, binary=None): | ||||||
|         if binary: |         if binary: | ||||||
|             encoding = find_codec(binary) |             encoding = find_codec(binary) | ||||||
| @ -335,6 +370,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, | |||||||
|     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) |     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | ||||||
|     res = [] |     res = [] | ||||||
|     pdf_parser = None |     pdf_parser = None | ||||||
|  |     section_images = None | ||||||
|     if re.search(r"\.docx$", filename, re.IGNORECASE): |     if re.search(r"\.docx$", filename, re.IGNORECASE): | ||||||
|         callback(0.1, "Start to parse.") |         callback(0.1, "Start to parse.") | ||||||
| 
 | 
 | ||||||
| @ -368,7 +404,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, | |||||||
|         if kwargs.get("section_only", False): |         if kwargs.get("section_only", False): | ||||||
|             return chunks |             return chunks | ||||||
| 
 | 
 | ||||||
|         res.extend(tokenize_chunks_docx(chunks, doc, is_english, images)) |         res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images)) | ||||||
|         logging.info("naive_merge({}): {}".format(filename, timer() - st)) |         logging.info("naive_merge({}): {}".format(filename, timer() - st)) | ||||||
|         return res |         return res | ||||||
| 
 | 
 | ||||||
| @ -432,7 +468,20 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, | |||||||
| 
 | 
 | ||||||
|     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): |     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): | ||||||
|         callback(0.1, "Start to parse.") |         callback(0.1, "Start to parse.") | ||||||
|         sections, tables = Markdown(int(parser_config.get("chunk_token_num", 128)))(filename, binary) |         markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128))) | ||||||
|  |         sections, tables = markdown_parser(filename, binary) | ||||||
|  |          | ||||||
|  |         # Process images for each section | ||||||
|  |         section_images = [] | ||||||
|  |         for section_text, _ in sections: | ||||||
|  |             images = markdown_parser.get_pictures(section_text) if section_text else None | ||||||
|  |             if images: | ||||||
|  |                 # If multiple images found, combine them using concat_img | ||||||
|  |                 combined_image = reduce(concat_img, images) if len(images) > 1 else images[0] | ||||||
|  |                 section_images.append(combined_image) | ||||||
|  |             else: | ||||||
|  |                 section_images.append(None) | ||||||
|  |                  | ||||||
|         res = tokenize_table(tables, doc, is_english) |         res = tokenize_table(tables, doc, is_english) | ||||||
|         callback(0.8, "Finish parsing.") |         callback(0.8, "Finish parsing.") | ||||||
| 
 | 
 | ||||||
| @ -467,6 +516,21 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, | |||||||
|             "file type not supported yet(pdf, xlsx, doc, docx, txt supported)") |             "file type not supported yet(pdf, xlsx, doc, docx, txt supported)") | ||||||
| 
 | 
 | ||||||
|     st = timer() |     st = timer() | ||||||
|  |     if section_images: | ||||||
|  |         # if all images are None, set section_images to None | ||||||
|  |         if all(image is None for image in section_images): | ||||||
|  |             section_images = None | ||||||
|  | 
 | ||||||
|  |     if section_images: | ||||||
|  |         chunks, images = naive_merge_with_images(sections, section_images, | ||||||
|  |                                         int(parser_config.get( | ||||||
|  |                                             "chunk_token_num", 128)), parser_config.get( | ||||||
|  |                                             "delimiter", "\n!?。;!?")) | ||||||
|  |         if kwargs.get("section_only", False): | ||||||
|  |             return chunks | ||||||
|  |          | ||||||
|  |         res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images)) | ||||||
|  |     else: | ||||||
|         chunks = naive_merge( |         chunks = naive_merge( | ||||||
|             sections, int(parser_config.get( |             sections, int(parser_config.get( | ||||||
|                 "chunk_token_num", 128)), parser_config.get( |                 "chunk_token_num", 128)), parser_config.get( | ||||||
| @ -475,6 +539,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, | |||||||
|             return chunks |             return chunks | ||||||
| 
 | 
 | ||||||
|         res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser)) |         res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser)) | ||||||
|  |      | ||||||
|     logging.info("naive_merge({}): {}".format(filename, timer() - st)) |     logging.info("naive_merge({}): {}".format(filename, timer() - st)) | ||||||
|     return res |     return res | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -276,8 +276,7 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None): | |||||||
|         res.append(d) |         res.append(d) | ||||||
|     return res |     return res | ||||||
| 
 | 
 | ||||||
| 
 | def tokenize_chunks_with_images(chunks, doc, eng, images): | ||||||
| def tokenize_chunks_docx(chunks, doc, eng, images): |  | ||||||
|     res = [] |     res = [] | ||||||
|     # wrap up as es documents |     # wrap up as es documents | ||||||
|     for ck, image in zip(chunks, images): |     for ck, image in zip(chunks, images): | ||||||
| @ -290,7 +289,6 @@ def tokenize_chunks_docx(chunks, doc, eng, images): | |||||||
|         res.append(d) |         res.append(d) | ||||||
|     return res |     return res | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
| def tokenize_table(tbls, doc, eng, batch_size=10): | def tokenize_table(tbls, doc, eng, batch_size=10): | ||||||
|     res = [] |     res = [] | ||||||
|     # add tables |     # add tables | ||||||
| @ -541,6 +539,45 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"): | |||||||
|     return cks |     return cks | ||||||
|      |      | ||||||
| 
 | 
 | ||||||
def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。;!?"):
    """Merge consecutive text sections, and their images, into chunks.

    Sections are appended to the current chunk until that chunk's running
    token count exceeds ``chunk_token_num``; the next section then starts a
    new chunk. The limit is checked before a section is appended, so a chunk
    may end up longer than ``chunk_token_num``.

    Args:
        texts: Sequence of section texts. If the first element is a tuple,
            every element is replaced by its first item.
        images: Sequence parallel to ``texts`` holding the image of each
            section, or ``None`` for a section without image.
        chunk_token_num: Token count above which the current chunk is closed.
        delimiter: Currently unused; kept for signature compatibility.

    Returns:
        A ``(chunks, chunk_images)`` pair of equally long lists, where
        ``chunk_images[i]`` is the combined image of ``chunks[i]`` or
        ``None``. Returns ``([], [])`` when ``texts`` is empty or when
        ``texts`` and ``images`` differ in length.
    """
    if not texts or len(texts) != len(images):
        return [], []
    # Ensure texts are plain strings: for tuples keep only the first item.
    if isinstance(texts[0], tuple):
        texts = [t[0] for t in texts]

    cks = [""]
    result_images = [None]
    tk_nums = [0]

    for t, image in zip(texts, images):
        tnum = num_tokens_from_string(t)
        if tk_nums[-1] > chunk_token_num:
            # The current chunk is already over budget: open a new one.
            cks.append(t)
            result_images.append(image)
            tk_nums.append(tnum)
            continue

        cks[-1] += t
        tk_nums[-1] += tnum
        if result_images[-1] is None:
            result_images[-1] = image
        elif image is not None:
            # Only combine real images, so the result does not depend on how
            # concat_img treats a missing operand.
            result_images[-1] = concat_img(result_images[-1], image)

    return cks, result_images
|  | 
 | ||||||
| def docx_question_level(p, bull=-1): | def docx_question_level(p, bull=-1): | ||||||
|     txt = re.sub(r"\u3000", " ", p.text).strip() |     txt = re.sub(r"\u3000", " ", p.text).strip() | ||||||
|     if p.style.name.startswith('Heading'): |     if p.style.name.startswith('Heading'): | ||||||
|  | |||||||
| @ -103,7 +103,7 @@ export interface IChunk { | |||||||
|   content_with_weight: string; |   content_with_weight: string; | ||||||
|   doc_id: string; |   doc_id: string; | ||||||
|   doc_name: string; |   doc_name: string; | ||||||
|   img_id: string; |   image_id: string; | ||||||
|   important_kwd?: string[]; |   important_kwd?: string[]; | ||||||
|   question_kwd?: string[]; // keywords
 |   question_kwd?: string[]; // keywords
 | ||||||
|   tag_kwd?: string[]; |   tag_kwd?: string[]; | ||||||
|  | |||||||
| @ -64,14 +64,14 @@ const ChunkCard = ({ | |||||||
|     > |     > | ||||||
|       <Flex gap={'middle'} justify={'space-between'}> |       <Flex gap={'middle'} justify={'space-between'}> | ||||||
|         <Checkbox onChange={handleCheck} checked={checked}></Checkbox> |         <Checkbox onChange={handleCheck} checked={checked}></Checkbox> | ||||||
|         {item.img_id && ( |         {item.image_id && ( | ||||||
|           <Popover |           <Popover | ||||||
|             placement="right" |             placement="right" | ||||||
|             content={ |             content={ | ||||||
|               <Image id={item.img_id} className={styles.imagePreview}></Image> |               <Image id={item.image_id} className={styles.imagePreview}></Image> | ||||||
|             } |             } | ||||||
|           > |           > | ||||||
|             <Image id={item.img_id} className={styles.image}></Image> |             <Image id={item.image_id} className={styles.image}></Image> | ||||||
|           </Popover> |           </Popover> | ||||||
|         )} |         )} | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Stephen Hu
						Stephen Hu