mirror of
				https://github.com/infiniflow/ragflow.git
				synced 2025-10-30 09:20:01 +00:00 
			
		
		
		
	 a95c1d45f0
			
		
	
	
		a95c1d45f0
		
			
		
	
	
	
	
		
			
			### What problem does this PR solve? Support extracting tables from Markdown files in the general parser. ### Type of change - [x] New Feature (non-breaking change which adds functionality)
		
			
				
	
	
		
			45 lines
		
	
	
		
			1.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			45 lines
		
	
	
		
			1.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # -*- coding: utf-8 -*-
 | |
| #  Licensed under the Apache License, Version 2.0 (the "License");
 | |
| #  you may not use this file except in compliance with the License.
 | |
| #  You may obtain a copy of the License at
 | |
| #
 | |
| #      http://www.apache.org/licenses/LICENSE-2.0
 | |
| #
 | |
| #  Unless required by applicable law or agreed to in writing, software
 | |
| #  distributed under the License is distributed on an "AS IS" BASIS,
 | |
| #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| #  See the License for the specific language governing permissions and
 | |
| #  limitations under the License.
 | |
| #
 | |
| import re
 | |
| 
 | |
class RAGFlowMarkdownParser:
    """Separate Markdown tables from the surrounding text.

    Two table flavours are recognised, each needing a header row, a
    delimiter row (dashes/colons) and at least one body row:

    * pipe-delimited tables whose rows start with ``|``;
    * "borderless" tables whose rows start with a non-space character and
      contain at least one ``|``.
    """

    # Each row may end with a newline OR with the end of the text, so a table
    # that closes the document without a trailing newline keeps its last row.
    _BORDERED_TABLE_RE = re.compile(
        r'''
        (?:\n|^)
        (?:\|.*?\|.*?\|.*?(?:\n|\Z))
        (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?(?:\n|\Z))
        (?:\|.*?\|.*?\|.*?(?:\n|\Z))+
        ''', re.VERBOSE)

    _BORDERLESS_TABLE_RE = re.compile(
        r'''
        (?:\n|^)
        (?:\S.*?\|.*?(?:\n|\Z))
        (?:(?:\s*[:-]+[-| :]*\s*).*?(?:\n|\Z))
        (?:\S.*?\|.*?(?:\n|\Z))+
        ''', re.VERBOSE)

    def __init__(self, chunk_token_num=128):
        """Store the chunk size.

        Args:
            chunk_token_num: Chunk size in tokens; coerced with ``int()``.
                It is only stored here and is not read by
                ``extract_tables_and_remainder``.
        """
        self.chunk_token_num = int(chunk_token_num)

    @staticmethod
    def _cut_tables(pattern, text):
        """Remove every ``pattern`` match from ``text`` in a single pass.

        Returns ``(stripped_text, matches)`` where ``matches`` holds the
        full matched strings in document order.
        """
        found = []

        def _drop(match):
            block = match.group(0)
            found.append(block)
            # The match swallows the newline in front of the table as well as
            # the one ending its last row.  Put one back so the lines on
            # either side of the table are not glued together.
            return "\n" if block.startswith("\n") else ""

        return pattern.sub(_drop, text), found

    def extract_tables_and_remainder(self, markdown_text):
        """Pull all Markdown tables out of ``markdown_text``.

        Pipe-delimited tables are extracted first; borderless tables are
        then searched for in what is left.

        Args:
            markdown_text: The Markdown source as a string.

        Returns:
            A ``(remainder, tables)`` tuple.  ``remainder`` is the text with
            the tables removed.  ``tables`` lists the raw text of each table
            (including the newline that preceded it, if any): pipe-delimited
            tables first, then borderless ones.
        """
        remainder, tables = self._cut_tables(
            self._BORDERED_TABLE_RE, markdown_text)
        remainder, borderless = self._cut_tables(
            self._BORDERLESS_TABLE_RE, remainder)
        tables.extend(borderless)
        return remainder, tables
 |