mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-11-26 06:57:27 +00:00
Fix: fixed context loss caused by separating markdown tables from original text (#8844)
### What problem does this PR solve? Fix context loss caused by separating markdown tables from original text. #6871, #8804. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
parent
c08ed28f09
commit
51a8604dcb
@ -17,13 +17,33 @@
|
||||
|
||||
import re
|
||||
|
||||
from markdown import markdown
|
||||
|
||||
class RAGFlowMarkdownParser:
|
||||
def __init__(self, chunk_token_num=128):
|
||||
self.chunk_token_num = int(chunk_token_num)
|
||||
|
||||
def extract_tables_and_remainder(self, markdown_text):
|
||||
def extract_tables_and_remainder(self, markdown_text, separate_tables=True):
|
||||
tables = []
|
||||
remainder = markdown_text
|
||||
working_text = markdown_text
|
||||
|
||||
def replace_tables_with_rendered_html(pattern, table_list, render=True):
|
||||
new_text = ""
|
||||
last_end = 0
|
||||
for match in pattern.finditer(working_text):
|
||||
raw_table = match.group()
|
||||
table_list.append(raw_table)
|
||||
if separate_tables:
|
||||
# Skip this match (i.e., remove it)
|
||||
new_text += working_text[last_end:match.start()] + "\n\n"
|
||||
else:
|
||||
# Replace with rendered HTML
|
||||
html_table = markdown(raw_table, extensions=['markdown.extensions.tables']) if render else raw_table
|
||||
new_text += working_text[last_end:match.start()] + html_table + "\n\n"
|
||||
last_end = match.end()
|
||||
new_text += working_text[last_end:]
|
||||
return new_text
|
||||
|
||||
if "|" in markdown_text: # for optimize performance
|
||||
# Standard Markdown table
|
||||
border_table_pattern = re.compile(
|
||||
@ -33,9 +53,7 @@ class RAGFlowMarkdownParser:
|
||||
(?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
|
||||
(?:\|.*?\|.*?\|.*?\n)+
|
||||
''', re.VERBOSE)
|
||||
border_tables = border_table_pattern.findall(markdown_text)
|
||||
tables.extend(border_tables)
|
||||
remainder = border_table_pattern.sub('', remainder)
|
||||
working_text = replace_tables_with_rendered_html(border_table_pattern, tables)
|
||||
|
||||
# Borderless Markdown table
|
||||
no_border_table_pattern = re.compile(
|
||||
@ -45,11 +63,9 @@ class RAGFlowMarkdownParser:
|
||||
(?:(?:\s*[:-]+[-| :]*\s*).*?\n)
|
||||
(?:\S.*?\|.*?\n)+
|
||||
''', re.VERBOSE)
|
||||
no_border_tables = no_border_table_pattern.findall(remainder)
|
||||
tables.extend(no_border_tables)
|
||||
remainder = no_border_table_pattern.sub('', remainder)
|
||||
working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables)
|
||||
|
||||
if "<table>" in remainder.lower(): # for optimize performance
|
||||
if "<table>" in working_text.lower(): # for optimize performance
|
||||
#HTML table extraction - handle possible html/body wrapper tags
|
||||
html_table_pattern = re.compile(
|
||||
r'''
|
||||
@ -70,8 +86,21 @@ class RAGFlowMarkdownParser:
|
||||
''',
|
||||
re.VERBOSE | re.DOTALL | re.IGNORECASE
|
||||
)
|
||||
html_tables = html_table_pattern.findall(remainder)
|
||||
tables.extend(html_tables)
|
||||
remainder = html_table_pattern.sub('', remainder)
|
||||
def replace_html_tables():
|
||||
nonlocal working_text
|
||||
new_text = ""
|
||||
last_end = 0
|
||||
for match in html_table_pattern.finditer(working_text):
|
||||
raw_table = match.group()
|
||||
tables.append(raw_table)
|
||||
if separate_tables:
|
||||
new_text += working_text[last_end:match.start()] + "\n\n"
|
||||
else:
|
||||
new_text += working_text[last_end:match.start()] + raw_table + "\n\n"
|
||||
last_end = match.end()
|
||||
new_text += working_text[last_end:]
|
||||
working_text = new_text
|
||||
|
||||
return remainder, tables
|
||||
replace_html_tables()
|
||||
|
||||
return working_text, tables
|
||||
|
||||
@ -323,14 +323,14 @@ class Markdown(MarkdownParser):
|
||||
|
||||
return images if images else None
|
||||
|
||||
def __call__(self, filename, binary=None):
|
||||
def __call__(self, filename, binary=None, separate_tables=True):
|
||||
if binary:
|
||||
encoding = find_codec(binary)
|
||||
txt = binary.decode(encoding, errors="ignore")
|
||||
else:
|
||||
with open(filename, "r") as f:
|
||||
txt = f.read()
|
||||
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n')
|
||||
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
|
||||
sections = []
|
||||
tbls = []
|
||||
for sec in remainder.split("\n"):
|
||||
@ -465,7 +465,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
|
||||
sections, tables = markdown_parser(filename, binary)
|
||||
sections, tables = markdown_parser(filename, binary, separate_tables=False)
|
||||
|
||||
# Process images for each section
|
||||
section_images = []
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user