diff --git a/deepdoc/parser/markdown_parser.py b/deepdoc/parser/markdown_parser.py
index 193e5b960..1db8557a6 100644
--- a/deepdoc/parser/markdown_parser.py
+++ b/deepdoc/parser/markdown_parser.py
@@ -17,39 +17,55 @@
import re
+from markdown import markdown
+
class RAGFlowMarkdownParser:
def __init__(self, chunk_token_num=128):
self.chunk_token_num = int(chunk_token_num)
- def extract_tables_and_remainder(self, markdown_text):
+ def extract_tables_and_remainder(self, markdown_text, separate_tables=True):
tables = []
- remainder = markdown_text
+ working_text = markdown_text
+
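+        # Walk every regex match in working_text: collect the raw table, then
+        # either drop it (separate_tables=True) or splice in an HTML-rendered
+        # copy so the table stays inline with its surrounding context.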
+ def replace_tables_with_rendered_html(pattern, table_list, render=True):
+ new_text = ""
+ last_end = 0
+ for match in pattern.finditer(working_text):
+ raw_table = match.group()
+ table_list.append(raw_table)
+ if separate_tables:
+ # Skip this match (i.e., remove it)
+ new_text += working_text[last_end:match.start()] + "\n\n"
+ else:
+ # Replace with rendered HTML
+ html_table = markdown(raw_table, extensions=['markdown.extensions.tables']) if render else raw_table
+ new_text += working_text[last_end:match.start()] + html_table + "\n\n"
+ last_end = match.end()
+ new_text += working_text[last_end:]
+ return new_text
+
if "|" in markdown_text: # for optimize performance
# Standard Markdown table
border_table_pattern = re.compile(
r'''
- (?:\n|^)
- (?:\|.*?\|.*?\|.*?\n)
- (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
+ (?:\n|^)
+ (?:\|.*?\|.*?\|.*?\n)
+ (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
(?:\|.*?\|.*?\|.*?\n)+
''', re.VERBOSE)
- border_tables = border_table_pattern.findall(markdown_text)
- tables.extend(border_tables)
- remainder = border_table_pattern.sub('', remainder)
+ working_text = replace_tables_with_rendered_html(border_table_pattern, tables)
# Borderless Markdown table
no_border_table_pattern = re.compile(
r'''
- (?:\n|^)
+ (?:\n|^)
(?:\S.*?\|.*?\n)
(?:(?:\s*[:-]+[-| :]*\s*).*?\n)
(?:\S.*?\|.*?\n)+
''', re.VERBOSE)
- no_border_tables = no_border_table_pattern.findall(remainder)
- tables.extend(no_border_tables)
- remainder = no_border_table_pattern.sub('', remainder)
+ working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables)
- if "
" in remainder.lower(): # for optimize performance
+ if "" in working_text.lower(): # for optimize performance
#HTML table extraction - handle possible html/body wrapper tags
html_table_pattern = re.compile(
r'''
@@ -70,8 +86,21 @@ class RAGFlowMarkdownParser:
''',
re.VERBOSE | re.DOTALL | re.IGNORECASE
)
- html_tables = html_table_pattern.findall(remainder)
- tables.extend(html_tables)
- remainder = html_table_pattern.sub('', remainder)
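+            # HTML tables need no markdown rendering; mirror the logic above
+            # and either drop the table or keep it verbatim in place.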
+ def replace_html_tables():
+ nonlocal working_text
+ new_text = ""
+ last_end = 0
+ for match in html_table_pattern.finditer(working_text):
+ raw_table = match.group()
+ tables.append(raw_table)
+ if separate_tables:
+ new_text += working_text[last_end:match.start()] + "\n\n"
+ else:
+ new_text += working_text[last_end:match.start()] + raw_table + "\n\n"
+ last_end = match.end()
+ new_text += working_text[last_end:]
+ working_text = new_text
- return remainder, tables
+ replace_html_tables()
+
+ return working_text, tables
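
A minimal usage sketch of the new flag (the sample text below is illustrative,
not part of the diff): the default `separate_tables=True` preserves the old
behavior of returning tables separately from the remainder, while
`separate_tables=False` leaves each table inline as rendered HTML.

```python
from deepdoc.parser.markdown_parser import RAGFlowMarkdownParser

parser = RAGFlowMarkdownParser()
text = "# Report\n\n| a | b |\n|---|---|\n| 1 | 2 |\n"

# Default: the table is stripped out and returned on the side.
remainder, tables = parser.extract_tables_and_remainder(text)

# New behavior: the table stays in place as <table>...</table> HTML.
inline, tables = parser.extract_tables_and_remainder(text, separate_tables=False)
```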
diff --git a/rag/app/naive.py b/rag/app/naive.py
index 1fd661b09..db1acc0eb 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -22,7 +22,7 @@ from timeit import default_timer as timer
from docx import Document
from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
-from markdown import markdown
+from markdown import markdown
from PIL import Image
from tika import parser
@@ -76,15 +76,15 @@ class Docx(DocxParser):
"""Get the hierarchical title structure before the table"""
import re
from docx.text.paragraph import Paragraph
-
+
titles = []
blocks = []
-
+
# Get document name from filename parameter
doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename)
if not doc_name:
doc_name = "Untitled Document"
-
+
# Collect all document blocks while maintaining document order
try:
# Iterate through all paragraphs and tables in document order
@@ -97,7 +97,7 @@ class Docx(DocxParser):
except Exception as e:
logging.error(f"Error collecting blocks: {e}")
return ""
-
+
# Find the target table position
target_table_pos = -1
table_count = 0
@@ -107,20 +107,20 @@ class Docx(DocxParser):
target_table_pos = pos
break
table_count += 1
-
+
if target_table_pos == -1:
return "" # Target table not found
-
+
# Find the nearest heading paragraph in reverse order
nearest_title = None
for i in range(len(blocks)-1, -1, -1):
block_type, pos, block = blocks[i]
if pos >= target_table_pos: # Skip blocks after the table
continue
-
+
if block_type != 'p':
continue
-
+
if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
try:
level_match = re.search(r"(\d+)", block.style.name)
@@ -133,12 +133,12 @@ class Docx(DocxParser):
break
except Exception as e:
logging.error(f"Error parsing heading level: {e}")
-
+
if nearest_title:
# Add current title
titles.append(nearest_title)
current_level = nearest_title[0]
-
+
# Find all parent headings, allowing cross-level search
while current_level > 1:
found = False
@@ -146,17 +146,17 @@ class Docx(DocxParser):
block_type, pos, block = blocks[i]
if pos >= target_table_pos: # Skip blocks after the table
continue
-
+
if block_type != 'p':
continue
-
+
if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
try:
level_match = re.search(r"(\d+)", block.style.name)
if level_match:
level = int(level_match.group(1))
# Find any heading with a higher level
- if level < current_level:
+ if level < current_level:
title_text = block.text.strip()
if title_text: # Avoid empty titles
titles.append((level, title_text))
@@ -165,16 +165,16 @@ class Docx(DocxParser):
break
except Exception as e:
logging.error(f"Error parsing parent heading: {e}")
-
+
if not found: # Break if no parent heading is found
break
-
+
# Sort by level (ascending, from highest to lowest)
titles.sort(key=lambda x: x[0])
# Organize titles (from highest to lowest)
hierarchy = [doc_name] + [t[1] for t in titles]
return " > ".join(hierarchy)
-
+
return ""
def __call__(self, filename, binary=None, from_page=0, to_page=100000):
@@ -298,13 +298,13 @@ class Markdown(MarkdownParser):
text = sections[0]
else:
return []
-
+
from bs4 import BeautifulSoup
html_content = markdown(text)
soup = BeautifulSoup(html_content, 'html.parser')
html_images = [img.get('src') for img in soup.find_all('img') if img.get('src')]
return html_images
-
+
def get_pictures(self, text):
"""Download and open all images from markdown text."""
import requests
@@ -320,17 +320,17 @@ class Markdown(MarkdownParser):
except Exception as e:
logging.error(f"Failed to download/open image from {url}: {e}")
continue
-
+
return images if images else None
- def __call__(self, filename, binary=None):
+ def __call__(self, filename, binary=None, separate_tables=True):
if binary:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
else:
with open(filename, "r") as f:
txt = f.read()
- remainder, tables = self.extract_tables_and_remainder(f'{txt}\n')
+ remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
sections = []
tbls = []
for sec in remainder.split("\n"):
@@ -465,8 +465,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
- sections, tables = markdown_parser(filename, binary)
-
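+        # separate_tables=False keeps tables inline in the returned sections,
+        # so the chunker sees them in reading order rather than detached.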
+ sections, tables = markdown_parser(filename, binary, separate_tables=False)
+
# Process images for each section
section_images = []
for section_text, _ in sections:
@@ -477,7 +477,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
section_images.append(combined_image)
else:
section_images.append(None)
-
+
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
@@ -524,7 +524,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
"delimiter", "\n!?。;!?"))
if kwargs.get("section_only", False):
return chunks
-
+
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
else:
chunks = naive_merge(
@@ -535,7 +535,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
return chunks
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
-
+
logging.info("naive_merge({}): {}".format(filename, timer() - st))
return res
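
The call site above hard-codes `separate_tables=False` for markdown files; a
hedged sketch of driving the parser directly (the file name is hypothetical):

```python
from rag.app.naive import Markdown

md = Markdown(128)  # chunk_token_num

# Sections keep tables inline with the prose; `tbls` still carries the
# extracted tables for tokenize_table().
sections, tbls = md("doc.md", separate_tables=False)
```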