2024-06-27 14:38:35 +08:00
|
|
|
# -*- coding: utf-8 -*-
|
2025-01-21 20:52:28 +08:00
|
|
|
#
|
|
|
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
|
|
#
|
2024-06-27 14:38:35 +08:00
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
#
|
2025-01-21 20:52:28 +08:00
|
|
|
|
2024-06-27 14:38:35 +08:00
|
|
|
import re
|
|
|
|
|
|
|
|
class RAGFlowMarkdownParser:
    """Separate table fragments from the rest of a Markdown document.

    Three table flavours are recognised: pipe-bordered Markdown tables,
    borderless Markdown tables, and HTML ``<table>`` elements (optionally
    wrapped in ``<body>`` / ``<html>`` tags).
    """

    # Bordered Markdown table: header row, separator row, then one or more
    # data rows.  The last data row may be terminated by end-of-text instead
    # of a newline so that a table closing the document is captured whole.
    _BORDER_TABLE_PATTERN = re.compile(
        r'''
        (?:\n|^)
        (?:\|.*?\|.*?\|.*?\n)
        (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
        (?:\|.*?\|.*?\|.*?(?:\n|$))+
        ''', re.VERBOSE)

    # Borderless Markdown table: same layout, but rows do not start with "|".
    _NO_BORDER_TABLE_PATTERN = re.compile(
        r'''
        (?:\n|^)
        (?:\S.*?\|.*?\n)
        (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
        (?:\S.*?\|.*?(?:\n|$))+
        ''', re.VERBOSE)

    # HTML table, possibly wrapped in <body> and/or <html> tags.
    _HTML_TABLE_PATTERN = re.compile(
        r'''
        (?:\n|^)
        \s*
        (?:
            # case1: <html><body><table>...</table></body></html>
            (?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
            |
            # case2: <body><table>...</table></body>
            (?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
            |
            # case3: only<table>...</table>
            (?:<table[^>]*>.*?</table>)
        )
        \s*
        (?=\n|$)
        ''',
        re.VERBOSE | re.DOTALL | re.IGNORECASE,
    )

    def __init__(self, chunk_token_num=128):
        """Store the chunk size.

        Args:
            chunk_token_num: Chunk size setting; coerced with ``int()``.
        """
        self.chunk_token_num = int(chunk_token_num)

    def extract_tables_and_remainder(self, markdown_text):
        """Pull every table out of *markdown_text*.

        Args:
            markdown_text: The Markdown source as a string.

        Returns:
            A ``(remainder, tables)`` tuple.  ``tables`` lists the matched
            table strings exactly as they appeared (bordered tables first,
            then borderless, then HTML); ``remainder`` is the input with
            those matches removed.
        """
        tables = []
        remainder = markdown_text

        # Cheap substring test: skip both Markdown regexes when the text
        # contains no pipe character at all.
        if "|" in markdown_text:
            # Standard (bordered) Markdown tables.
            tables.extend(self._BORDER_TABLE_PATTERN.findall(remainder))
            remainder = self._BORDER_TABLE_PATTERN.sub('', remainder)

            # Borderless tables are searched only in what is left, so a
            # bordered table is never reported twice.
            tables.extend(self._NO_BORDER_TABLE_PATTERN.findall(remainder))
            remainder = self._NO_BORDER_TABLE_PATTERN.sub('', remainder)

        # Test for the "<table" prefix rather than "<table>" so that opening
        # tags carrying attributes (which the regex accepts) are not skipped.
        if "<table" in remainder.lower():
            tables.extend(self._HTML_TABLE_PATTERN.findall(remainder))
            remainder = self._HTML_TABLE_PATTERN.sub('', remainder)

        return remainder, tables
|