#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import re
from functools import reduce
from io import BytesIO
from timeit import default_timer as timer

from docx import Document
from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
from markdown import markdown
from PIL import Image
from tika import parser

from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wrapper
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table


class Docx(DocxParser):
    def __init__(self):
        pass

    def get_picture(self, document, paragraph):
        img = paragraph._element.xpath('.//pic:pic')
        if not img:
            return None
        img = img[0]
        embed = img.xpath('.//a:blip/@r:embed')
        if not embed:
            return None
        embed = embed[0]
        try:
            related_part = document.part.related_parts[embed]
            image_blob = related_part.image.blob
        except UnrecognizedImageError:
            logging.info("Unrecognized image format. Skipping image.")
            return None
        except UnexpectedEndOfFileError:
            logging.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.")
            return None
        except InvalidImageStreamError:
            logging.info("The recognized image stream appears to be corrupted. Skipping image.")
            return None
        except UnicodeDecodeError:
            logging.info("The recognized image stream appears to be corrupted. Skipping image.")
            return None
        except Exception:
            logging.info("The recognized image stream appears to be corrupted. Skipping image.")
            return None
        try:
            image = Image.open(BytesIO(image_blob)).convert('RGB')
            return image
        except Exception:
            return None

    def __clean(self, line):
        line = re.sub(r"\u3000", " ", line).strip()
        return line

    def __get_nearest_title(self, table_index, filename):
        """Get the hierarchical title structure before the table."""
        from docx.text.paragraph import Paragraph

        titles = []
        blocks = []

        # Get the document name from the filename parameter
        doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename)
        if not doc_name:
            doc_name = "Untitled Document"

        # Collect all document blocks while maintaining document order
        try:
            # Iterate through all paragraphs and tables in document order
            for i, block in enumerate(self.doc._element.body):
                if block.tag.endswith('p'):  # Paragraph
                    p = Paragraph(block, self.doc)
                    blocks.append(('p', i, p))
                elif block.tag.endswith('tbl'):  # Table
                    blocks.append(('t', i, None))  # Table object will be retrieved later
        except Exception as e:
            logging.error(f"Error collecting blocks: {e}")
            return ""

        # Find the target table position
        target_table_pos = -1
        table_count = 0
        for i, (block_type, pos, _) in enumerate(blocks):
            if block_type == 't':
                if table_count == table_index:
                    target_table_pos = pos
                    break
                table_count += 1

        if target_table_pos == -1:
            return ""  # Target table not found

        # Find the nearest heading paragraph in reverse order
        nearest_title = None
        for i in range(len(blocks) - 1, -1, -1):
            block_type, pos, block = blocks[i]
            if pos >= target_table_pos:  # Skip blocks after the table
                continue

            if block_type != 'p':
                continue

            if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
                try:
                    level_match = re.search(r"(\d+)", block.style.name)
                    if level_match:
                        level = int(level_match.group(1))
                        if level <= 7:  # Support up to 7 heading levels
                            title_text = block.text.strip()
                            if title_text:  # Avoid empty titles
                                nearest_title = (level, title_text)
                                break
                except Exception as e:
                    logging.error(f"Error parsing heading level: {e}")

        if nearest_title:
            # Add the current title
            titles.append(nearest_title)
            current_level = nearest_title[0]

            # Find all parent headings, allowing cross-level search
            while current_level > 1:
                found = False
                for i in range(len(blocks) - 1, -1, -1):
                    block_type, pos, block = blocks[i]
                    if pos >= target_table_pos:  # Skip blocks after the table
                        continue

                    if block_type != 'p':
                        continue

                    if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
                        try:
                            level_match = re.search(r"(\d+)", block.style.name)
                            if level_match:
                                level = int(level_match.group(1))
                                # Accept any heading at a higher (smaller-numbered) level
                                if level < current_level:
                                    title_text = block.text.strip()
                                    if title_text:  # Avoid empty titles
                                        titles.append((level, title_text))
                                        current_level = level
                                        found = True
                                        break
                        except Exception as e:
                            logging.error(f"Error parsing parent heading: {e}")

                if not found:  # Stop if no parent heading is found
                    break

            # Sort by level (ascending, from highest to lowest)
            titles.sort(key=lambda x: x[0])
            # Organize titles (from highest to lowest)
            hierarchy = [doc_name] + [t[1] for t in titles]
            return " > ".join(hierarchy)

        return ""

    def __call__(self, filename, binary=None, from_page=0, to_page=100000):
        self.doc = Document(
            filename) if not binary else Document(BytesIO(binary))
        pn = 0
        lines = []
        last_image = None
        for p in self.doc.paragraphs:
            if pn > to_page:
                break
            if from_page <= pn < to_page:
                if p.text.strip():
                    if p.style and p.style.name == 'Caption':
                        former_image = None
                        if lines and lines[-1][1] and lines[-1][2] != 'Caption':
                            former_image = lines[-1][1].pop()
                        elif last_image:
                            former_image = last_image
                            last_image = None
                        lines.append((self.__clean(p.text), [former_image], p.style.name))
                    else:
                        current_image = self.get_picture(self.doc, p)
                        image_list = [current_image]
                        if last_image:
                            image_list.insert(0, last_image)
                            last_image = None
                        lines.append((self.__clean(p.text), image_list, p.style.name if p.style else ""))
                else:
                    if current_image := self.get_picture(self.doc, p):
                        if lines:
                            lines[-1][1].append(current_image)
                        else:
                            last_image = current_image
            for run in p.runs:
                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1
                    continue
                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
                    pn += 1

        new_line = [(line[0], reduce(concat_img, line[1]) if line[1] else None) for line in lines]

        tbls = []
        for i, tb in enumerate(self.doc.tables):
            title = self.__get_nearest_title(i, filename)
            html = "<table>"
            if title:
                html += f"<caption>Table Location: {title}</caption>"
            for r in tb.rows:
                html += "<tr>"
                i = 0
Fix "no `tc` element at grid_offset" (#9375)
### What problem does this PR solve?
fix "no `tc` element at grid_offset", just log warning and ignore.
stacktrace:
```
Traceback (most recent call last):
File "/ragflow/rag/svr/task_executor.py", line 620, in handle_task
await do_handle_task(task)
File "/ragflow/rag/svr/task_executor.py", line 553, in do_handle_task
chunks = await build_chunks(task, progress_callback)
File "/ragflow/rag/svr/task_executor.py", line 257, in build_chunks
cks = await trio.to_thread.run_sync(lambda: chunker.chunk(task["name"], binary=binary, from_page=task["from_page"],
File "/ragflow/.venv/lib/python3.10/site-packages/trio/_threads.py", line 447, in to_thread_run_sync
return msg_from_thread.unwrap()
File "/ragflow/.venv/lib/python3.10/site-packages/outcome/_impl.py", line 213, in unwrap
raise captured_error
File "/ragflow/.venv/lib/python3.10/site-packages/trio/_threads.py", line 373, in do_release_then_return_result
return result.unwrap()
File "/ragflow/.venv/lib/python3.10/site-packages/outcome/_impl.py", line 213, in unwrap
raise captured_error
File "/ragflow/.venv/lib/python3.10/site-packages/trio/_threads.py", line 392, in worker_fn
ret = context.run(sync_fn, *args)
File "/ragflow/rag/svr/task_executor.py", line 257, in <lambda>
cks = await trio.to_thread.run_sync(lambda: chunker.chunk(task["name"], binary=binary, from_page=task["from_page"],
File "/ragflow/rag/app/naive.py", line 384, in chunk
sections, tables = Docx()(filename, binary)
File "/ragflow/rag/app/naive.py", line 230, in __call__
while i < len(r.cells):
File "/ragflow/.venv/lib/python3.10/site-packages/docx/table.py", line 438, in cells
return tuple(_iter_row_cells())
File "/ragflow/.venv/lib/python3.10/site-packages/docx/table.py", line 436, in _iter_row_cells
yield from iter_tc_cells(tc)
File "/ragflow/.venv/lib/python3.10/site-packages/docx/table.py", line 424, in iter_tc_cells
yield from iter_tc_cells(tc._tc_above) # pyright: ignore[reportPrivateUsage]
File "/ragflow/.venv/lib/python3.10/site-packages/docx/oxml/table.py", line 741, in _tc_above
return self._tr_above.tc_at_grid_offset(self.grid_offset)
File "/ragflow/.venv/lib/python3.10/site-packages/docx/oxml/table.py", line 98, in tc_at_grid_offset
raise ValueError(f"no `tc` element at grid_offset={grid_offset}")
ValueError: no `tc` element at grid_offset=10
```
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
2025-08-11 17:13:10 +08:00
                try:
                    while i < len(r.cells):
                        span = 1
                        c = r.cells[i]
                        for j in range(i + 1, len(r.cells)):
                            if c.text == r.cells[j].text:
                                span += 1
                                i = j
                            else:
                                break
                        i += 1
                        html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
                except Exception as e:
                    logging.warning(f"Error parsing table, ignore: {e}")
                html += "</tr>"
            html += "</table>"
            tbls.append(((None, html), ""))
        return new_line, tbls
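
# A minimal usage sketch (file name is hypothetical) showing the shapes Docx()
# returns: `lines` is a list of (text, merged_image_or_None) pairs and `tbls`
# is a list of ((None, html), "") entries, the layout tokenize_table() consumes
# in chunk() below.
#
#   with open("manual.docx", "rb") as f:
#       lines, tbls = Docx()("manual.docx", binary=f.read())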


class Pdf(PdfParser):
    def __init__(self):
        super().__init__()

    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None, separate_tables_figures=False):
        start = timer()
        first_start = start
        callback(msg="OCR started")
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page,
            callback
        )
        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
        logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))

        start = timer()
        self._layouts_rec(zoomin)
        callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))

        start = timer()
        self._table_transformer_job(zoomin)
        callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))

        start = timer()
        self._text_merge()
        callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))

        if separate_tables_figures:
            tbls, figures = self._extract_table_figure(True, zoomin, True, True, True)
            self._concat_downward()
            logging.info("layouts cost: {}s".format(timer() - first_start))
            return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls, figures
        else:
            tbls = self._extract_table_figure(True, zoomin, True, True)
            self._naive_vertical_merge()
            self._concat_downward()
            # self._filter_forpages()
            logging.info("layouts cost: {}s".format(timer() - first_start))
            return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls
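
# Usage sketch (arguments are hypothetical): chunk() below drives this parser as
# `Pdf()(filename, binary, from_page=..., to_page=..., callback=cb)`, passing
# separate_tables_figures=True when a vision model is available so the figures
# can be routed through VisionFigureParser separately.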


class Markdown(MarkdownParser):
    def get_picture_urls(self, sections):
        if not sections:
            return []
        if isinstance(sections, str):
            text = sections
        elif isinstance(sections[0], str):
            text = sections[0]
        else:
            return []

        from bs4 import BeautifulSoup
        html_content = markdown(text)
        soup = BeautifulSoup(html_content, 'html.parser')
        html_images = [img.get('src') for img in soup.find_all('img') if img.get('src')]
        return html_images

    def get_pictures(self, text):
        """Download and open all images referenced in the markdown text."""
        import requests
        image_urls = self.get_picture_urls(text)
        images = []
        for url in image_urls:
            try:
                # Check whether the URL is a local file or a remote URL
                if url.startswith(('http://', 'https://')):
                    # For remote URLs, download the image
                    response = requests.get(url, stream=True, timeout=30)
                    if response.status_code == 200 and response.headers['Content-Type'].startswith('image/'):
                        img = Image.open(BytesIO(response.content)).convert('RGB')
                        images.append(img)
                else:
                    # For local file paths, open the image directly
                    from pathlib import Path
                    local_path = Path(url)
                    if not local_path.exists():
                        logging.warning(f"Local image file not found: {url}")
                        continue
                    img = Image.open(url).convert('RGB')
                    images.append(img)
            except Exception as e:
                logging.error(f"Failed to download/open image from {url}: {e}")
                continue

        return images if images else None

    def __call__(self, filename, binary=None, separate_tables=True):
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
        else:
            with open(filename, "r") as f:
                txt = f.read()
        remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
        sections = []
        tbls = []
        for sec in remainder.split("\n"):
            if sec.strip().find("#") == 0:
                sections.append((sec, ""))
            elif sections and sections[-1][0].strip().find("#") == 0:
                sec_, _ = sections.pop(-1)
                sections.append((sec_ + "\n" + sec, ""))
            else:
                sections.append((sec, ""))
        for table in tables:
            tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
        return sections, tbls
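
# A small worked example of the heading grouping above (input is hypothetical):
# a line starting with "#" opens a section, and following lines keep merging
# into it while the accumulated section still starts with "#", so
# "# Intro\nHello\n# Next\nBye" yields
# [("# Intro\nHello", ""), ("# Next\nBye", "")].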


def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return a |_SerializedRelationships| instance loaded with the
    relationships contained in *rels_item_xml*. Returns an empty
    collection if *rels_item_xml* is |None|.
    """
    srels = _SerializedRelationships()
    if rels_item_xml is not None:
        rels_elm = parse_xml(rels_item_xml)
        for rel_elm in rels_elm.Relationship_lst:
            if rel_elm.target_ref in ('../NULL', 'NULL'):
                continue
            srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
    return srels
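
# This override is installed in chunk() below as a monkey-patch:
#
#   _SerializedRelationships.load_from_xml = load_from_xml_v2
#
# so relationships that point at the non-existent "word/NULL" part are skipped
# instead of raising "There is no item named 'word/NULL' in the archive" when
# a malformed .docx is opened.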


def chunk(filename, binary=None, from_page=0, to_page=100000,
          lang="Chinese", callback=None, **kwargs):
    """
    Supported file formats are docx, pdf, csv/xlsx, txt (including code files), markdown, html, json and doc.
    This method applies a naive way to chunk files: successive text is sliced
    into pieces using the 'delimiter', then those pieces are merged into chunks
    whose token count does not exceed 'Max token number'.
    """

    is_english = lang.lower() == "english"  # is_english(cks)
    parser_config = kwargs.get(
        "parser_config", {
            "chunk_token_num": 512, "delimiter": "\n!?。；！？", "layout_recognize": "DeepDOC"})
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    res = []
    pdf_parser = None
    section_images = None
    if re.search(r"\.docx$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")

        try:
            vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
            callback(0.15, "Visual model detected. Attempting to enhance figure extraction...")
        except Exception:
            vision_model = None

        # Fix "There is no item named 'word/NULL' in the archive", referring to
        # https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
        _SerializedRelationships.load_from_xml = load_from_xml_v2
        sections, tables = Docx()(filename, binary)

        if vision_model:
            figures_data = vision_figure_parser_figure_data_wrapper(sections)
            try:
                docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
                boosted_figures = docx_vision_parser(callback=callback)
                tables.extend(boosted_figures)
            except Exception as e:
                callback(0.6, f"Visual model error: {e}. Skipping figure parsing enhancement.")

        res = tokenize_table(tables, doc, is_english)
        callback(0.8, "Finish parsing.")

        st = timer()
        chunks, images = naive_merge_docx(
            sections, int(parser_config.get(
                "chunk_token_num", 128)), parser_config.get(
                "delimiter", "\n!?。；！？"))

        if kwargs.get("section_only", False):
            return chunks

        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
        logging.info("naive_merge({}): {}".format(filename, timer() - st))
        return res
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
        if isinstance(layout_recognizer, bool):
            layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
        callback(0.1, "Start to parse.")

        if layout_recognizer == "DeepDOC":
            pdf_parser = Pdf()

            try:
                vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
                callback(0.15, "Visual model detected. Attempting to enhance figure extraction...")
            except Exception:
                vision_model = None

            if vision_model:
                sections, tables, figures = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback, separate_tables_figures=True)
                callback(0.5, "Basic parsing complete. Proceeding with figure enhancement...")
                try:
                    pdf_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures, **kwargs)
                    boosted_figures = pdf_vision_parser(callback=callback)
                    tables.extend(boosted_figures)
                except Exception as e:
                    callback(0.6, f"Visual model error: {e}. Skipping figure parsing enhancement.")
                    tables.extend(figures)
            else:
                sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)

            res = tokenize_table(tables, doc, is_english)
            callback(0.8, "Finish parsing.")
        else:
            if layout_recognizer == "Plain Text":
                pdf_parser = PlainParser()
            else:
                vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=layout_recognizer, lang=lang)
                pdf_parser = VisionParser(vision_model=vision_model, **kwargs)

            sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
                                          callback=callback)
            res = tokenize_table(tables, doc, is_english)
            callback(0.8, "Finish parsing.")

    elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        excel_parser = ExcelParser()
        if parser_config.get("html4excel"):
            sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
        else:
            sections = [(_, "") for _ in excel_parser(binary) if _]
            parser_config["chunk_token_num"] = 12800

    elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        sections = TxtParser()(filename, binary,
                               parser_config.get("chunk_token_num", 128),
                               parser_config.get("delimiter", "\n!?;。；！？"))
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
        sections, tables = markdown_parser(filename, binary, separate_tables=False)

        # Process images for each section
        section_images = []
        for section_text, _ in sections:
            images = markdown_parser.get_pictures(section_text) if section_text else None
            if images:
                # If multiple images are found, combine them using concat_img
                combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
                section_images.append(combined_image)
            else:
                section_images.append(None)

        res = tokenize_table(tables, doc, is_english)
        callback(0.8, "Finish parsing.")
    elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        sections = HtmlParser()(filename, binary)
        sections = [(_, "") for _ in sections if _]
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.(json|jsonl|ldjson)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        chunk_token_num = int(parser_config.get("chunk_token_num", 128))
        sections = JsonParser(chunk_token_num)(binary)
        sections = [(_, "") for _ in sections if _]
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.doc$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        binary = BytesIO(binary)
        doc_parsed = parser.from_buffer(binary)
        if doc_parsed.get('content', None) is not None:
            sections = doc_parsed['content'].split('\n')
            sections = [(_, "") for _ in sections if _]
            callback(0.8, "Finish parsing.")
        else:
            callback(0.8, f"tika.parser got empty content from {filename}.")
            logging.warning(f"tika.parser got empty content from {filename}.")
            return []

    else:
        raise NotImplementedError(
            "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")

    st = timer()
    if section_images:
        # If all images are None, set section_images to None
        if all(image is None for image in section_images):
            section_images = None

    if section_images:
        chunks, images = naive_merge_with_images(sections, section_images,
                                                 int(parser_config.get(
                                                     "chunk_token_num", 128)), parser_config.get(
                                                     "delimiter", "\n!?。；！？"))
        if kwargs.get("section_only", False):
            return chunks

        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
    else:
        chunks = naive_merge(
            sections, int(parser_config.get(
                "chunk_token_num", 128)), parser_config.get(
                "delimiter", "\n!?。；！？"))
        if kwargs.get("section_only", False):
            return chunks

        res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))

    logging.info("naive_merge({}): {}".format(filename, timer() - st))
    return res


if __name__ == "__main__":
    import sys

    def dummy(prog=None, msg=""):
        pass

    chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
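
    # Quick smoke test (the sample path is hypothetical):
    #
    #   python rag/app/naive.py /path/to/sample.docx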