mirror of
				https://github.com/infiniflow/ragflow.git
				synced 2025-10-31 09:50:00 +00:00 
			
		
		
		
	 bc701d7b4c
			
		
	
	
		bc701d7b4c
		
			
		
	
	
	
	
		
			
			### What problem does this PR solve? Editing a chunk should update the existing chunk instead of inserting a new one. Close #3679 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
		
			
				
	
	
		
			137 lines
		
	
	
		
			4.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			137 lines
		
	
	
		
			4.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #  Licensed under the Apache License, Version 2.0 (the "License");
 | ||
| #  you may not use this file except in compliance with the License.
 | ||
| #  You may obtain a copy of the License at
 | ||
| #
 | ||
| #      http://www.apache.org/licenses/LICENSE-2.0
 | ||
| #
 | ||
| #  Unless required by applicable law or agreed to in writing, software
 | ||
| #  distributed under the License is distributed on an "AS IS" BASIS,
 | ||
| #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||
| #  See the License for the specific language governing permissions and
 | ||
| #  limitations under the License.
 | ||
| #
 | ||
| 
 | ||
| from docx import Document
 | ||
| import re
 | ||
| import pandas as pd
 | ||
| from collections import Counter
 | ||
| from rag.nlp import rag_tokenizer
 | ||
| from io import BytesIO
 | ||
| 
 | ||
| 
 | ||
class RAGFlowDocxParser:
    """Extract paragraph text and flattened table content from a .docx file.

    Calling an instance with a file path (``str``) or the raw file content
    returns ``(secs, tbls)``:

    * ``secs`` -- a list of ``(text, style_name)`` tuples, one per paragraph
      visited;
    * ``tbls`` -- one entry per table in the document, each being the list of
      strings produced by the table-flattening helper below.
    """

    # Ordered (compiled regex, tag) pairs used to classify a table cell; the
    # first pattern that matches wins.  Compiled once here instead of being
    # rebuilt for every cell.  Tags, as far as the patterns show: "Dt" for
    # date/period-like text, "Nu" for plain numbers, "Ca" for upper-case /
    # digit codes, "En" for lower-case Latin words, "NE" for a number followed
    # by alphanumerics or unit symbols, "Sg" for a single character.
    _CELL_TYPE_PATTERNS = [
        (re.compile(pattern), tag)
        for pattern, tag in (
            (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
            (r"^(20|19)[0-9]{2}年$", "Dt"),
            (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
            (r"^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
            (r"^第*[一二三四1-4]季度$", "Dt"),
            (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
            # Was tagged "DT", which split date-like cells across two
            # counters and could skew the majority vote towards "Nu".
            (r"^(20|19)[0-9]{2}[ABCDE]$", "Dt"),
            (r"^[0-9.,+%/ -]+$", "Nu"),
            (r"^[0-9A-Z/\._~-]+$", "Ca"),
            (r"^[A-Z]*[a-z' -]+$", "En"),
            (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
            (r"^.{1}$", "Sg"),
        )
    ]

    @classmethod
    def _block_type(cls, b):
        """Return a short tag describing what kind of text the cell ``b`` holds.

        The regex table is tried first.  Text that matches none of the
        patterns is tokenized with ``rag_tokenizer`` and classified by the
        number of tokens longer than one character: "Tx" for 4-11 tokens,
        "Lx" for 12 or more, "Nr" for a single token that the tokenizer tags
        as "nr" (presumably a person name -- confirm against rag_tokenizer),
        and "Ot" for everything else.
        """
        for pattern, tag in cls._CELL_TYPE_PATTERNS:
            if pattern.search(b):
                return tag

        tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1]
        if len(tks) > 3:
            return "Tx" if len(tks) < 12 else "Lx"
        if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
            return "Nr"
        return "Ot"

    def __extract_table_content(self, tb):
        """Read the cell texts of table ``tb`` row by row and flatten them.

        ``tb`` only needs a ``rows`` iterable whose items expose ``cells``
        with a ``text`` attribute.
        """
        grid = [[c.text for c in row.cells] for row in tb.rows]
        return self.__compose_table_content(pd.DataFrame(grid))

    def __compose_table_content(self, df):
        """Turn a table (as a DataFrame of cell values) into text lines.

        Row 0 is always treated as a header.  When the dominant cell type of
        the remaining rows is numeric ("Nu"), any later row whose own dominant
        type is not numeric is treated as an additional header row.  Every
        non-header row becomes ``"header: value;header: value;..."`` using the
        closest contiguous run of header rows above it; empty cells are
        skipped.

        Returns:
            ``[]`` when the table has fewer than two rows or no columns;
            one string per data row when the table has more than 3 columns;
            otherwise a single-element list with all rows joined by newlines.
        """
        if len(df) < 2:
            return []
        nrows, colnm = df.shape

        type_counts = Counter(
            self._block_type(str(df.iloc[i, j]))
            for i in range(1, nrows)
            for j in range(colnm)
        )
        if not type_counts:
            # Rows without any column: nothing to vote on and nothing to emit
            # (previously max() raised ValueError on the empty counter).
            return []
        max_type = max(type_counts.items(), key=lambda x: x[1])[0]

        hdrows = [0]  # the header does not necessarily appear only in the first line
        if max_type == "Nu":
            for r in range(1, nrows):
                row_counts = Counter(
                    self._block_type(str(df.iloc[r, j])) for j in range(colnm)
                )
                row_type = max(row_counts.items(), key=lambda x: x[1])[0]
                if row_type != max_type:
                    hdrows.append(r)
        header_rows = set(hdrows)

        lines = []
        for i in range(1, nrows):
            if i in header_rows:
                continue

            # Offsets (all negative) of the header rows located above row i.
            hr = [r - i for r in hdrows if r < i]
            # Scanning backwards, cut at the first gap so that only the
            # nearest contiguous run of header rows is kept.
            t = len(hr) - 1
            while t > 0:
                if hr[t] - hr[t - 1] > 1:
                    hr = hr[t:]
                    break
                t -= 1

            headers = []
            for j in range(colnm):
                parts = []
                for h in hr:
                    x = str(df.iloc[i + h, j]).strip()
                    if x not in parts:  # drop repeated header text
                        parts.append(x)
                label = ",".join(parts)
                if label:
                    label += ": "
                headers.append(label)

            cells = []
            for j in range(colnm):
                value = str(df.iloc[i, j])
                if value:
                    cells.append(headers[j] + value)
            lines.append(";".join(cells))

        if colnm > 3:
            return lines
        return ["\n".join(lines)]

    def __call__(self, fnm, from_page=0, to_page=100000000):
        """Parse a .docx file into paragraph sections and table contents.

        Args:
            fnm: a file path (``str``); any other value is wrapped in
                ``BytesIO`` and handed to ``Document``.
            from_page: first page (0-based counter) whose text is kept.
            to_page: page at which text collection stops (exclusive).

        Returns:
            ``(secs, tbls)``.  ``secs`` holds one ``(text, style_name)`` tuple
            per visited paragraph; paragraphs outside the requested page range
            (or consisting only of whitespace) still produce an entry, with an
            empty text.  ``tbls`` holds the flattened content of every table
            in the document, regardless of the page range.

        The page counter advances whenever a run's XML contains
        ``lastRenderedPageBreak``.  The parsed document is also stored on
        ``self.doc``.
        """
        self.doc = Document(fnm) if isinstance(
            fnm, str) else Document(BytesIO(fnm))
        pn = 0  # number of page breaks seen so far, i.e. the current page
        secs = []  # parsed contents
        for p in self.doc.paragraphs:
            if pn > to_page:
                break

            # Computed once per paragraph rather than once per run.
            has_text = bool(p.text.strip())
            runs_within_single_paragraph = []  # runs inside the page range
            for run in p.runs:
                if pn > to_page:
                    break
                if has_text and from_page <= pn < to_page:
                    # Collect the text before checking for a page break, so a
                    # run carrying the break marker counts for the page it
                    # was reached on.
                    runs_within_single_paragraph.append(run.text)

                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1

            secs.append((
                "".join(runs_within_single_paragraph),
                getattr(p.style, 'name', ''),
            ))

        tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
        return secs, tbls
 |