mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-06-26 22:19:57 +00:00
Perf: ignore concatenation between rows. (#8507)
### What problem does this PR solve?

### Type of change

- [x] Performance Improvement
This commit is contained in:
parent
0eb90e73a5
commit
6d256ff0f5
@ -479,6 +479,9 @@ class RAGFlowPdfParser:
|
||||
self.boxes = bxs
|
||||
|
||||
def _concat_downward(self, concat_between_pages=True):
|
||||
self.boxes = Recognizer.sort_Y_firstly(self.boxes, 0)
|
||||
return
|
||||
|
||||
# count boxes in the same row as a feature
|
||||
for i in range(len(self.boxes)):
|
||||
mh = self.mean_height[self.boxes[i]["page_number"] - 1]
|
||||
@ -1136,7 +1139,8 @@ class RAGFlowPdfParser:
|
||||
need_image, zoomin, return_html, False)
|
||||
return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
|
||||
|
||||
def remove_tag(self, txt):
|
||||
@staticmethod
|
||||
def remove_tag(txt):
|
||||
return re.sub(r"@@[\t0-9.-]+?##", "", txt)
|
||||
|
||||
def crop(self, text, ZM=3, need_position=False):
|
||||
|
1639
rag/res/ner.json
1639
rag/res/ner.json
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user