Perf: ignore concatenation between rows. (#8507)

### What problem does this PR solve?


### Type of change

- [x] Performance Improvement
This commit is contained in:
Kevin Hu 2025-06-26 14:55:37 +08:00 committed by GitHub
parent 0eb90e73a5
commit 6d256ff0f5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 5 additions and 1640 deletions

View File

@ -479,6 +479,9 @@ class RAGFlowPdfParser:
self.boxes = bxs
def _concat_downward(self, concat_between_pages=True):
self.boxes = Recognizer.sort_Y_firstly(self.boxes, 0)
return
# count boxes in the same row as a feature
for i in range(len(self.boxes)):
mh = self.mean_height[self.boxes[i]["page_number"] - 1]
@ -1136,7 +1139,8 @@ class RAGFlowPdfParser:
need_image, zoomin, return_html, False)
return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
def remove_tag(self, txt):
@staticmethod
def remove_tag(txt):
return re.sub(r"@@[\t0-9.-]+?##", "", txt)
def crop(self, text, ZM=3, need_position=False):

File diff suppressed because it is too large Load Diff