mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-03 18:36:21 +00:00
Fix: improve PDF text type detection by expanding regex content (#11432)
- Add whitespace validation to the PDF English text checking regex
- Reduce false negatives in English PDF content recognition
### What problem does this PR solve?
The core idea is to **expand the regex content used for English text
detection** so it can accommodate more valid characters commonly found
in English PDFs. The modifications include:
- Adding support for **space** in the regex.
- Ensuring the update does not reduce existing detection accuracy.
### Type of change
- [✅] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
parent
1845daf41f
commit
1033a3ae26
@ -1091,7 +1091,7 @@ class RAGFlowPdfParser:
|
||||
|
||||
logging.debug("Images converted.")
|
||||
self.is_english = [
|
||||
re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i])))))
|
||||
re.search(r"[ a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i])))))
|
||||
for i in range(len(self.page_chars))
|
||||
]
|
||||
if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2:
|
||||
@ -1148,7 +1148,7 @@ class RAGFlowPdfParser:
|
||||
|
||||
if not self.is_english and not any([c for c in self.page_chars]) and self.boxes:
|
||||
bxes = [b for bxs in self.boxes for b in bxs]
|
||||
self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
|
||||
self.is_english = re.search(r"[ \na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
|
||||
|
||||
logging.debug(f"Is it English: {self.is_english}")
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user