Keep bullet structure when RAGFlowPptParser extracts text frames.

Text frames are read paragraph by paragraph instead of through
text_frame.text: blank paragraphs are skipped, and a paragraph whose own
a:pPr element has an a:buChar or a:buAutoNum child is prefixed with one
space per indent level followed by ".".

The "index" extended header is omitted because the added lines were edited
by hand; the pre-image blob was 25e8f8463. The file's trailing newline is
left in place.

diff --git a/deepdoc/parser/ppt_parser.py b/deepdoc/parser/ppt_parser.py
--- a/deepdoc/parser/ppt_parser.py
+++ b/deepdoc/parser/ppt_parser.py
@@ -23,6 +23,19 @@ class RAGFlowPptParser:
     def __init__(self):
         super().__init__()
 
+    def __get_bulleted_text(self, paragraph):
+        """Return the paragraph text, marking explicitly bulleted paragraphs.
+
+        A paragraph is treated as bulleted when its own ``a:pPr`` element has
+        an ``a:buChar`` or ``a:buAutoNum`` child.  Bulleted text is prefixed
+        with one space per indent level followed by ``.``; any other
+        paragraph is returned unchanged.
+        """
+        p_elm = paragraph._p
+        if p_elm.xpath("./a:pPr/a:buChar") or p_elm.xpath("./a:pPr/a:buAutoNum"):
+            return f"{' ' * paragraph.level}.{paragraph.text}"
+        return paragraph.text
+
     def __extract(self, shape):
         if shape.shape_type == 19:
             tb = shape.table
@@ -33,7 +46,12 @@ class RAGFlowPptParser:
             return "\n".join(rows)
 
         if shape.has_text_frame:
-            return shape.text_frame.text
+            text_frame = shape.text_frame
+            texts = []
+            for paragraph in text_frame.paragraphs:
+                if paragraph.text.strip():
+                    texts.append(self.__get_bulleted_text(paragraph))
+            return "\n".join(texts)
 
         if shape.shape_type == 6:
             texts = []