| 
									
										
										
										
											2024-05-30 09:12:55 +08:00
										 |  |  | # -*- coding: utf-8 -*- | 
					
						
							| 
									
										
										
										
											2024-05-30 09:25:05 +08:00
										 |  |  | #  Licensed under the Apache License, Version 2.0 (the "License"); | 
					
						
							|  |  |  | #  you may not use this file except in compliance with the License. | 
					
						
							|  |  |  | #  You may obtain a copy of the License at | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #      http://www.apache.org/licenses/LICENSE-2.0 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #  Unless required by applicable law or agreed to in writing, software | 
					
						
							|  |  |  | #  distributed under the License is distributed on an "AS IS" BASIS, | 
					
						
							|  |  |  | #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
					
						
							|  |  |  | #  See the License for the specific language governing permissions and | 
					
						
							|  |  |  | #  limitations under the License. | 
					
						
							|  |  |  | # | 
					
						
							| 
									
										
										
										
											2024-05-30 09:12:55 +08:00
										 |  |  | from rag.nlp import find_codec | 
					
						
							|  |  |  | import readability | 
					
						
							|  |  |  | import html_text | 
					
						
							|  |  |  | import chardet | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-29 13:19:01 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-30 09:12:55 +08:00
										 |  |  | def get_encoding(file): | 
					
						
							|  |  |  |     with open(file,'rb') as f: | 
					
						
							|  |  |  |         tmp = chardet.detect(f.read()) | 
					
						
							|  |  |  |         return tmp['encoding'] | 
					
						
							| 
									
										
										
										
											2024-05-30 09:25:05 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-29 13:19:01 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-30 09:12:55 +08:00
										 |  |  | class RAGFlowHtmlParser: | 
					
						
							|  |  |  |     def __call__(self, fnm, binary=None): | 
					
						
							|  |  |  |         txt = "" | 
					
						
							|  |  |  |         if binary: | 
					
						
							|  |  |  |             encoding = find_codec(binary) | 
					
						
							|  |  |  |             txt = binary.decode(encoding, errors="ignore") | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             with open(fnm, "r",encoding=get_encoding(fnm)) as f: | 
					
						
							|  |  |  |                 txt = f.read() | 
					
						
							| 
									
										
										
										
											2024-08-06 16:42:14 +08:00
										 |  |  |         return self.parser_txt(txt) | 
					
						
							| 
									
										
										
										
											2024-05-30 09:25:05 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-06 16:42:14 +08:00
										 |  |  |     @classmethod | 
					
						
							|  |  |  |     def parser_txt(cls, txt): | 
					
						
							| 
									
										
										
										
											2024-12-08 14:21:12 +08:00
										 |  |  |         if not isinstance(txt, str): | 
					
						
							| 
									
										
										
										
											2024-08-06 16:42:14 +08:00
										 |  |  |             raise TypeError("txt type should be str!") | 
					
						
							| 
									
										
										
										
											2024-05-30 09:12:55 +08:00
										 |  |  |         html_doc = readability.Document(txt) | 
					
						
							|  |  |  |         title = html_doc.title() | 
					
						
							|  |  |  |         content = html_text.extract_text(html_doc.summary(html_partial=True)) | 
					
						
							| 
									
										
										
										
											2024-08-06 16:42:14 +08:00
										 |  |  |         txt = f"{title}\n{content}" | 
					
						
							| 
									
										
										
										
											2024-05-30 09:12:55 +08:00
										 |  |  |         sections = txt.split("\n") | 
					
						
							|  |  |  |         return sections |