| 
									
										
										
										
											2024-01-15 08:46:22 +08:00
										 |  |  | # | 
					
						
							| 
									
										
										
										
											2024-01-19 19:51:57 +08:00
										 |  |  | #  Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | 
					
						
							| 
									
										
										
										
											2024-01-15 08:46:22 +08:00
										 |  |  | # | 
					
						
							|  |  |  | #  Licensed under the Apache License, Version 2.0 (the "License"); | 
					
						
							|  |  |  | #  you may not use this file except in compliance with the License. | 
					
						
							|  |  |  | #  You may obtain a copy of the License at | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #      http://www.apache.org/licenses/LICENSE-2.0 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #  Unless required by applicable law or agreed to in writing, software | 
					
						
							|  |  |  | #  distributed under the License is distributed on an "AS IS" BASIS, | 
					
						
							|  |  |  | #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
					
						
							|  |  |  | #  See the License for the specific language governing permissions and | 
					
						
							|  |  |  | #  limitations under the License. | 
					
						
							|  |  |  | # | 
					
						
							| 
									
										
										
										
											2024-02-07 19:27:23 +08:00
										 |  |  | import base64 | 
					
						
							| 
									
										
										
										
											2024-01-15 08:46:22 +08:00
										 |  |  | import json | 
					
						
							|  |  |  | import os | 
					
						
							|  |  |  | import re | 
					
						
							| 
									
										
										
										
											2024-02-07 19:27:23 +08:00
										 |  |  | from io import BytesIO | 
					
						
							| 
									
										
										
										
											2024-01-15 08:46:22 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-30 12:38:09 +08:00
										 |  |  | import pdfplumber | 
					
						
							| 
									
										
										
										
											2024-02-07 19:27:23 +08:00
										 |  |  | from PIL import Image | 
					
						
							| 
									
										
										
										
											2024-01-15 08:46:22 +08:00
										 |  |  | from cachetools import LRUCache, cached | 
					
						
							|  |  |  | from ruamel.yaml import YAML | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-17 20:20:42 +08:00
										 |  |  | from api.db import FileType | 
					
						
							| 
									
										
										
										
											2024-01-15 08:46:22 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | PROJECT_BASE = os.getenv("RAG_PROJECT_BASE") or os.getenv("RAG_DEPLOY_BASE") | 
					
						
							| 
									
										
										
										
											2024-01-17 09:43:27 +08:00
										 |  |  | RAG_BASE = os.getenv("RAG_BASE") | 
					
						
							| 
									
										
										
										
											2024-01-15 08:46:22 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-27 11:33:46 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-15 08:46:22 +08:00
										 |  |  | def get_project_base_directory(*args): | 
					
						
							|  |  |  |     global PROJECT_BASE | 
					
						
							|  |  |  |     if PROJECT_BASE is None: | 
					
						
							|  |  |  |         PROJECT_BASE = os.path.abspath( | 
					
						
							|  |  |  |             os.path.join( | 
					
						
							|  |  |  |                 os.path.dirname(os.path.realpath(__file__)), | 
					
						
							|  |  |  |                 os.pardir, | 
					
						
							|  |  |  |                 os.pardir, | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if args: | 
					
						
							|  |  |  |         return os.path.join(PROJECT_BASE, *args) | 
					
						
							|  |  |  |     return PROJECT_BASE | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-17 09:43:27 +08:00
										 |  |  | def get_rag_directory(*args): | 
					
						
							|  |  |  |     global RAG_BASE | 
					
						
							|  |  |  |     if RAG_BASE is None: | 
					
						
							|  |  |  |         RAG_BASE = os.path.abspath( | 
					
						
							| 
									
										
										
										
											2024-01-15 08:46:22 +08:00
										 |  |  |             os.path.join( | 
					
						
							|  |  |  |                 os.path.dirname(os.path.realpath(__file__)), | 
					
						
							|  |  |  |                 os.pardir, | 
					
						
							|  |  |  |                 os.pardir, | 
					
						
							|  |  |  |                 os.pardir, | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |     if args: | 
					
						
							| 
									
										
										
										
											2024-01-17 09:43:27 +08:00
										 |  |  |         return os.path.join(RAG_BASE, *args) | 
					
						
							|  |  |  |     return RAG_BASE | 
					
						
							| 
									
										
										
										
											2024-01-15 08:46:22 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-17 09:43:27 +08:00
										 |  |  | def get_rag_python_directory(*args): | 
					
						
							|  |  |  |     return get_rag_directory("python", *args) | 
					
						
							| 
									
										
										
										
											2024-01-15 08:46:22 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-29 17:57:40 +08:00
										 |  |  | def get_home_cache_dir(): | 
					
						
							| 
									
										
										
										
											2024-04-30 11:04:14 +08:00
										 |  |  |     dir = os.path.join(os.path.expanduser('~'), ".ragflow") | 
					
						
							| 
									
										
										
										
											2024-04-29 17:57:40 +08:00
										 |  |  |     try: | 
					
						
							|  |  |  |         os.mkdir(dir) | 
					
						
							|  |  |  |     except OSError as error: | 
					
						
							|  |  |  |         pass | 
					
						
							|  |  |  |     return dir | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-15 08:46:22 +08:00
										 |  |  | @cached(cache=LRUCache(maxsize=10)) | 
					
						
							|  |  |  | def load_json_conf(conf_path): | 
					
						
							|  |  |  |     if os.path.isabs(conf_path): | 
					
						
							|  |  |  |         json_conf_path = conf_path | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         json_conf_path = os.path.join(get_project_base_directory(), conf_path) | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         with open(json_conf_path) as f: | 
					
						
							|  |  |  |             return json.load(f) | 
					
						
							|  |  |  |     except BaseException: | 
					
						
							|  |  |  |         raise EnvironmentError( | 
					
						
							|  |  |  |             "loading json file config from '{}' failed!".format(json_conf_path) | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def dump_json_conf(config_data, conf_path): | 
					
						
							|  |  |  |     if os.path.isabs(conf_path): | 
					
						
							|  |  |  |         json_conf_path = conf_path | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         json_conf_path = os.path.join(get_project_base_directory(), conf_path) | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         with open(json_conf_path, "w") as f: | 
					
						
							|  |  |  |             json.dump(config_data, f, indent=4) | 
					
						
							|  |  |  |     except BaseException: | 
					
						
							|  |  |  |         raise EnvironmentError( | 
					
						
							|  |  |  |             "loading json file config from '{}' failed!".format(json_conf_path) | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def load_json_conf_real_time(conf_path): | 
					
						
							|  |  |  |     if os.path.isabs(conf_path): | 
					
						
							|  |  |  |         json_conf_path = conf_path | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         json_conf_path = os.path.join(get_project_base_directory(), conf_path) | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         with open(json_conf_path) as f: | 
					
						
							|  |  |  |             return json.load(f) | 
					
						
							|  |  |  |     except BaseException: | 
					
						
							|  |  |  |         raise EnvironmentError( | 
					
						
							|  |  |  |             "loading json file config from '{}' failed!".format(json_conf_path) | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def load_yaml_conf(conf_path): | 
					
						
							|  |  |  |     if not os.path.isabs(conf_path): | 
					
						
							|  |  |  |         conf_path = os.path.join(get_project_base_directory(), conf_path) | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         with open(conf_path) as f: | 
					
						
							|  |  |  |             yaml = YAML(typ='safe', pure=True) | 
					
						
							|  |  |  |             return yaml.load(f) | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         raise EnvironmentError( | 
					
						
							|  |  |  |             "loading yaml file config from {} failed:".format(conf_path), e | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def rewrite_yaml_conf(conf_path, config): | 
					
						
							|  |  |  |     if not os.path.isabs(conf_path): | 
					
						
							|  |  |  |         conf_path = os.path.join(get_project_base_directory(), conf_path) | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         with open(conf_path, "w") as f: | 
					
						
							|  |  |  |             yaml = YAML(typ="safe") | 
					
						
							|  |  |  |             yaml.dump(config, f) | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         raise EnvironmentError( | 
					
						
							|  |  |  |             "rewrite yaml file config {} failed:".format(conf_path), e | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def rewrite_json_file(filepath, json_data): | 
					
						
							|  |  |  |     with open(filepath, "w") as f: | 
					
						
							|  |  |  |         json.dump(json_data, f, indent=4, separators=(",", ": ")) | 
					
						
							|  |  |  |     f.close() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def filename_type(filename): | 
					
						
							|  |  |  |     filename = filename.lower() | 
					
						
							|  |  |  |     if re.match(r".*\.pdf$", filename): | 
					
						
							|  |  |  |         return FileType.PDF.value | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-27 11:33:46 +08:00
										 |  |  |     if re.match( | 
					
						
							| 
									
										
										
										
											2024-05-15 16:34:28 +08:00
										 |  |  |             r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename): | 
					
						
							| 
									
										
										
										
											2024-01-15 08:46:22 +08:00
										 |  |  |         return FileType.DOC.value | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-27 11:33:46 +08:00
										 |  |  |     if re.match( | 
					
						
							|  |  |  |             r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$", filename): | 
					
						
							| 
									
										
										
										
											2024-01-15 08:46:22 +08:00
										 |  |  |         return FileType.AURAL.value | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename): | 
					
						
							| 
									
										
										
										
											2024-04-26 17:21:53 +08:00
										 |  |  |         return FileType.VISUAL.value | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return FileType.OTHER.value | 
					
						
							| 
									
										
										
										
											2024-02-07 19:27:23 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def thumbnail(filename, blob): | 
					
						
							|  |  |  |     filename = filename.lower() | 
					
						
							|  |  |  |     if re.match(r".*\.pdf$", filename): | 
					
						
							| 
									
										
										
										
											2024-04-30 12:38:09 +08:00
										 |  |  |         pdf = pdfplumber.open(BytesIO(blob)) | 
					
						
							| 
									
										
										
										
											2024-02-07 19:27:23 +08:00
										 |  |  |         buffered = BytesIO() | 
					
						
							| 
									
										
										
										
											2024-04-30 12:38:09 +08:00
										 |  |  |         pdf.pages[0].to_image().annotated.save(buffered, format="png") | 
					
						
							| 
									
										
										
										
											2024-03-27 11:33:46 +08:00
										 |  |  |         return "data:image/png;base64," + \ | 
					
						
							|  |  |  |             base64.b64encode(buffered.getvalue()).decode("utf-8") | 
					
						
							| 
									
										
										
										
											2024-02-07 19:27:23 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename): | 
					
						
							| 
									
										
										
										
											2024-02-23 18:28:12 +08:00
										 |  |  |         image = Image.open(BytesIO(blob)) | 
					
						
							|  |  |  |         image.thumbnail((30, 30)) | 
					
						
							|  |  |  |         buffered = BytesIO() | 
					
						
							|  |  |  |         image.save(buffered, format="png") | 
					
						
							| 
									
										
										
										
											2024-03-27 11:33:46 +08:00
										 |  |  |         return "data:image/png;base64," + \ | 
					
						
							|  |  |  |             base64.b64encode(buffered.getvalue()).decode("utf-8") | 
					
						
							| 
									
										
										
										
											2024-02-07 19:27:23 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     if re.match(r".*\.(ppt|pptx)$", filename): | 
					
						
							|  |  |  |         import aspose.slides as slides | 
					
						
							|  |  |  |         import aspose.pydrawing as drawing | 
					
						
							|  |  |  |         try: | 
					
						
							|  |  |  |             with slides.Presentation(BytesIO(blob)) as presentation: | 
					
						
							|  |  |  |                 buffered = BytesIO() | 
					
						
							| 
									
										
										
										
											2024-03-27 11:33:46 +08:00
										 |  |  |                 presentation.slides[0].get_thumbnail(0.03, 0.03).save( | 
					
						
							|  |  |  |                     buffered, drawing.imaging.ImageFormat.png) | 
					
						
							|  |  |  |                 return "data:image/png;base64," + \ | 
					
						
							|  |  |  |                     base64.b64encode(buffered.getvalue()).decode("utf-8") | 
					
						
							| 
									
										
										
										
											2024-02-07 19:27:23 +08:00
										 |  |  |         except Exception as e: | 
					
						
							|  |  |  |             pass | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-26 19:51:35 +08:00
										 |  |  | def traversal_files(base): | 
					
						
							|  |  |  |     for root, ds, fs in os.walk(base): | 
					
						
							|  |  |  |         for f in fs: | 
					
						
							|  |  |  |             fullname = os.path.join(root, f) | 
					
						
							|  |  |  |             yield fullname |