feat(config): Add a configurable way to set the cache directory for constrained environments
parent 8c22396d8b
commit 00026b5f8b
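Every change in this commit applies the same pattern: wherever a path was rooted at Path.home(), it is now rooted at the CRAWL4_AI_BASE_DIRECTORY environment variable, falling back to the home directory when the variable is unset. A minimal sketch of the resolution (os.path.join accepts the Path object that Path.home() returns):

    import os
    from pathlib import Path

    # Prefer CRAWL4_AI_BASE_DIRECTORY when set; otherwise fall back to $HOME.
    base = os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())

    # All of crawl4ai's state (cache, models, SQLite DB) lives under <base>/.crawl4ai.
    print(os.path.join(base, ".crawl4ai"))  # e.g. /tmp/.crawl4ai when the variable is /tmp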
@@ -525,7 +525,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
 
         if self.use_cached_html:
             cache_file_path = os.path.join(
-                Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
+                os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
             )
             if os.path.exists(cache_file_path):
                 html = ""
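Under the new lookup, the cached-HTML location for a URL resolves as sketched below; cache_path_for is a hypothetical helper that mirrors the hunk above, not a function in the codebase:

    import hashlib
    import os
    from pathlib import Path

    def cache_path_for(url: str) -> str:
        # Same scheme as the diff: <base>/.crawl4ai/cache/<md5 of the URL>
        base = os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())
        return os.path.join(base, ".crawl4ai", "cache",
                            hashlib.md5(url.encode()).hexdigest())

    # With CRAWL4_AI_BASE_DIRECTORY=/var/cache/app, this prints
    # /var/cache/app/.crawl4ai/cache/<hex digest>.
    print(cache_path_for("https://example.com"))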
@@ -725,7 +725,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
 
         if self.use_cached_html:
             cache_file_path = os.path.join(
-                Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
+                os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
             )
             with open(cache_file_path, "w", encoding="utf-8") as f:
                 f.write(html)
@@ -10,7 +10,7 @@ import logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-DB_PATH = os.path.join(Path.home(), ".crawl4ai")
+DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
 os.makedirs(DB_PATH, exist_ok=True)
 DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
 
@@ -23,14 +23,14 @@ class AsyncWebCrawler:
         self,
         crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
         always_by_pass_cache: bool = False,
-        base_directory: str = str(Path.home()),
+        base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
         **kwargs,
     ):
         self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
             **kwargs
         )
         self.always_by_pass_cache = always_by_pass_cache
-        # self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        # self.crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
         self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
         os.makedirs(self.crawl4ai_folder, exist_ok=True)
         os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
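One caveat with the new default: Python evaluates default arguments once, at definition time, so base_directory picks up the environment variable as it was when the module was imported. Setting the variable before the import, or passing base_directory explicitly, sidesteps this. A usage sketch, assuming the usual crawl4ai entry points:

    import asyncio
    import os

    # Set the variable before importing crawl4ai, since the default is
    # captured when AsyncWebCrawler's module is imported.
    os.environ["CRAWL4_AI_BASE_DIRECTORY"] = "/tmp"  # illustrative path

    from crawl4ai import AsyncWebCrawler

    async def main():
        # Passing base_directory explicitly also works and overrides the default.
        async with AsyncWebCrawler(base_directory="/tmp") as crawler:
            result = await crawler.arun(url="https://example.com")
            print(result.markdown[:200])

    asyncio.run(main())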
@@ -132,7 +132,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
 
         # chromedriver_autoinstaller.install()
         # import chromedriver_autoinstaller
-        # crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        # crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
         # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options)
         # chromedriver_path = chromedriver_autoinstaller.install()
         # chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver()
@@ -205,7 +205,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         url_hash = hashlib.md5(url.encode()).hexdigest()
 
         if self.use_cached_html:
-            cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
+            cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash)
             if os.path.exists(cache_file_path):
                 with open(cache_file_path, "r") as f:
                     return sanitize_input_encode(f.read())
@@ -275,7 +275,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         self.driver = self.execute_hook('before_return_html', self.driver, html)
 
         # Store in cache
-        cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
+        cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash)
         with open(cache_file_path, "w", encoding="utf-8") as f:
             f.write(html)
 
@@ -3,7 +3,7 @@ from pathlib import Path
 import sqlite3
 from typing import Optional, Tuple
 
-DB_PATH = os.path.join(Path.home(), ".crawl4ai")
+DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
 os.makedirs(DB_PATH, exist_ok=True)
 DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
 
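Both database modules keep the same two-step DB_PATH assignment: the name first points at the state folder (so it can be created), then is rebound to the SQLite file inside it. Spelled out as a sketch:

    import os
    from pathlib import Path

    # Step 1: DB_PATH names the folder; create it if missing.
    DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
    os.makedirs(DB_PATH, exist_ok=True)

    # Step 2: rebind DB_PATH to the database file inside that folder.
    DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
    print(DB_PATH)  # e.g. /tmp/.crawl4ai/crawl4ai.db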
@@ -56,7 +56,7 @@ def set_model_device(model):
 
 @lru_cache()
 def get_home_folder():
-    home_folder = os.path.join(Path.home(), ".crawl4ai")
+    home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
     os.makedirs(home_folder, exist_ok=True)
     os.makedirs(f"{home_folder}/cache", exist_ok=True)
     os.makedirs(f"{home_folder}/models", exist_ok=True)
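Because get_home_folder() is wrapped in @lru_cache(), the environment variable is consulted once per process; changing it later has no effect unless the cache is cleared. A sketch of that behavior (the makedirs calls are omitted for brevity):

    import os
    from functools import lru_cache
    from pathlib import Path

    @lru_cache()
    def get_home_folder():
        # Resolved on first call, then memoized for the process lifetime.
        return os.path.join(
            os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai"
        )

    os.environ["CRAWL4_AI_BASE_DIRECTORY"] = "/data"
    print(get_home_folder())       # /data/.crawl4ai

    os.environ["CRAWL4_AI_BASE_DIRECTORY"] = "/other"
    print(get_home_folder())       # still /data/.crawl4ai (memoized)

    get_home_folder.cache_clear()  # force a re-read if one is really needed
    print(get_home_folder())       # /other/.crawl4ai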
@@ -60,7 +60,7 @@ def get_system_memory():
     raise OSError("Unsupported operating system")
 
 def get_home_folder():
-    home_folder = os.path.join(Path.home(), ".crawl4ai")
+    home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
     os.makedirs(home_folder, exist_ok=True)
     os.makedirs(f"{home_folder}/cache", exist_ok=True)
     os.makedirs(f"{home_folder}/models", exist_ok=True)
@@ -20,7 +20,7 @@ class WebCrawler:
     def __init__(self, crawler_strategy: CrawlerStrategy = None, always_by_pass_cache: bool = False, verbose: bool = False):
         self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
         self.always_by_pass_cache = always_by_pass_cache
-        self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        self.crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
         os.makedirs(self.crawl4ai_folder, exist_ok=True)
         os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
         init_db()
@@ -13,7 +13,7 @@ AsyncWebCrawler(
 
     # Cache Settings
     always_by_pass_cache: bool = False,  # Always bypass cache
-    base_directory: str = str(Path.home()),  # Base directory for cache
+    base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),  # Base directory for cache
 
     # Network Settings
    proxy: str = None,  # Simple proxy URL
setup.py
@@ -8,7 +8,7 @@ import sys
 
 # Create the .crawl4ai folder in the user's home directory if it doesn't exist
 # If the folder already exists, remove the cache folder
-crawl4ai_folder = Path.home() / ".crawl4ai"
+crawl4ai_folder = Path(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())) / ".crawl4ai"
 cache_folder = crawl4ai_folder / "cache"
 
 if cache_folder.exists():
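Note the Path(...) wrapper in the setup.py hunk: os.getenv returns a plain str when the variable is set, and pathlib's / operator needs a Path on its left-hand side. A quick sketch of why the wrapper matters:

    import os
    from pathlib import Path

    os.environ["CRAWL4_AI_BASE_DIRECTORY"] = "/tmp"  # illustrative

    # os.getenv(...) is a str here, so this would raise TypeError:
    #     os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()) / ".crawl4ai"

    # Wrapping in Path() makes the / operator valid whether the variable
    # is set (str) or unset (Path from Path.home()):
    crawl4ai_folder = Path(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())) / ".crawl4ai"
    print(crawl4ai_folder / "cache")  # /tmp/.crawl4ai/cache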
|
Loading…
x
Reference in New Issue
Block a user