From a68cbb232bd44a86b279c2a493a2a66d87a2e112 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 7 Mar 2025 20:55:56 +0800 Subject: [PATCH] feat(browser): add standalone CDP browser launch and lxml extraction strategy Add new features to enhance browser automation and HTML extraction: - Add CDP browser launch capability with customizable ports and profiles - Implement JsonLxmlExtractionStrategy for faster HTML parsing - Add CLI command 'crwl cdp' for launching standalone CDP browsers - Support connecting to external CDP browsers via URL - Optimize selector caching and context-sensitive queries BREAKING CHANGE: LLMConfig import path changed from crawl4ai.types to crawl4ai --- crawl4ai/__init__.py | 2 + crawl4ai/browser_manager.py | 13 +- crawl4ai/browser_profiler.py | 229 ++++++++++- crawl4ai/cli.py | 105 ++++- crawl4ai/extraction_strategy.py | 370 +++++++++++++++++- .../extraction_strategies_examples.py | 2 +- .../examples/llm_extraction_openai_pricing.py | 2 +- docs/examples/llm_markdown_generator.py | 2 +- docs/examples/quickstart_async.config.py | 2 +- docs/examples/quickstart_async.py | 2 +- docs/examples/quickstart_sync.py | 2 +- docs/examples/tutorial_v0.5.py | 2 +- docs/md_v2/api/strategies.md | 4 +- docs/md_v2/blog/releases/0.5.0.md | 6 +- docs/md_v2/core/quickstart.md | 2 +- docs/md_v2/extraction/no-llm-strategies.md | 2 +- tests/20241401/test_llm_filter.py | 2 +- ...test_chunking_and_extraction_strategies.py | 2 +- tests/browser/test_launch_standalone.py | 17 + tests/docker/test_docker.py | 2 +- tests/docker/test_serialization.py | 2 +- tests/test_web_crawler.py | 2 +- 22 files changed, 745 insertions(+), 29 deletions(-) create mode 100644 tests/browser/test_launch_standalone.py diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 03cce87..ff23896 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -23,6 +23,7 @@ from .extraction_strategy import ( CosineStrategy, JsonCssExtractionStrategy, JsonXPathExtractionStrategy, + JsonLxmlExtractionStrategy ) from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import DefaultMarkdownGenerator @@ -103,6 +104,7 @@ __all__ = [ "CosineStrategy", "JsonCssExtractionStrategy", "JsonXPathExtractionStrategy", + "JsonLxmlExtractionStrategy", "ChunkingStrategy", "RegexChunking", "DefaultMarkdownGenerator", diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index acc45c4..4e68658 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -434,8 +434,9 @@ class BrowserManager: self.playwright = await async_playwright().start() - if self.config.use_managed_browser: - cdp_url = await self.managed_browser.start() + if self.config.cdp_url or self.config.use_managed_browser: + self.config.use_managed_browser = True + cdp_url = await self.managed_browser.start() if not self.config.cdp_url else self.config.cdp_url self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) contexts = self.browser.contexts if contexts: @@ -790,7 +791,10 @@ class BrowserManager: # If using a managed browser, just grab the shared default_context if self.config.use_managed_browser: context = self.default_context - page = await context.new_page() + pages = context.pages + page = next((p for p in pages if p.url == crawlerRunConfig.url), None) + if not page: + page = await context.new_page() else: # Otherwise, check if we have an existing context for this config config_signature = self._make_config_signature(crawlerRunConfig) @@ -840,6 +844,9 @@ class BrowserManager: 
async def close(self): """Close all browser resources and clean up.""" + if self.config.cdp_url: + return + if self.config.sleep_on_close: await asyncio.sleep(0.5) diff --git a/crawl4ai/browser_profiler.py b/crawl4ai/browser_profiler.py index 53a4099..be3274b 100644 --- a/crawl4ai/browser_profiler.py +++ b/crawl4ai/browser_profiler.py @@ -342,7 +342,11 @@ class BrowserProfiler: # Check if path exists and is a valid profile if not os.path.isdir(profile_path): - return None + # Chrck if profile_name itself is full path + if os.path.isabs(profile_name): + profile_path = profile_name + else: + return None # Look for profile indicators is_profile = ( @@ -541,4 +545,225 @@ class BrowserProfiler: break else: - self.logger.error(f"Invalid choice. Please enter a number between 1 and {exit_option}.", tag="MENU") \ No newline at end of file + self.logger.error(f"Invalid choice. Please enter a number between 1 and {exit_option}.", tag="MENU") + + + async def launch_standalone_browser(self, + browser_type: str = "chromium", + user_data_dir: Optional[str] = None, + debugging_port: int = 9222, + headless: bool = False) -> Optional[str]: + """ + Launch a standalone browser with CDP debugging enabled and keep it running + until the user presses 'q'. Returns and displays the CDP URL. + + Args: + browser_type (str): Type of browser to launch ('chromium' or 'firefox') + user_data_dir (str, optional): Path to user profile directory + debugging_port (int): Port to use for CDP debugging + headless (bool): Whether to run in headless mode + + Returns: + str: CDP URL for the browser, or None if launch failed + + Example: + ```python + profiler = BrowserProfiler() + cdp_url = await profiler.launch_standalone_browser( + user_data_dir="/path/to/profile", + debugging_port=9222 + ) + # Use cdp_url to connect to the browser + ``` + """ + # Use the provided directory if specified, otherwise create a temporary directory + if user_data_dir: + # Directory is provided directly, ensure it exists + profile_path = user_data_dir + os.makedirs(profile_path, exist_ok=True) + else: + # Create a temporary profile directory + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + profile_name = f"temp_{timestamp}_{uuid.uuid4().hex[:6]}" + profile_path = os.path.join(self.profiles_dir, profile_name) + os.makedirs(profile_path, exist_ok=True) + + # Print initial information + border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}" + self.logger.info(f"\n{border}", tag="CDP") + self.logger.info(f"Launching standalone browser with CDP debugging", tag="CDP") + self.logger.info(f"Browser type: {Fore.GREEN}{browser_type}{Style.RESET_ALL}", tag="CDP") + self.logger.info(f"Profile path: {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="CDP") + self.logger.info(f"Debugging port: {Fore.CYAN}{debugging_port}{Style.RESET_ALL}", tag="CDP") + self.logger.info(f"Headless mode: {Fore.CYAN}{headless}{Style.RESET_ALL}", tag="CDP") + + # Create managed browser instance + managed_browser = ManagedBrowser( + browser_type=browser_type, + user_data_dir=profile_path, + headless=headless, + logger=self.logger, + debugging_port=debugging_port + ) + + # Set up signal handlers to ensure cleanup on interrupt + original_sigint = signal.getsignal(signal.SIGINT) + original_sigterm = signal.getsignal(signal.SIGTERM) + + # Define cleanup handler for signals + async def cleanup_handler(sig, frame): + self.logger.warning("\nCleaning up browser process...", tag="CDP") + await managed_browser.cleanup() + # Restore original signal handlers + signal.signal(signal.SIGINT, 
original_sigint) + signal.signal(signal.SIGTERM, original_sigterm) + if sig == signal.SIGINT: + self.logger.error("Browser terminated by user.", tag="CDP") + sys.exit(1) + + # Set signal handlers + def sigint_handler(sig, frame): + asyncio.create_task(cleanup_handler(sig, frame)) + + signal.signal(signal.SIGINT, sigint_handler) + signal.signal(signal.SIGTERM, sigint_handler) + + # Event to signal when user wants to exit + user_done_event = asyncio.Event() + + # Run keyboard input loop in a separate task + async def listen_for_quit_command(): + import termios + import tty + import select + + # First output the prompt + self.logger.info(f"{Fore.CYAN}Press '{Fore.WHITE}q{Fore.CYAN}' to stop the browser and exit...{Style.RESET_ALL}", tag="CDP") + + # Save original terminal settings + fd = sys.stdin.fileno() + old_settings = termios.tcgetattr(fd) + + try: + # Switch to non-canonical mode (no line buffering) + tty.setcbreak(fd) + + while True: + # Check if input is available (non-blocking) + readable, _, _ = select.select([sys.stdin], [], [], 0.5) + if readable: + key = sys.stdin.read(1) + if key.lower() == 'q': + self.logger.info(f"{Fore.GREEN}Closing browser...{Style.RESET_ALL}", tag="CDP") + user_done_event.set() + return + + # Check if the browser process has already exited + if managed_browser.browser_process and managed_browser.browser_process.poll() is not None: + self.logger.info("Browser already closed. Ending input listener.", tag="CDP") + user_done_event.set() + return + + await asyncio.sleep(0.1) + + finally: + # Restore terminal settings + termios.tcsetattr(fd, termios.TCSADRAIN, old_settings) + + # Function to retrieve and display CDP JSON config + async def get_cdp_json(port): + import aiohttp + cdp_url = f"http://localhost:{port}" + json_url = f"{cdp_url}/json/version" + + try: + async with aiohttp.ClientSession() as session: + # Try multiple times in case the browser is still starting up + for _ in range(10): + try: + async with session.get(json_url) as response: + if response.status == 200: + data = await response.json() + return cdp_url, data + except Exception: + pass + + await asyncio.sleep(0.5) + + return cdp_url, None + except Exception as e: + self.logger.error(f"Error fetching CDP JSON: {str(e)}", tag="CDP") + return cdp_url, None + + cdp_url = None + config_json = None + + try: + # Start the browser + await managed_browser.start() + + # Check if browser started successfully + browser_process = managed_browser.browser_process + if not browser_process: + self.logger.error("Failed to start browser process.", tag="CDP") + return None + + self.logger.info(f"Browser launched successfully. 
Retrieving CDP information...", tag="CDP") + + # Get CDP URL and JSON config + cdp_url, config_json = await get_cdp_json(debugging_port) + + if cdp_url: + self.logger.success(f"CDP URL: {Fore.GREEN}{cdp_url}{Style.RESET_ALL}", tag="CDP") + + if config_json: + # Display relevant CDP information + self.logger.info(f"Browser: {Fore.CYAN}{config_json.get('Browser', 'Unknown')}{Style.RESET_ALL}", tag="CDP") + self.logger.info(f"Protocol Version: {config_json.get('Protocol-Version', 'Unknown')}", tag="CDP") + if 'webSocketDebuggerUrl' in config_json: + self.logger.info(f"WebSocket URL: {Fore.GREEN}{config_json['webSocketDebuggerUrl']}{Style.RESET_ALL}", tag="CDP") + else: + self.logger.warning("Could not retrieve CDP configuration JSON", tag="CDP") + else: + self.logger.error(f"Failed to get CDP URL on port {debugging_port}", tag="CDP") + await managed_browser.cleanup() + return None + + # Start listening for keyboard input + listener_task = asyncio.create_task(listen_for_quit_command()) + + # Wait for the user to press 'q' or for the browser process to exit naturally + while not user_done_event.is_set() and browser_process.poll() is None: + await asyncio.sleep(0.5) + + # Cancel the listener task if it's still running + if not listener_task.done(): + listener_task.cancel() + try: + await listener_task + except asyncio.CancelledError: + pass + + # If the browser is still running and the user pressed 'q', terminate it + if browser_process.poll() is None and user_done_event.is_set(): + self.logger.info("Terminating browser process...", tag="CDP") + await managed_browser.cleanup() + + self.logger.success(f"Browser closed.", tag="CDP") + + except Exception as e: + self.logger.error(f"Error launching standalone browser: {str(e)}", tag="CDP") + await managed_browser.cleanup() + return None + finally: + # Restore original signal handlers + signal.signal(signal.SIGINT, original_sigint) + signal.signal(signal.SIGTERM, original_sigterm) + + # Make sure browser is fully cleaned up + await managed_browser.cleanup() + + # Return the CDP URL + return cdp_url + + diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py index cb6e706..659bf2b 100644 --- a/crawl4ai/cli.py +++ b/crawl4ai/cli.py @@ -1,5 +1,6 @@ import click import os +import sys import time import humanize @@ -198,7 +199,24 @@ def show_examples(): # 2. Then use that profile to crawl the authenticated site: crwl https://site-requiring-login.com/dashboard -p my-profile-name -5️⃣ Sample Config Files: +5️⃣ CDP Mode for Browser Automation: + # Launch browser with CDP debugging on default port 9222 + crwl cdp + + # Use a specific profile and custom port + crwl cdp -p my-profile -P 9223 + + # Launch headless browser with CDP enabled + crwl cdp --headless + + # Launch in incognito mode (ignores profile) + crwl cdp --incognito + + # Use the CDP URL with other tools (Puppeteer, Playwright, etc.) + # The URL will be displayed in the terminal when the browser starts + + +6️⃣ Sample Config Files: browser.yml: headless: true @@ -256,7 +274,7 @@ llm_schema.json: } } -6️⃣ Advanced Usage: +7️⃣ Advanced Usage: # Combine configs with direct parameters crwl https://example.com -B browser.yml -b "headless=false,viewport_width=1920" @@ -282,7 +300,7 @@ llm_schema.json: For more documentation visit: https://github.com/unclecode/crawl4ai -7️⃣ Q&A with LLM: +8️⃣ Q&A with LLM: # Ask a question about the content crwl https://example.com -q "What is the main topic discussed?" 
@@ -310,7 +328,7 @@ For more documentation visit: https://github.com/unclecode/crawl4ai See full list of providers: https://docs.litellm.ai/docs/providers -8️⃣ Profile Management: +9️⃣ Profile Management: # Launch interactive profile manager crwl profiles @@ -549,11 +567,89 @@ async def manage_profiles(): # Add a separator between operations console.print("\n") + + @click.group(context_settings={"help_option_names": ["-h", "--help"]}) def cli(): """Crawl4AI CLI - Web content extraction and browser profile management tool""" pass + +@cli.command("cdp") +@click.option("--user-data-dir", "-d", help="Directory to use for browser data (will be created if it doesn't exist)") +@click.option("--port", "-P", type=int, default=9222, help="Debugging port (default: 9222)") +@click.option("--browser-type", "-b", type=click.Choice(["chromium", "firefox"]), default="chromium", + help="Browser type (default: chromium)") +@click.option("--headless", is_flag=True, help="Run browser in headless mode") +@click.option("--incognito", is_flag=True, help="Run in incognito/private mode (ignores user-data-dir)") +def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless: bool, incognito: bool): + """Launch a standalone browser with CDP debugging enabled + + This command launches a browser with Chrome DevTools Protocol (CDP) debugging enabled, + prints the CDP URL, and keeps the browser running until you press 'q'. + + The CDP URL can be used for various automation and debugging tasks. + + Examples: + # Launch Chromium with CDP on default port 9222 + crwl cdp + + # Use a specific directory for browser data and custom port + crwl cdp --user-data-dir ~/browser-data --port 9223 + + # Launch in headless mode + crwl cdp --headless + + # Launch in incognito mode (ignores user-data-dir) + crwl cdp --incognito + """ + profiler = BrowserProfiler() + + try: + # Handle data directory + data_dir = None + if not incognito and user_data_dir: + # Expand user path (~/something) + expanded_path = os.path.expanduser(user_data_dir) + + # Create directory if it doesn't exist + if not os.path.exists(expanded_path): + console.print(f"[yellow]Directory '{expanded_path}' doesn't exist. 
Creating it.[/yellow]") + os.makedirs(expanded_path, exist_ok=True) + + data_dir = expanded_path + + # Print launch info + console.print(Panel( + f"[cyan]Launching browser with CDP debugging[/cyan]\n\n" + f"Browser type: [green]{browser_type}[/green]\n" + f"Debugging port: [yellow]{port}[/yellow]\n" + f"User data directory: [cyan]{data_dir or 'Temporary directory'}[/cyan]\n" + f"Headless: [cyan]{'Yes' if headless else 'No'}[/cyan]\n" + f"Incognito: [cyan]{'Yes' if incognito else 'No'}[/cyan]\n\n" + f"[yellow]Press 'q' to quit when done[/yellow]", + title="CDP Browser", + border_style="cyan" + )) + + # Run the browser + cdp_url = anyio.run( + profiler.launch_standalone_browser, + browser_type, + data_dir, + port, + headless + ) + + if not cdp_url: + console.print("[red]Failed to launch browser or get CDP URL[/red]") + sys.exit(1) + + except Exception as e: + console.print(f"[red]Error launching CDP browser: {str(e)}[/red]") + sys.exit(1) + + @cli.command("crawl") @click.argument("url", required=True) @click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)") @@ -737,6 +833,7 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f Other commands: crwl profiles - Manage browser profiles for identity-based crawling crwl crawl - Crawl a website with advanced options + crwl cdp - Launch browser with CDP debugging enabled crwl examples - Show more usage examples """ diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 3b70842..97512bf 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -1168,7 +1168,8 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy): super().__init__(schema, **kwargs) def _parse_html(self, html_content: str): - return BeautifulSoup(html_content, "html.parser") + # return BeautifulSoup(html_content, "html.parser") + return BeautifulSoup(html_content, "lxml") def _get_base_elements(self, parsed_html, selector: str): return parsed_html.select(selector) @@ -1187,6 +1188,373 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy): def _get_element_attribute(self, element, attribute: str): return element.get(attribute) +class JsonLxmlExtractionStrategy(JsonElementExtractionStrategy): + def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs["input_format"] = "html" + super().__init__(schema, **kwargs) + self._selector_cache = {} + self._xpath_cache = {} + self._result_cache = {} + + # Control selector optimization strategy + self.use_caching = kwargs.get("use_caching", True) + self.optimize_common_patterns = kwargs.get("optimize_common_patterns", True) + + # Load lxml dependencies once + from lxml import etree, html + from lxml.cssselect import CSSSelector + self.etree = etree + self.html_parser = html + self.CSSSelector = CSSSelector + + def _parse_html(self, html_content: str): + """Parse HTML content with error recovery""" + try: + parser = self.etree.HTMLParser(recover=True, remove_blank_text=True) + return self.etree.fromstring(html_content, parser) + except Exception as e: + if self.verbose: + print(f"Error parsing HTML, falling back to alternative method: {e}") + try: + return self.html_parser.fromstring(html_content) + except Exception as e2: + if self.verbose: + print(f"Critical error parsing HTML: {e2}") + # Create minimal document as fallback + return self.etree.Element("html") + + def _optimize_selector(self, selector_str): + """Optimize common selector patterns for better performance""" + if not 
self.optimize_common_patterns: + return selector_str + + # Handle td:nth-child(N) pattern which is very common in table scraping + import re + if re.search(r'td:nth-child\(\d+\)', selector_str): + return selector_str # Already handled specially in _apply_selector + + # Split complex selectors into parts for optimization + parts = selector_str.split() + if len(parts) <= 1: + return selector_str + + # For very long selectors, consider using just the last specific part + if len(parts) > 3 and any(p.startswith('.') or p.startswith('#') for p in parts): + specific_parts = [p for p in parts if p.startswith('.') or p.startswith('#')] + if specific_parts: + return specific_parts[-1] # Use most specific class/id selector + + return selector_str + + def _create_selector_function(self, selector_str): + """Create a selector function that handles all edge cases""" + original_selector = selector_str + + # Try to optimize the selector if appropriate + if self.optimize_common_patterns: + selector_str = self._optimize_selector(selector_str) + + try: + # Attempt to compile the CSS selector + compiled = self.CSSSelector(selector_str) + xpath = compiled.path + + # Store XPath for later use + self._xpath_cache[selector_str] = xpath + + # Create the wrapper function that implements the selection strategy + def selector_func(element, context_sensitive=True): + cache_key = None + + # Use result caching if enabled + if self.use_caching: + # Create a cache key based on element and selector + element_id = element.get('id', '') or str(hash(element)) + cache_key = f"{element_id}::{selector_str}" + + if cache_key in self._result_cache: + return self._result_cache[cache_key] + + results = [] + try: + # Strategy 1: Direct CSS selector application (fastest) + results = compiled(element) + + # If that fails and we need context sensitivity + if not results and context_sensitive: + # Strategy 2: Try XPath with context adjustment + context_xpath = self._make_context_sensitive_xpath(xpath, element) + if context_xpath: + results = element.xpath(context_xpath) + + # Strategy 3: Handle special case - nth-child + if not results and 'nth-child' in original_selector: + results = self._handle_nth_child_selector(element, original_selector) + + # Strategy 4: Direct descendant search for class/ID selectors + if not results: + results = self._fallback_class_id_search(element, original_selector) + + # Strategy 5: Last resort - tag name search for the final part + if not results: + parts = original_selector.split() + if parts: + last_part = parts[-1] + # Extract tag name from the selector + tag_match = re.match(r'^(\w+)', last_part) + if tag_match: + tag_name = tag_match.group(1) + results = element.xpath(f".//{tag_name}") + + # Cache results if caching is enabled + if self.use_caching and cache_key: + self._result_cache[cache_key] = results + + except Exception as e: + if self.verbose: + print(f"Error applying selector '{selector_str}': {e}") + + return results + + return selector_func + + except Exception as e: + if self.verbose: + print(f"Error compiling selector '{selector_str}': {e}") + + # Fallback function for invalid selectors + return lambda element, context_sensitive=True: [] + + def _make_context_sensitive_xpath(self, xpath, element): + """Convert absolute XPath to context-sensitive XPath""" + try: + # If starts with descendant-or-self, it's already context-sensitive + if xpath.startswith('descendant-or-self::'): + return xpath + + # Remove leading slash if present + if xpath.startswith('/'): + context_xpath = f".{xpath}" + 
else: + context_xpath = f".//{xpath}" + + # Validate the XPath by trying it + try: + element.xpath(context_xpath) + return context_xpath + except: + # If that fails, try a simpler descendant search + return f".//{xpath.split('/')[-1]}" + except: + return None + + def _handle_nth_child_selector(self, element, selector_str): + """Special handling for nth-child selectors in tables""" + import re + results = [] + + try: + # Extract the column number from td:nth-child(N) + match = re.search(r'td:nth-child\((\d+)\)', selector_str) + if match: + col_num = match.group(1) + + # Check if there's content after the nth-child part + remaining_selector = selector_str.split(f"td:nth-child({col_num})", 1)[-1].strip() + + if remaining_selector: + # If there's a specific element we're looking for after the column + # Extract any tag names from the remaining selector + tag_match = re.search(r'(\w+)', remaining_selector) + tag_name = tag_match.group(1) if tag_match else '*' + results = element.xpath(f".//td[{col_num}]//{tag_name}") + else: + # Just get the column cell + results = element.xpath(f".//td[{col_num}]") + except Exception as e: + if self.verbose: + print(f"Error handling nth-child selector: {e}") + + return results + + def _fallback_class_id_search(self, element, selector_str): + """Fallback to search by class or ID""" + results = [] + + try: + # Extract class selectors (.classname) + import re + class_matches = re.findall(r'\.([a-zA-Z0-9_-]+)', selector_str) + + # Extract ID selectors (#idname) + id_matches = re.findall(r'#([a-zA-Z0-9_-]+)', selector_str) + + # Try each class + for class_name in class_matches: + class_results = element.xpath(f".//*[contains(@class, '{class_name}')]") + results.extend(class_results) + + # Try each ID (usually more specific) + for id_name in id_matches: + id_results = element.xpath(f".//*[@id='{id_name}']") + results.extend(id_results) + except Exception as e: + if self.verbose: + print(f"Error in fallback class/id search: {e}") + + return results + + def _get_selector(self, selector_str): + """Get or create a selector function with caching""" + if selector_str not in self._selector_cache: + self._selector_cache[selector_str] = self._create_selector_function(selector_str) + return self._selector_cache[selector_str] + + def _get_base_elements(self, parsed_html, selector: str): + """Get all base elements using the selector""" + selector_func = self._get_selector(selector) + # For base elements, we don't need context sensitivity + return selector_func(parsed_html, context_sensitive=False) + + def _get_elements(self, element, selector: str): + """Get child elements using the selector with context sensitivity""" + selector_func = self._get_selector(selector) + return selector_func(element, context_sensitive=True) + + def _get_element_text(self, element) -> str: + """Extract normalized text from element""" + try: + # Get all text nodes and normalize + text = " ".join(t.strip() for t in element.xpath(".//text()") if t.strip()) + return text + except Exception as e: + if self.verbose: + print(f"Error extracting text: {e}") + # Fallback + try: + return element.text_content().strip() + except: + return "" + + def _get_element_html(self, element) -> str: + """Get HTML string representation of element""" + try: + return self.etree.tostring(element, encoding='unicode', method='html') + except Exception as e: + if self.verbose: + print(f"Error serializing HTML: {e}") + return "" + + def _get_element_attribute(self, element, attribute: str): + """Get attribute value safely""" + 
try: + return element.get(attribute) + except Exception as e: + if self.verbose: + print(f"Error getting attribute '{attribute}': {e}") + return None + + def _clear_caches(self): + """Clear caches to free memory""" + if self.use_caching: + self._result_cache.clear() + +class JsonLxmlExtractionStrategy_naive(JsonElementExtractionStrategy): + def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs["input_format"] = "html" # Force HTML input + super().__init__(schema, **kwargs) + self._selector_cache = {} + + def _parse_html(self, html_content: str): + from lxml import etree + parser = etree.HTMLParser(recover=True) + return etree.fromstring(html_content, parser) + + def _get_selector(self, selector_str): + """Get a selector function that works within the context of an element""" + if selector_str not in self._selector_cache: + from lxml.cssselect import CSSSelector + try: + # Store both the compiled selector and its xpath translation + compiled = CSSSelector(selector_str) + + # Create a function that will apply this selector appropriately + def select_func(element): + try: + # First attempt: direct CSS selector application + results = compiled(element) + if results: + return results + + # Second attempt: contextual XPath selection + # Convert the root-based XPath to a context-based XPath + xpath = compiled.path + + # If the XPath already starts with descendant-or-self, handle it specially + if xpath.startswith('descendant-or-self::'): + context_xpath = xpath + else: + # For normal XPath expressions, make them relative to current context + context_xpath = f"./{xpath.lstrip('/')}" + + results = element.xpath(context_xpath) + if results: + return results + + # Final fallback: simple descendant search for common patterns + if 'nth-child' in selector_str: + # Handle td:nth-child(N) pattern + import re + match = re.search(r'td:nth-child\((\d+)\)', selector_str) + if match: + col_num = match.group(1) + sub_selector = selector_str.split(')', 1)[-1].strip() + if sub_selector: + return element.xpath(f".//td[{col_num}]//{sub_selector}") + else: + return element.xpath(f".//td[{col_num}]") + + # Last resort: try each part of the selector separately + parts = selector_str.split() + if len(parts) > 1 and parts[-1]: + return element.xpath(f".//{parts[-1]}") + + return [] + except Exception as e: + if self.verbose: + print(f"Error applying selector '{selector_str}': {e}") + return [] + + self._selector_cache[selector_str] = select_func + except Exception as e: + if self.verbose: + print(f"Error compiling selector '{selector_str}': {e}") + + # Fallback function for invalid selectors + def fallback_func(element): + return [] + + self._selector_cache[selector_str] = fallback_func + + return self._selector_cache[selector_str] + + def _get_base_elements(self, parsed_html, selector: str): + selector_func = self._get_selector(selector) + return selector_func(parsed_html) + + def _get_elements(self, element, selector: str): + selector_func = self._get_selector(selector) + return selector_func(element) + + def _get_element_text(self, element) -> str: + return "".join(element.xpath(".//text()")).strip() + + def _get_element_html(self, element) -> str: + from lxml import etree + return etree.tostring(element, encoding='unicode') + + def _get_element_attribute(self, element, attribute: str): + return element.get(attribute) class JsonXPathExtractionStrategy(JsonElementExtractionStrategy): """ diff --git a/docs/examples/extraction_strategies_examples.py b/docs/examples/extraction_strategies_examples.py index 
66d60cc..84192f9 100644 --- a/docs/examples/extraction_strategies_examples.py +++ b/docs/examples/extraction_strategies_examples.py @@ -11,7 +11,7 @@ import asyncio import os from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai.types import LLMConfig +from crawl4ai import LLMConfig from crawl4ai.extraction_strategy import ( LLMExtractionStrategy, JsonCssExtractionStrategy, diff --git a/docs/examples/llm_extraction_openai_pricing.py b/docs/examples/llm_extraction_openai_pricing.py index 27304a9..27a1c31 100644 --- a/docs/examples/llm_extraction_openai_pricing.py +++ b/docs/examples/llm_extraction_openai_pricing.py @@ -1,4 +1,4 @@ -from crawl4ai.types import LLMConfig +from crawl4ai import LLMConfig from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy import asyncio import os diff --git a/docs/examples/llm_markdown_generator.py b/docs/examples/llm_markdown_generator.py index 7ff58d4..777c59b 100644 --- a/docs/examples/llm_markdown_generator.py +++ b/docs/examples/llm_markdown_generator.py @@ -1,7 +1,7 @@ import os import asyncio from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai.types import LLMConfig +from crawl4ai import LLMConfig from crawl4ai.content_filter_strategy import LLMContentFilter async def test_llm_filter(): diff --git a/docs/examples/quickstart_async.config.py b/docs/examples/quickstart_async.config.py index 3adbfc0..5efb785 100644 --- a/docs/examples/quickstart_async.config.py +++ b/docs/examples/quickstart_async.config.py @@ -1,6 +1,6 @@ import os, sys -from crawl4ai.types import LLMConfig +from crawl4ai import LLMConfig sys.path.append( os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 22d2b06..aeb0d20 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -1,6 +1,6 @@ import os, sys -from crawl4ai.types import LLMConfig +from crawl4ai import LLMConfig # append parent directory to system path sys.path.append( diff --git a/docs/examples/quickstart_sync.py b/docs/examples/quickstart_sync.py index eabd574..78f3e56 100644 --- a/docs/examples/quickstart_sync.py +++ b/docs/examples/quickstart_sync.py @@ -1,6 +1,6 @@ import os import time -from crawl4ai.types import LLMConfig +from crawl4ai import LLMConfig from crawl4ai.web_crawler import WebCrawler from crawl4ai.chunking_strategy import * from crawl4ai.extraction_strategy import * diff --git a/docs/examples/tutorial_v0.5.py b/docs/examples/tutorial_v0.5.py index 47b0242..d8e01e6 100644 --- a/docs/examples/tutorial_v0.5.py +++ b/docs/examples/tutorial_v0.5.py @@ -17,7 +17,7 @@ from crawl4ai.configs import ProxyConfig from crawl4ai import RoundRobinProxyStrategy from crawl4ai.content_filter_strategy import LLMContentFilter from crawl4ai import DefaultMarkdownGenerator -from crawl4ai.types import LLMConfig +from crawl4ai import LLMConfig from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy from pprint import pprint diff --git a/docs/md_v2/api/strategies.md b/docs/md_v2/api/strategies.md index a54fc63..45d4495 100644 --- a/docs/md_v2/api/strategies.md +++ b/docs/md_v2/api/strategies.md @@ -131,7 +131,7 @@ OverlappingWindowChunking( ```python from pydantic import BaseModel from crawl4ai.extraction_strategy import LLMExtractionStrategy -from crawl4ai.types import LLMConfig +from crawl4ai import 
LLMConfig # Define schema class Article(BaseModel): @@ -198,7 +198,7 @@ result = await crawler.arun( ```python from crawl4ai.chunking_strategy import OverlappingWindowChunking -from crawl4ai.types import LLMConfig +from crawl4ai import LLMConfig # Create chunking strategy chunker = OverlappingWindowChunking( diff --git a/docs/md_v2/blog/releases/0.5.0.md b/docs/md_v2/blog/releases/0.5.0.md index 40c164e..7f38bf2 100644 --- a/docs/md_v2/blog/releases/0.5.0.md +++ b/docs/md_v2/blog/releases/0.5.0.md @@ -305,7 +305,7 @@ asyncio.run(main()) ```python from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator from crawl4ai.content_filter_strategy import LLMContentFilter -from crawl4ai.types import LLMConfig +from crawl4ai import LLMConfig import asyncio llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY") @@ -335,7 +335,7 @@ asyncio.run(main()) ```python from crawl4ai.extraction_strategy import JsonCssExtractionStrategy -from crawl4ai.types import LLMConfig +from crawl4ai import LLMConfig llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY") @@ -401,7 +401,7 @@ print(schema) experimentation between different LLM configurations. ```python - from crawl4ai.types import LLMConfig + from crawl4ai import LLMConfig from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai import AsyncWebCrawler, CrawlerRunConfig diff --git a/docs/md_v2/core/quickstart.md b/docs/md_v2/core/quickstart.md index 5a44edd..de0b7e5 100644 --- a/docs/md_v2/core/quickstart.md +++ b/docs/md_v2/core/quickstart.md @@ -128,7 +128,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B ```python from crawl4ai.extraction_strategy import JsonCssExtractionStrategy -from crawl4ai.types import LLMConfig +from crawl4ai import LLMConfig # Generate a schema (one-time cost) html = "

<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</span></div>
" diff --git a/docs/md_v2/extraction/no-llm-strategies.md b/docs/md_v2/extraction/no-llm-strategies.md index 827f66b..b216c0e 100644 --- a/docs/md_v2/extraction/no-llm-strategies.md +++ b/docs/md_v2/extraction/no-llm-strategies.md @@ -415,7 +415,7 @@ The schema generator is available as a static method on both `JsonCssExtractionS ```python from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy -from crawl4ai.types import LLMConfig +from crawl4ai import LLMConfig # Sample HTML with product information html = """ diff --git a/tests/20241401/test_llm_filter.py b/tests/20241401/test_llm_filter.py index e2c61a5..6211c42 100644 --- a/tests/20241401/test_llm_filter.py +++ b/tests/20241401/test_llm_filter.py @@ -1,7 +1,7 @@ import os import asyncio from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai.types import LLMConfig +from crawl4ai import LLMConfig from crawl4ai.content_filter_strategy import LLMContentFilter async def test_llm_filter(): diff --git a/tests/async/test_chunking_and_extraction_strategies.py b/tests/async/test_chunking_and_extraction_strategies.py index c18d723..90e17a9 100644 --- a/tests/async/test_chunking_and_extraction_strategies.py +++ b/tests/async/test_chunking_and_extraction_strategies.py @@ -7,7 +7,7 @@ import json parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.append(parent_dir) -from crawl4ai.types import LLMConfig +from crawl4ai import LLMConfig from crawl4ai.async_webcrawler import AsyncWebCrawler from crawl4ai.chunking_strategy import RegexChunking from crawl4ai.extraction_strategy import LLMExtractionStrategy diff --git a/tests/browser/test_launch_standalone.py b/tests/browser/test_launch_standalone.py new file mode 100644 index 0000000..d60b12f --- /dev/null +++ b/tests/browser/test_launch_standalone.py @@ -0,0 +1,17 @@ +from crawl4ai.browser_profiler import BrowserProfiler +import asyncio + + +if __name__ == "__main__": + # Test launching a standalone browser + async def test_standalone_browser(): + profiler = BrowserProfiler() + cdp_url = await profiler.launch_standalone_browser( + browser_type="chromium", + user_data_dir="~/.crawl4ai/browser_profile/test-browser-data", + debugging_port=9222, + headless=False + ) + print(f"CDP URL: {cdp_url}") + + asyncio.run(test_standalone_browser()) \ No newline at end of file diff --git a/tests/docker/test_docker.py b/tests/docker/test_docker.py index 83857de..cf95671 100644 --- a/tests/docker/test_docker.py +++ b/tests/docker/test_docker.py @@ -7,7 +7,7 @@ from crawl4ai import ( BrowserConfig, CrawlerRunConfig, DefaultMarkdownGenerator, PruningContentFilter, JsonCssExtractionStrategy, LLMContentFilter, CacheMode ) -from crawl4ai.types import LLMConfig +from crawl4ai import LLMConfig from crawl4ai.docker_client import Crawl4aiDockerClient class Crawl4AiTester: diff --git a/tests/docker/test_serialization.py b/tests/docker/test_serialization.py index 40df96d..6ce8000 100644 --- a/tests/docker/test_serialization.py +++ b/tests/docker/test_serialization.py @@ -2,7 +2,7 @@ import inspect from typing import Any, Dict from enum import Enum -from crawl4ai.types import LLMConfig +from crawl4ai import LLMConfig def to_serializable_dict(obj: Any) -> Dict: """ diff --git a/tests/test_web_crawler.py b/tests/test_web_crawler.py index 9bdb457..b845319 100644 --- a/tests/test_web_crawler.py +++ b/tests/test_web_crawler.py @@ -1,5 +1,5 @@ import unittest, os -from crawl4ai.types import LLMConfig +from crawl4ai import 
LLMConfig from crawl4ai.web_crawler import WebCrawler from crawl4ai.chunking_strategy import ( RegexChunking,
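
---

For reviewers, a minimal usage sketch of the two new pieces working together (not part of the patch itself): it assumes the CDP URL printed in the terminal by `crwl cdp` (default port 9222), and the product URL and extraction schema below are made up for illustration. Connecting through `BrowserConfig(cdp_url=...)` is the path exercised by the `browser_manager.py` changes above; `JsonLxmlExtractionStrategy` accepts the same schema shape as `JsonCssExtractionStrategy`.

```python
# Illustrative sketch, not a definitive example from the patch.
import asyncio
from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    JsonLxmlExtractionStrategy,  # newly exported in crawl4ai/__init__.py
)

async def main():
    # CDP URL printed by `crwl cdp`; assumed to be reachable on the default port.
    browser_config = BrowserConfig(cdp_url="http://localhost:9222")

    # Hypothetical schema for a product listing page.
    schema = {
        "name": "Products",
        "baseSelector": "div.product",
        "fields": [
            {"name": "title", "selector": "h2", "type": "text"},
            {"name": "price", "selector": "span.price", "type": "text"},
        ],
    }
    # lxml-backed strategy with selector/result caching enabled (the default).
    strategy = JsonLxmlExtractionStrategy(schema, use_caching=True)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            "https://example.com/products",  # hypothetical URL
            config=CrawlerRunConfig(extraction_strategy=strategy),
        )
        print(result.extracted_content)

asyncio.run(main())
```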