Fixed capturing console messages when the URL is a local file. Updated Docker configuration (work in progress)

UncleCode 2025-04-10 23:22:38 +08:00
parent 66ac07b4f3
commit 108b2a8bfb
9 changed files with 898 additions and 664 deletions

View File

@@ -24,7 +24,7 @@ ARG TARGETARCH
LABEL maintainer="unclecode"
LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
LABEL version="1.0"
LABEL version="1.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
@@ -38,6 +38,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libjpeg-dev \
redis-server \
supervisor \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -62,11 +63,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libcairo2 \
libasound2 \
libatspi2.0-0 \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
apt-get update && apt-get install -y --no-install-recommends \
nvidia-cuda-toolkit \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* ; \
else \
echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
@@ -76,16 +79,24 @@ RUN if [ "$TARGETARCH" = "arm64" ]; then \
echo "🦾 Installing ARM-specific optimizations"; \
apt-get update && apt-get install -y --no-install-recommends \
libopenblas-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*; \
elif [ "$TARGETARCH" = "amd64" ]; then \
echo "🖥️ Installing AMD64-specific optimizations"; \
apt-get update && apt-get install -y --no-install-recommends \
libomp-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*; \
else \
echo "Skipping platform-specific optimizations (unsupported platform)"; \
fi
# Create a non-root user and group
RUN groupadd -r appuser && useradd --no-log-init -r -g appuser appuser
# Create and set permissions for appuser home directory
RUN mkdir -p /home/appuser && chown -R appuser:appuser /home/appuser
WORKDIR ${APP_HOME}
RUN echo '#!/bin/bash\n\
@@ -103,6 +114,7 @@ fi' > /tmp/install.sh && chmod +x /tmp/install.sh
COPY . /tmp/project/
# Copy supervisor config first (might need root later, but okay for now)
COPY deploy/docker/supervisord.conf .
COPY deploy/docker/requirements.txt .
@@ -131,16 +143,23 @@ RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
else \
pip install "/tmp/project" ; \
fi
RUN pip install --no-cache-dir --upgrade pip && \
/tmp/install.sh && \
python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \
python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')"
RUN playwright install --with-deps chromium
# Copy application code
COPY deploy/docker/* ${APP_HOME}/
# Change ownership of the application directory to the non-root user
RUN chown -R appuser:appuser ${APP_HOME}
# give permissions to redis persistence dirs if used
RUN mkdir -p /var/lib/redis /var/log/redis && chown -R appuser:appuser /var/lib/redis /var/log/redis
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD bash -c '\
MEM=$(free -m | awk "/^Mem:/{print \$2}"); \
@@ -149,8 +168,10 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
exit 1; \
fi && \
redis-cli ping > /dev/null && \
curl -f http://localhost:8000/health || exit 1'
curl -f http://localhost:11235/health || exit 1'
EXPOSE 6379
CMD ["supervisord", "-c", "supervisord.conf"]
# Switch to the non-root user before starting the application
USER appuser
CMD ["supervisord", "-c", "supervisord.conf"]

View File

@@ -409,7 +409,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
user_agent = kwargs.get("user_agent", self.user_agent)
# Use browser_manager to get a fresh page & context assigned to this session_id
page, context = await self.browser_manager.get_page(session_id, user_agent)
page, context = await self.browser_manager.get_page(CrawlerRunConfig(
session_id=session_id,
user_agent=user_agent,
**kwargs,
))
return session_id
async def crawl(
@@ -447,12 +451,17 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
html = f.read()
if config.screenshot:
screenshot_data = await self._generate_screenshot_from_html(html)
if config.capture_console_messages:
page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
captured_console = await self._capture_console_messages(page, url)
return AsyncCrawlResponse(
html=html,
response_headers=response_headers,
status_code=status_code,
screenshot=screenshot_data,
get_delayed_content=None,
console_messages=captured_console,
)
elif url.startswith("raw:") or url.startswith("raw://"):
@@ -582,7 +591,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"url": request.url,
"method": request.method,
"resource_type": request.resource_type,
"failure_text": request.failure.error_text if request.failure else "Unknown failure",
"failure_text": str(request.failure) if request.failure else "Unknown failure",
"timestamp": time.time()
})
except Exception as e:
@@ -1274,6 +1283,42 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
)
return None
async def _capture_console_messages(
self, page: Page, file_path: str
) -> List[Dict[str, Union[str, float]]]:
"""
Captures console messages from the page.
Args:
page (Page): The Playwright page object
Returns:
List[Dict[str, Union[str, float]]]: A list of captured console messages
"""
captured_console = []
def handle_console_message(msg):
try:
message_type = msg.type
message_text = msg.text
entry = {
"type": message_type,
"text": message_text,
"timestamp": time.time(),
}
captured_console.append(entry)
except Exception as e:
if self.logger:
self.logger.warning(
f"Error capturing console message: {e}", tag="CAPTURE"
)
page.on("console", handle_console_message)
await page.goto(file_path)
return captured_console
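For context, a minimal usage sketch of what this new method enables: capturing console messages while crawling a local file URL. It assumes the public AsyncWebCrawler / CrawlerRunConfig API together with the capture_console_messages flag and console_messages field shown in this diff; the file path is hypothetical and only for illustration.

import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def main():
    # capture_console_messages routes local file URLs through the branch added above
    config = CrawlerRunConfig(capture_console_messages=True)
    async with AsyncWebCrawler() as crawler:
        # hypothetical local file for illustration
        result = await crawler.arun(url="file:///tmp/page.html", config=config)
        for msg in result.console_messages or []:
            print(msg["type"], msg["text"], msg["timestamp"])

asyncio.run(main())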
async def take_screenshot(self, page, **kwargs) -> str:
"""
Take a screenshot of the current page.

View File

@@ -658,7 +658,7 @@ class BrowserManager:
"name": "cookiesEnabled",
"value": "true",
"url": crawlerRunConfig.url
if crawlerRunConfig
if crawlerRunConfig and crawlerRunConfig.url
else "https://crawl4ai.com/",
}
]

View File

@@ -1,4 +1,3 @@
crawl4ai
fastapi
uvicorn
gunicorn>=23.0.0

View File

@@ -1,12 +1,28 @@
[supervisord]
nodaemon=true
nodaemon=true ; Run supervisord in the foreground
logfile=/dev/null ; Log supervisord output to stdout/stderr
logfile_maxbytes=0
[program:redis]
command=redis-server
command=/usr/bin/redis-server --loglevel notice ; Path to redis-server on Alpine
user=appuser ; Run redis as our non-root user
autorestart=true
priority=10
stdout_logfile=/dev/stdout ; Redirect redis stdout to container stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr ; Redirect redis stderr to container stderr
stderr_logfile_maxbytes=0
[program:gunicorn]
command=gunicorn --bind 0.0.0.0:8000 --workers 4 --threads 2 --timeout 300 --graceful-timeout 60 --keep-alive 65 --log-level debug --worker-class uvicorn.workers.UvicornWorker --max-requests 1000 --max-requests-jitter 50 server:app
command=/usr/local/bin/gunicorn --bind 0.0.0.0:11235 --workers 2 --threads 2 --timeout 120 --graceful-timeout 30 --keep-alive 60 --log-level info --worker-class uvicorn.workers.UvicornWorker server:app
directory=/app ; Working directory for the app
user=appuser ; Run gunicorn as our non-root user
autorestart=true
priority=20
priority=20
environment=PYTHONUNBUFFERED=1 ; Ensure Python output is sent straight to logs
stdout_logfile=/dev/stdout ; Redirect gunicorn stdout to container stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr ; Redirect gunicorn stderr to container stderr
stderr_logfile_maxbytes=0
# Optional: Add filebeat or other logging agents here if needed
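The gunicorn line above now serves server:app on port 11235 with uvicorn workers. server.py itself is not part of this diff; the following is a minimal stand-in, assuming FastAPI (listed in requirements.txt), that would satisfy the /health probe used by the HEALTHCHECK and the compose healthcheck.

from fastapi import FastAPI

app = FastAPI()

@app.get("/health")
async def health():
    # Same path the Dockerfile HEALTHCHECK and docker-compose healthcheck curl.
    return {"status": "ok"}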

View File

@@ -1,15 +1,31 @@
# Base configuration (not a service, just a reusable config block)
# docker-compose.yml
# This file is in the root directory alongside Dockerfile
# Base configuration anchor for reusability
x-base-config: &base-config
ports:
# Map host port 11235 to container port 11235 (where Gunicorn will listen)
- "11235:11235"
- "8000:8000"
- "9222:9222"
- "8080:8080"
# - "8080:8080" # Uncomment if needed
# Load API keys primarily from .llm.env file
# Create .llm.env in the root directory from deploy/docker/.llm.env.example
env_file:
- .llm.env
# Define environment variables, allowing overrides from host environment
# Syntax ${VAR:-} uses host env var 'VAR' if set, otherwise uses value from .llm.env
environment:
- CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}
- OPENAI_API_KEY=${OPENAI_API_KEY:-}
- CLAUDE_API_KEY=${CLAUDE_API_KEY:-}
- DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
- GROQ_API_KEY=${GROQ_API_KEY:-}
- TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
- MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
- GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
volumes:
# Mount /dev/shm for Chromium/Playwright performance
- /dev/shm:/dev/shm
deploy:
resources:
@@ -19,47 +35,47 @@ x-base-config: &base-config
memory: 1G
restart: unless-stopped
healthcheck:
# IMPORTANT: Ensure Gunicorn binds to 11235 in supervisord.conf
test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 40s
start_period: 40s # Give the server time to start
# Run the container as the non-root user defined in the Dockerfile
user: "appuser"
services:
# Local build services for different platforms
crawl4ai-amd64:
# --- Local Build Services ---
crawl4ai-local-amd64:
build:
context: .
dockerfile: Dockerfile
context: . # Build context is the root directory
dockerfile: Dockerfile # Dockerfile is in the root directory
args:
PYTHON_VERSION: "3.10"
INSTALL_TYPE: ${INSTALL_TYPE:-basic}
ENABLE_GPU: false
platforms:
- linux/amd64
INSTALL_TYPE: ${INSTALL_TYPE:-default}
ENABLE_GPU: ${ENABLE_GPU:-false}
# PYTHON_VERSION arg is omitted as it's fixed by 'FROM python:3.10-slim' in Dockerfile
platform: linux/amd64
profiles: ["local-amd64"]
<<: *base-config # include the configuration directly instead of using 'extends'
<<: *base-config # Inherit base configuration
crawl4ai-arm64:
crawl4ai-local-arm64:
build:
context: .
dockerfile: Dockerfile
context: . # Build context is the root directory
dockerfile: Dockerfile # Dockerfile is in the root directory
args:
PYTHON_VERSION: "3.10"
INSTALL_TYPE: ${INSTALL_TYPE:-basic}
ENABLE_GPU: false
platforms:
- linux/arm64
INSTALL_TYPE: ${INSTALL_TYPE:-default}
ENABLE_GPU: ${ENABLE_GPU:-false}
platform: linux/arm64
profiles: ["local-arm64"]
<<: *base-config
# Hub services for different platforms and versions
# --- Docker Hub Image Services ---
crawl4ai-hub-amd64:
image: unclecode/crawl4ai:${VERSION:-basic}-amd64
image: unclecode/crawl4ai:${VERSION:-latest}-amd64
profiles: ["hub-amd64"]
<<: *base-config
crawl4ai-hub-arm64:
image: unclecode/crawl4ai:${VERSION:-basic}-arm64
image: unclecode/crawl4ai:${VERSION:-latest}-arm64
profiles: ["hub-arm64"]
<<: *base-config
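With one of these profiles running (for example via `docker compose --profile local-amd64 up --build`), the service should answer on the host through the 11235:11235 mapping above. A minimal sketch for probing the same /health endpoint from the host, assuming the default port mapping:

import urllib.request

# Probe the endpoint the container healthchecks also hit, but from the host side.
with urllib.request.urlopen("http://localhost:11235/health", timeout=10) as resp:
    print(resp.status, resp.read().decode())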

View File

@@ -357,8 +357,7 @@ async def demo_performance_analysis():
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
capture_network_requests=True,
wait_until="networkidle",
page_timeout=60000 # 60 seconds
page_timeout=60 * 2 * 1000 # 120 seconds
)
result = await crawler.arun(
@@ -406,6 +405,13 @@ async def demo_performance_analysis():
"url": url,
"duration_ms": duration
})
if isinstance(timing, dict) and "requestStart" in timing and "responseStart" in timing and "startTime" in timing:
# Convert to milliseconds
duration = (timing["responseStart"] - timing["requestStart"]) * 1000
resource_timings[resource_type].append({
"url": url,
"duration_ms": duration
})
# Calculate statistics for each resource type
print("\nPerformance by resource type:")
@@ -455,14 +461,14 @@ async def main():
os.makedirs(os.path.join(__cur_dir__, "tmp"), exist_ok=True)
# Run basic examples
await demo_basic_network_capture()
# await demo_basic_network_capture()
await demo_basic_console_capture()
await demo_combined_capture()
# await demo_combined_capture()
# Run advanced examples
await analyze_spa_network_traffic()
await demo_security_analysis()
await demo_performance_analysis()
# await analyze_spa_network_traffic()
# await demo_security_analysis()
# await demo_performance_analysis()
print("\n=== Examples Complete ===")
print(f"Check the tmp directory for output files: {os.path.join(__cur_dir__, 'tmp')}")

File diff suppressed because it is too large

View File