import os
from typing import Dict, List, Optional
from urllib.parse import urljoin
import html2text
import httpx
from autogen_core.code_executor import ImportFromModule
from autogen_core.tools import FunctionTool
from bs4 import BeautifulSoup

async def google_search(
    query: str,
    num_results: int = 5,
    include_snippets: bool = True,
    include_content: bool = True,
    content_max_length: Optional[int] = 15000,
    language: str = "en",
    country: Optional[str] = None,
    safe_search: bool = True,
) -> List[Dict[str, str]]:
    """
    Perform a Google search using the Custom Search API and optionally fetch webpage content.

    Args:
        query: Search query string
        num_results: Number of results to return (max 10)
        include_snippets: Include result snippets in output
        include_content: Include full webpage content in markdown format
        content_max_length: Maximum length of webpage content (if included)
        language: Language code for search results (e.g., en, es, fr)
        country: Optional country code for search results (e.g., us, uk)
        safe_search: Enable safe search filtering

    Returns:
        List[Dict[str, str]]: List of search results, each containing:
            - title: Result title
            - link: Result URL
            - snippet: Result description (if include_snippets=True)
            - content: Webpage content in markdown (if include_content=True)
    """
    api_key = os.getenv("GOOGLE_API_KEY")
    cse_id = os.getenv("GOOGLE_CSE_ID")

    if not api_key or not cse_id:
        raise ValueError("Missing required environment variables. Please set GOOGLE_API_KEY and GOOGLE_CSE_ID.")

    num_results = min(max(1, num_results), 10)

    async def fetch_page_content(url: str, max_length: Optional[int] = 50000) -> str:
        """Helper function to fetch and convert webpage content to markdown"""
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}

        try:
            async with httpx.AsyncClient() as client:
                response = await client.get(url, headers=headers, timeout=10)
                response.raise_for_status()

                soup = BeautifulSoup(response.text, "html.parser")

                # Remove script and style elements
                for script in soup(["script", "style"]):
                    script.decompose()

                # Convert relative URLs to absolute
                for tag in soup.find_all(["a", "img"]):
                    if tag.get("href"):
                        tag["href"] = urljoin(url, tag["href"])
                    if tag.get("src"):
                        tag["src"] = urljoin(url, tag["src"])

                h2t = html2text.HTML2Text()
                h2t.body_width = 0
                h2t.ignore_images = False
                h2t.ignore_emphasis = False
                h2t.ignore_links = False
                h2t.ignore_tables = False

                markdown = h2t.handle(str(soup))

                if max_length and len(markdown) > max_length:
                    markdown = markdown[:max_length] + "\n...(truncated)"

                return markdown.strip()

        except Exception as e:
            return f"Error fetching content: {str(e)}"

    params = {
        "key": api_key,
        "cx": cse_id,
        "q": query,
        "num": num_results,
        "hl": language,
        "safe": "active" if safe_search else "off",
    }

    if country:
        params["gl"] = country

    try:
        async with httpx.AsyncClient() as client:
            response = await client.get("https://www.googleapis.com/customsearch/v1", params=params, timeout=10)
            response.raise_for_status()
            data = response.json()

            results = []
            if "items" in data:
                for item in data["items"]:
                    result = {"title": item.get("title", ""), "link": item.get("link", "")}
                    if include_snippets:
                        result["snippet"] = item.get("snippet", "")

                    if include_content:
                        result["content"] = await fetch_page_content(result["link"], max_length=content_max_length)

                    results.append(result)
            return results

    except httpx.RequestError as e:
        raise ValueError(f"Failed to perform search: {str(e)}") from e
    except KeyError as e:
        raise ValueError(f"Invalid API response format: {str(e)}") from e
    except Exception as e:
        raise ValueError(f"Error during search: {str(e)}") from e


# Create the enhanced Google search tool
google_search_tool = FunctionTool(
    func=google_search,
    description="""
    Perform Google searches using the Custom Search API with optional webpage content fetching.
    Requires GOOGLE_API_KEY and GOOGLE_CSE_ID environment variables to be set.
    """,
    global_imports=[
        ImportFromModule("typing", ("List", "Dict", "Optional")),
        "os",
        "httpx",
        "html2text",
        ImportFromModule("bs4", ("BeautifulSoup",)),
        ImportFromModule("urllib.parse", ("urljoin",)),
    ],
)
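

# ---------------------------------------------------------------------------
# Example usage: a minimal sketch, not part of the tool definition above.
# It assumes GOOGLE_API_KEY and GOOGLE_CSE_ID are set in the environment and
# calls the coroutine directly; wiring the tool into an agent (e.g. passing
# tools=[google_search_tool] to an autogen_agentchat AssistantAgent) follows
# the same pattern but is omitted here. The query string and the `_demo`
# helper are illustrative placeholders.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        # Fetch three results; skip full page content to keep the output short.
        results = await google_search("autogen framework", num_results=3, include_content=False)
        for result in results:
            print(result["title"], "->", result["link"])

    asyncio.run(_demo())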