crawl4ai/tests/general/generate_dummy_site.py
UncleCode 230f22da86 refactor(proxy): move ProxyConfig to async_configs and improve LLM token handling
Moved ProxyConfig class from proxy_strategy.py to async_configs.py for better organization.
Improved LLM token handling with new PROVIDER_MODELS_PREFIXES.
Added test cases for deep crawling and proxy rotation.
Removed docker_config from BrowserConfig as it's handled separately.

BREAKING CHANGE: ProxyConfig import path changed from crawl4ai.proxy_strategy to crawl4ai
2025-04-15 22:27:18 +08:00

335 lines
13 KiB
Python

# ==== File: build_dummy_site.py ====
import os
import random
import argparse
from pathlib import Path
from urllib.parse import quote
# --- Configuration ---
NUM_CATEGORIES = 3
NUM_SUBCATEGORIES_PER_CAT = 2 # Results in NUM_CATEGORIES * NUM_SUBCATEGORIES_PER_CAT total L2 categories
NUM_PRODUCTS_PER_SUBCAT = 5 # Products listed on L3 pages
MAX_DEPTH_TARGET = 5 # Explicitly set target depth
# --- Helper Functions ---
def generate_lorem(words=20):
"""Generates simple placeholder text."""
lorem_words = ["lorem", "ipsum", "dolor", "sit", "amet", "consectetur",
"adipiscing", "elit", "sed", "do", "eiusmod", "tempor",
"incididunt", "ut", "labore", "et", "dolore", "magna", "aliqua"]
return " ".join(random.choice(lorem_words) for _ in range(words)).capitalize() + "."
def create_html_page(filepath: Path, title: str, body_content: str, breadcrumbs: list = [], head_extras: str = ""):
"""Creates an HTML file with basic structure and inline CSS."""
os.makedirs(filepath.parent, exist_ok=True)
# Generate breadcrumb HTML using the 'link' provided in the breadcrumbs list
breadcrumb_html = ""
if breadcrumbs:
links_html = " » ".join(f'<a href="{bc["link"]}">{bc["name"]}</a>' for bc in breadcrumbs)
breadcrumb_html = f"<nav class='breadcrumbs'>{links_html} » {title}</nav>"
# Basic CSS for structure identification (kept the same)
css = """
<style>
body {
font-family: sans-serif;
padding: 20px;
background-color: #1e1e1e;
color: #d1d1d1;
}
.container {
max-width: 960px;
margin: auto;
background: #2c2c2c;
padding: 20px;
border-radius: 5px;
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.5);
}
h1, h2 {
color: #ccc;
}
a {
color: #9bcdff;
text-decoration: none;
}
a:hover {
text-decoration: underline;
}
ul {
list-style: none;
padding-left: 0;
}
li {
margin-bottom: 10px;
}
.category-link,
.subcategory-link,
.product-link,
.details-link,
.reviews-link {
display: block;
padding: 8px;
background-color: #3a3a3a;
border-radius: 3px;
}
.product-preview {
border: 1px solid #444;
padding: 10px;
margin-bottom: 10px;
border-radius: 4px;
background-color: #2a2a2a;
}
.product-title {
color: #d1d1d1;
}
.product-price {
font-weight: bold;
color: #85e085;
}
.product-description,
.product-specs,
.product-reviews {
margin-top: 15px;
line-height: 1.6;
}
.product-specs li {
margin-bottom: 5px;
font-size: 0.9em;
}
.spec-name {
font-weight: bold;
}
.breadcrumbs {
margin-bottom: 20px;
font-size: 0.9em;
color: #888;
}
.breadcrumbs a {
color: #9bcdff;
}
</style>
"""
html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{title} - FakeShop</title>
{head_extras}
{css}
</head>
<body>
<div class="container">
{breadcrumb_html}
<h1>{title}</h1>
{body_content}
</div>
</body>
</html>"""
with open(filepath, "w", encoding="utf-8") as f:
f.write(html_content)
# Keep print statement concise for clarity
# print(f"Created: {filepath}")
def generate_site(base_dir: Path, site_name: str = "FakeShop", base_path: str = ""):
"""Generates the dummy website structure."""
base_dir.mkdir(parents=True, exist_ok=True)
# --- Clean and prepare the base path for URL construction ---
# Ensure it starts with '/' if not empty, and remove any trailing '/'
if base_path:
full_base_path = "/" + base_path.strip('/')
else:
full_base_path = "" # Represents the root
print(f"Using base path for links: '{full_base_path}'")
# --- Level 0: Homepage ---
home_body = "<h2>Welcome to FakeShop!</h2><p>Your one-stop shop for imaginary items.</p><h3>Categories:</h3>\n<ul>"
# Define the *actual* link path for the homepage breadcrumb
home_link_path = f"{full_base_path}/index.html"
breadcrumbs_home = [{"name": "Home", "link": home_link_path}] # Base breadcrumb
# Links *within* the page content should remain relative
for i in range(NUM_CATEGORIES):
cat_name = f"Category-{i+1}"
cat_folder_name = quote(cat_name.lower().replace(" ", "-"))
# This path is relative to the current directory (index.html)
cat_relative_page_path = f"{cat_folder_name}/index.html"
home_body += f'<li><a class="category-link" href="{cat_relative_page_path}">{cat_name}</a> - {generate_lorem(10)}</li>'
home_body += "</ul>"
create_html_page(base_dir / "index.html", "Homepage", home_body, []) # No breadcrumbs *on* the homepage itself
# --- Levels 1-5 ---
for i in range(NUM_CATEGORIES):
cat_name = f"Category-{i+1}"
cat_folder_name = quote(cat_name.lower().replace(" ", "-"))
cat_dir = base_dir / cat_folder_name
# This is the *absolute* path for the breadcrumb link
cat_link_path = f"{full_base_path}/{cat_folder_name}/index.html"
# Update breadcrumbs list for this level
breadcrumbs_cat = breadcrumbs_home + [{"name": cat_name, "link": cat_link_path}]
# --- Level 1: Category Page ---
cat_body = f"<p>{generate_lorem(15)} for {cat_name}.</p><h3>Sub-Categories:</h3>\n<ul>"
for j in range(NUM_SUBCATEGORIES_PER_CAT):
subcat_name = f"{cat_name}-Sub-{j+1}"
subcat_folder_name = quote(subcat_name.lower().replace(" ", "-"))
# Path relative to the category page
subcat_relative_page_path = f"{subcat_folder_name}/index.html"
cat_body += f'<li><a class="subcategory-link" href="{subcat_relative_page_path}">{subcat_name}</a> - {generate_lorem(8)}</li>'
cat_body += "</ul>"
# Pass the updated breadcrumbs list
create_html_page(cat_dir / "index.html", cat_name, cat_body, breadcrumbs_home) # Parent breadcrumb needed here
for j in range(NUM_SUBCATEGORIES_PER_CAT):
subcat_name = f"{cat_name}-Sub-{j+1}"
subcat_folder_name = quote(subcat_name.lower().replace(" ", "-"))
subcat_dir = cat_dir / subcat_folder_name
# Absolute path for the breadcrumb link
subcat_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/index.html"
# Update breadcrumbs list for this level
breadcrumbs_subcat = breadcrumbs_cat + [{"name": subcat_name, "link": subcat_link_path}]
# --- Level 2: Sub-Category Page (Product List) ---
subcat_body = f"<p>Explore products in {subcat_name}. {generate_lorem(12)}</p><h3>Products:</h3>\n<ul class='product-list'>"
for k in range(NUM_PRODUCTS_PER_SUBCAT):
prod_id = f"P{i+1}{j+1}{k+1:03d}" # e.g., P11001
prod_name = f"{subcat_name} Product {k+1} ({prod_id})"
# Filename relative to the subcategory page
prod_filename = f"product_{prod_id}.html"
# Absolute path for the breadcrumb link
prod_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{prod_filename}"
# Preview on list page (link remains relative)
subcat_body += f"""
<li>
<div class="product-preview">
<a class="product-link" href="{prod_filename}"><strong>{prod_name}</strong></a>
<p>{generate_lorem(10)}</p>
<span class="product-price"{random.uniform(10, 500):.2f}</span>
</div>
</li>"""
# --- Level 3: Product Page ---
prod_price = random.uniform(10, 500)
prod_desc = generate_lorem(40)
prod_specs = {f"Spec {s+1}": generate_lorem(3) for s in range(random.randint(3,6))}
prod_reviews_count = random.randint(0, 150)
# Relative filenames for links on this page
details_filename_relative = f"product_{prod_id}_details.html"
reviews_filename_relative = f"product_{prod_id}_reviews.html"
prod_body = f"""
<p class="product-price">Price: £{prod_price:.2f}</p>
<div class="product-description">
<h2>Description</h2>
<p>{prod_desc}</p>
</div>
<div class="product-specs">
<h2>Specifications</h2>
<ul>
{''.join(f'<li><span class="spec-name">{name}</span>: <span class="spec-value">{value}</span></li>' for name, value in prod_specs.items())}
</ul>
</div>
<div class="product-reviews">
<h2>Reviews</h2>
<p>Total Reviews: <span class="review-count">{prod_reviews_count}</span></p>
</div>
<hr>
<p>
<a class="details-link" href="{details_filename_relative}">View More Details</a> |
<a class="reviews-link" href="{reviews_filename_relative}">See All Reviews</a>
</p>
"""
# Update breadcrumbs list for this level
breadcrumbs_prod = breadcrumbs_subcat + [{"name": prod_name, "link": prod_link_path}]
# Pass the updated breadcrumbs list
create_html_page(subcat_dir / prod_filename, prod_name, prod_body, breadcrumbs_subcat) # Parent breadcrumb needed here
# --- Level 4: Product Details Page ---
details_filename = f"product_{prod_id}_details.html" # Actual filename
# Absolute path for the breadcrumb link
details_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{details_filename}"
details_body = f"<p>This page contains extremely detailed information about {prod_name}.</p>{generate_lorem(100)}"
# Update breadcrumbs list for this level
breadcrumbs_details = breadcrumbs_prod + [{"name": "Details", "link": details_link_path}]
# Pass the updated breadcrumbs list
create_html_page(subcat_dir / details_filename, f"{prod_name} - Details", details_body, breadcrumbs_prod) # Parent breadcrumb needed here
# --- Level 5: Product Reviews Page ---
reviews_filename = f"product_{prod_id}_reviews.html" # Actual filename
# Absolute path for the breadcrumb link
reviews_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{reviews_filename}"
reviews_body = f"<p>All {prod_reviews_count} reviews for {prod_name} are listed here.</p><ul>"
for r in range(prod_reviews_count):
reviews_body += f"<li>Review {r+1}: {generate_lorem(random.randint(15, 50))}</li>"
reviews_body += "</ul>"
# Update breadcrumbs list for this level
breadcrumbs_reviews = breadcrumbs_prod + [{"name": "Reviews", "link": reviews_link_path}]
# Pass the updated breadcrumbs list
create_html_page(subcat_dir / reviews_filename, f"{prod_name} - Reviews", reviews_body, breadcrumbs_prod) # Parent breadcrumb needed here
subcat_body += "</ul>" # Close product-list ul
# Pass the correct breadcrumbs list for the subcategory index page
create_html_page(subcat_dir / "index.html", subcat_name, subcat_body, breadcrumbs_cat) # Parent breadcrumb needed here
# --- Main Execution ---
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Generate a dummy multi-level retail website.")
parser.add_argument(
"-o", "--output-dir",
type=str,
default="dummy_retail_site",
help="Directory to generate the website in."
)
parser.add_argument(
"-n", "--site-name",
type=str,
default="FakeShop",
help="Name of the fake shop."
)
parser.add_argument(
"-b", "--base-path",
type=str,
default="",
help="Base path for hosting the site (e.g., 'samples/deepcrawl'). Leave empty if hosted at the root."
)
# Optional: Add more args to configure counts if needed
args = parser.parse_args()
output_directory = Path(args.output_dir)
site_name = args.site_name
base_path = args.base_path
print(f"Generating dummy site '{site_name}' in '{output_directory}'...")
# Pass the base_path to the generation function
generate_site(output_directory, site_name, base_path)
print(f"\nCreated {sum(1 for _ in output_directory.rglob('*.html'))} HTML pages.")
print("Dummy site generation complete.")
print(f"To serve locally (example): python -m http.server --directory {output_directory} 8000")
if base_path:
print(f"Access the site at: http://localhost:8000/{base_path.strip('/')}/index.html")
else:
print(f"Access the site at: http://localhost:8000/index.html")