Fix LLMs.txt cache bug with subdomains and add bypass option (#1557)

* Fix LLMs.txt cache bug with subdomains and add bypass option (#1519)

Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>

* Nick:

* Update LLMs.txt test file to use helper functions and concurrent tests

Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>

* Remove LLMs.txt test file as requested

Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>

* Change parameter name to 'cache' and keep 7-day expiration

Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>

* Update generate-llmstxt-supabase.ts

* Update JS and Python SDKs to include cache parameter

Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>

* Fix LLMs.txt cache implementation to use normalizeUrl and exact matching

Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>

* Revert "Fix LLMs.txt cache implementation to use normalizeUrl and exact matching"

This reverts commit d05b9964677b7b2384453329d2ac99d841467053.

* Nick:

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: hello@sideguide.dev <hello+firecrawl@sideguide.dev>
Co-authored-by: Nicolas <nicolascamara29@gmail.com>
devin-ai-integration[bot] 2025-05-16 16:29:09 -03:00 committed by GitHub
parent ab30c8e4ac
commit 7ccbbec488
12 changed files with 47 additions and 9 deletions


@@ -104,7 +104,8 @@ content-type: application/json
 {
   "url": "https://firecrawl.dev",
   "maxUrls": 1,
-  "showFullText": false
+  "showFullText": false,
+  "cache": true
 }
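For reference, the bypass can be exercised directly over HTTP. A minimal TypeScript sketch, assuming the request above targets the LLMs.txt generation endpoint at https://api.firecrawl.dev/v1/llmstxt with a bearer API key (both assumptions; only the body fields come from this diff):

// Hedged sketch: endpoint path and auth header are assumptions, the body fields are from the diff above.
const response = await fetch("https://api.firecrawl.dev/v1/llmstxt", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer fc-YOUR-API-KEY",
  },
  body: JSON.stringify({
    url: "https://firecrawl.dev",
    maxUrls: 1,
    showFullText: false,
    cache: false, // set to false to skip the cached copy and regenerate
  }),
});
console.log(await response.json());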


@@ -42,6 +42,7 @@ export async function generateLLMsTextController(
     url: req.body.url,
     maxUrls: req.body.maxUrls,
     showFullText: req.body.showFullText,
+    cache: req.body.cache,
     generatedText: "",
     fullText: "",
   });


@@ -1211,6 +1211,10 @@ export const generateLLMsTextRequestSchema = z.object({
     .boolean()
     .default(false)
     .describe("Whether to show the full LLMs-full.txt in the response"),
+  cache: z
+    .boolean()
+    .default(true)
+    .describe("Whether to use cached content if available"),
   __experimental_stream: z.boolean().optional(),
 });
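The default matters for existing callers: requests that omit the field keep the cached behaviour, while cache: false forces a fresh generation. A small sketch of the validation layer, assuming url is the only other required field of generateLLMsTextRequestSchema (the full schema is not shown here):

// cache defaults to true, so omitting it preserves the old behaviour
const parsed = generateLLMsTextRequestSchema.parse({ url: "https://firecrawl.dev" });
console.log(parsed.cache); // true

// sending cache: false bypasses the stored LLMs.txt
const bypass = generateLLMsTextRequestSchema.parse({ url: "https://firecrawl.dev", cache: false });
console.log(bypass.cache); // false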


@@ -36,6 +36,18 @@ describe("normalizeUrlOnlyHostname", () => {
     const expected = "not a valid url";
     expect(normalizeUrlOnlyHostname(url)).toBe(expected);
   });
+  it("should handle URLs with subdomains", () => {
+    const url = "https://blog.example.com";
+    const expected = "blog.example.com";
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
+  it("should handle URLs with multiple subdomains", () => {
+    const url = "https://dev.blog.example.com";
+    const expected = "dev.blog.example.com";
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
 });
 describe("normalizeUrl", () => {
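The new cases pin down the subdomain behaviour behind the original bug: blog.example.com and example.com must map to different cache keys. A minimal sketch of hostname-only normalization that satisfies these tests (the real helper in canonical-url may differ in details, for example how it treats a leading www.):

export function normalizeUrlOnlyHostname(url: string): string {
  try {
    // Subdomains are preserved, so blog.example.com and example.com cache separately.
    return new URL(url).hostname;
  } catch {
    // Invalid input is returned unchanged, matching the "not a valid url" test above.
    return url;
  }
}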


@@ -9,6 +9,7 @@ export interface GenerationData {
   url: string;
   maxUrls: number;
   showFullText: boolean;
+  cache?: boolean;
   generatedText: string;
   fullText: string;
   error?: string;
@@ -66,4 +67,4 @@ export async function updateGeneratedLlmsTxtStatus(
   if (error !== undefined) updates.error = error;
   await updateGeneratedLlmsTxt(id, updates);
-}
+}


@@ -19,6 +19,7 @@ interface GenerateLLMsTextServiceOptions {
   url: string;
   maxUrls: number;
   showFullText: boolean;
+  cache?: boolean;
   subId?: string;
 }
@@ -63,7 +64,7 @@ function limitLlmsTxtEntries(llmstxt: string, maxEntries: number): string {
 export async function performGenerateLlmsTxt(
   options: GenerateLLMsTextServiceOptions,
 ) {
-  const { generationId, teamId, url, maxUrls = 100, showFullText, subId } =
+  const { generationId, teamId, url, maxUrls = 100, showFullText, cache = true, subId } =
     options;
   const startTime = Date.now();
   const logger = _logger.child({
@@ -79,8 +80,8 @@ export async function performGenerateLlmsTxt(
   // Enforce max URL limit
   const effectiveMaxUrls = Math.min(maxUrls, 5000);
-  // Check cache first
-  const cachedResult = await getLlmsTextFromCache(url, effectiveMaxUrls);
+  // Check cache first, unless cache is set to false
+  const cachedResult = cache ? await getLlmsTextFromCache(url, effectiveMaxUrls) : null;
   if (cachedResult) {
     logger.info("Found cached LLMs text", { url });
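End to end, the flag is simply threaded through the service options. A hedged sketch of a call site, where every value is a hypothetical placeholder and only the option names come from the diff above:

await performGenerateLlmsTxt({
  generationId: "generation-id",   // hypothetical
  teamId: "team-id",               // hypothetical
  url: "https://blog.example.com", // subdomains now resolve to their own cache entry
  maxUrls: 100,
  showFullText: false,
  cache: false, // skip getLlmsTextFromCache and always regenerate
});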


@@ -1,6 +1,6 @@
 import { supabase_service } from "../../services/supabase";
 import { logger } from "../logger";
-import { normalizeUrlOnlyHostname } from "../canonical-url";
+import { normalizeUrl, normalizeUrlOnlyHostname } from "../canonical-url";
 interface LlmsTextCache {
   origin_url: string;
@@ -41,7 +41,7 @@ export async function getLlmsTextFromCache(
       return null;
     }
-    return data;
+    return data
   } catch (error) {
     logger.error("Failed to fetch LLMs text from cache", { error, originUrl });
     return null;


@@ -653,6 +653,7 @@ const processGenerateLlmsTxtJobInternal = async (
     maxUrls: job.data.request.maxUrls,
     showFullText: job.data.request.showFullText,
     subId: job.data.subId,
+    cache: job.data.request.cache,
   });
   if (result.success) {


@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.24.0",
+  "version": "1.25.0",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",


@@ -520,6 +520,11 @@ export interface GenerateLLMsTextParams {
    * @default false
    */
   showFullText?: boolean;
+  /**
+   * Whether to use cached content if available
+   * @default true
+   */
+  cache?: boolean;
   /**
    * Experimental flag for streaming
    */
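From the SDK side the new field is passed like any other option. A hedged usage sketch; the generateLLMsText method name and exact signature are assumptions, only the GenerateLLMsTextParams fields above come from this change:

import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });
const result = await app.generateLLMsText("https://firecrawl.dev", {
  maxUrls: 10,
  showFullText: false,
  cache: false, // bypass the cached LLMs.txt and regenerate
});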


@@ -13,7 +13,7 @@ import os
 from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
-__version__ = "2.5.4"
+__version__ = "2.6.0"
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")


@@ -347,6 +347,7 @@ class GenerateLLMsTextParams(pydantic.BaseModel):
     """
     maxUrls: Optional[int] = 10
     showFullText: Optional[bool] = False
+    cache: Optional[bool] = True
     __experimental_stream: Optional[bool] = None
 class DeepResearchParams(pydantic.BaseModel):
@@ -1870,6 +1871,7 @@ class FirecrawlApp:
         *,
         max_urls: Optional[int] = None,
         show_full_text: Optional[bool] = None,
+        cache: Optional[bool] = None,
         experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
         """
         Generate LLMs.txt for a given URL and poll until completion.
@@ -1878,6 +1880,7 @@
             url (str): Target URL to generate LLMs.txt from
             max_urls (Optional[int]): Maximum URLs to process (default: 10)
             show_full_text (Optional[bool]): Include full text in output (default: False)
+            cache (Optional[bool]): Whether to use cached content if available (default: True)
             experimental_stream (Optional[bool]): Enable experimental streaming
         Returns:
@@ -1893,6 +1896,7 @@
         params = GenerateLLMsTextParams(
             maxUrls=max_urls,
             showFullText=show_full_text,
+            cache=cache,
             __experimental_stream=experimental_stream
         )
@@ -1900,6 +1904,7 @@
             url,
             max_urls=max_urls,
             show_full_text=show_full_text,
+            cache=cache,
             experimental_stream=experimental_stream
         )
@@ -1935,6 +1940,7 @@
         *,
         max_urls: Optional[int] = None,
         show_full_text: Optional[bool] = None,
+        cache: Optional[bool] = None,
         experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
         """
         Initiate an asynchronous LLMs.txt generation operation.
@@ -1943,6 +1949,7 @@
             url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
             max_urls (Optional[int]): Maximum URLs to process (default: 10)
             show_full_text (Optional[bool]): Include full text in output (default: False)
+            cache (Optional[bool]): Whether to use cached content if available (default: True)
             experimental_stream (Optional[bool]): Enable experimental streaming
         Returns:
@@ -1957,6 +1964,7 @@
         params = GenerateLLMsTextParams(
             maxUrls=max_urls,
             showFullText=show_full_text,
+            cache=cache,
             __experimental_stream=experimental_stream
         )
@@ -4001,6 +4009,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             url,
             max_urls=max_urls,
             show_full_text=show_full_text,
+            cache=cache,
             experimental_stream=experimental_stream
         )
         if not response.get('success') or 'id' not in response:
@@ -4027,6 +4036,7 @@
         *,
         max_urls: Optional[int] = None,
         show_full_text: Optional[bool] = None,
+        cache: Optional[bool] = None,
         experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
         """
         Initiate an asynchronous LLMs.txt generation job without waiting for completion.
@@ -4035,6 +4045,7 @@
             url (str): Target URL to generate LLMs.txt from
             max_urls (Optional[int]): Maximum URLs to process (default: 10)
             show_full_text (Optional[bool]): Include full text in output (default: False)
+            cache (Optional[bool]): Whether to use cached content if available (default: True)
             experimental_stream (Optional[bool]): Enable experimental streaming
         Returns:
@@ -4057,6 +4068,7 @@
         params = GenerateLLMsTextParams(
             maxUrls=max_urls,
             showFullText=show_full_text,
+            cache=cache,
             __experimental_stream=experimental_stream
         )