Fix LLMs.txt cache bug with subdomains and add bypass option (#1557)

* Fix LLMs.txt cache bug with subdomains and add bypass option (#1519)

Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>

* Nick:

* Update LLMs.txt test file to use helper functions and concurrent tests

Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>

* Remove LLMs.txt test file as requested

Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>

* Change parameter name to 'cache' and keep 7-day expiration

Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>

* Update generate-llmstxt-supabase.ts

* Update JS and Python SDKs to include cache parameter

Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>

* Fix LLMs.txt cache implementation to use normalizeUrl and exact matching

Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>

* Revert "Fix LLMs.txt cache implementation to use normalizeUrl and exact matching"

This reverts commit d05b9964677b7b2384453329d2ac99d841467053.

* Nick:

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: hello@sideguide.dev <hello+firecrawl@sideguide.dev>
Co-authored-by: Nicolas <nicolascamara29@gmail.com>
devin-ai-integration[bot] 2025-05-16 16:29:09 -03:00 committed by GitHub
parent ab30c8e4ac
commit 7ccbbec488
12 changed files with 47 additions and 9 deletions


@@ -104,7 +104,8 @@ content-type: application/json
 {
   "url": "https://firecrawl.dev",
   "maxUrls": 1,
-  "showFullText": false
+  "showFullText": false,
+  "cache": true
 }
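For reference, the bypass can be exercised directly over HTTP. A minimal TypeScript sketch, assuming the request above targets the LLMs.txt generation endpoint at https://api.firecrawl.dev/v1/llmstxt with a bearer API key (both assumptions; only the body fields come from this diff):

// Hedged sketch: endpoint path and auth header are assumptions, the body fields are from the diff above.
const response = await fetch("https://api.firecrawl.dev/v1/llmstxt", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer fc-YOUR-API-KEY",
  },
  body: JSON.stringify({
    url: "https://firecrawl.dev",
    maxUrls: 1,
    showFullText: false,
    cache: false, // set to false to skip the cached copy and regenerate
  }),
});
console.log(await response.json());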


@@ -42,6 +42,7 @@ export async function generateLLMsTextController(
     url: req.body.url,
     maxUrls: req.body.maxUrls,
     showFullText: req.body.showFullText,
+    cache: req.body.cache,
     generatedText: "",
     fullText: "",
   });


@@ -1211,6 +1211,10 @@ export const generateLLMsTextRequestSchema = z.object({
     .boolean()
     .default(false)
     .describe("Whether to show the full LLMs-full.txt in the response"),
+  cache: z
+    .boolean()
+    .default(true)
+    .describe("Whether to use cached content if available"),
   __experimental_stream: z.boolean().optional(),
 });
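The default matters for existing callers: requests that omit the field keep the cached behaviour, while cache: false forces a fresh generation. A small sketch of the validation layer, assuming url is the only other required field of generateLLMsTextRequestSchema (the full schema is not shown here):

// cache defaults to true, so omitting it preserves the old behaviour
const parsed = generateLLMsTextRequestSchema.parse({ url: "https://firecrawl.dev" });
console.log(parsed.cache); // true

// sending cache: false bypasses the stored LLMs.txt
const bypass = generateLLMsTextRequestSchema.parse({ url: "https://firecrawl.dev", cache: false });
console.log(bypass.cache); // false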


@@ -36,6 +36,18 @@ describe("normalizeUrlOnlyHostname", () => {
     const expected = "not a valid url";
     expect(normalizeUrlOnlyHostname(url)).toBe(expected);
   });
+  it("should handle URLs with subdomains", () => {
+    const url = "https://blog.example.com";
+    const expected = "blog.example.com";
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
+  it("should handle URLs with multiple subdomains", () => {
+    const url = "https://dev.blog.example.com";
+    const expected = "dev.blog.example.com";
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
 });
 describe("normalizeUrl", () => {
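The new cases pin down the subdomain behaviour behind the original bug: blog.example.com and example.com must map to different cache keys. A minimal sketch of hostname-only normalization that satisfies these tests (the real helper in canonical-url may differ in details, for example how it treats a leading www.):

export function normalizeUrlOnlyHostname(url: string): string {
  try {
    // Subdomains are preserved, so blog.example.com and example.com cache separately.
    return new URL(url).hostname;
  } catch {
    // Invalid input is returned unchanged, matching the "not a valid url" test above.
    return url;
  }
}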


@@ -9,6 +9,7 @@ export interface GenerationData {
   url: string;
   maxUrls: number;
   showFullText: boolean;
+  cache?: boolean;
   generatedText: string;
   fullText: string;
   error?: string;
@@ -66,4 +67,4 @@ export async function updateGeneratedLlmsTxtStatus(
   if (error !== undefined) updates.error = error;
   await updateGeneratedLlmsTxt(id, updates);
-}
+}


@@ -19,6 +19,7 @@ interface GenerateLLMsTextServiceOptions {
   url: string;
   maxUrls: number;
   showFullText: boolean;
+  cache?: boolean;
   subId?: string;
 }
@@ -63,7 +64,7 @@ function limitLlmsTxtEntries(llmstxt: string, maxEntries: number): string {
 export async function performGenerateLlmsTxt(
   options: GenerateLLMsTextServiceOptions,
 ) {
-  const { generationId, teamId, url, maxUrls = 100, showFullText, subId } =
+  const { generationId, teamId, url, maxUrls = 100, showFullText, cache = true, subId } =
     options;
   const startTime = Date.now();
   const logger = _logger.child({
@@ -79,8 +80,8 @@ export async function performGenerateLlmsTxt(
   // Enforce max URL limit
   const effectiveMaxUrls = Math.min(maxUrls, 5000);
-  // Check cache first
-  const cachedResult = await getLlmsTextFromCache(url, effectiveMaxUrls);
+  // Check cache first, unless cache is set to false
+  const cachedResult = cache ? await getLlmsTextFromCache(url, effectiveMaxUrls) : null;
   if (cachedResult) {
     logger.info("Found cached LLMs text", { url });
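End to end, the flag is simply threaded through the service options. A hedged sketch of a call site, where every value is a hypothetical placeholder and only the option names come from the diff above:

await performGenerateLlmsTxt({
  generationId: "generation-id",   // hypothetical
  teamId: "team-id",               // hypothetical
  url: "https://blog.example.com", // subdomains now resolve to their own cache entry
  maxUrls: 100,
  showFullText: false,
  cache: false, // skip getLlmsTextFromCache and always regenerate
});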


@@ -1,6 +1,6 @@
 import { supabase_service } from "../../services/supabase";
 import { logger } from "../logger";
-import { normalizeUrlOnlyHostname } from "../canonical-url";
+import { normalizeUrl, normalizeUrlOnlyHostname } from "../canonical-url";
 interface LlmsTextCache {
   origin_url: string;
@@ -41,7 +41,7 @@ export async function getLlmsTextFromCache(
       return null;
     }
-    return data;
+    return data
   } catch (error) {
     logger.error("Failed to fetch LLMs text from cache", { error, originUrl });
     return null;


@@ -653,6 +653,7 @@ const processGenerateLlmsTxtJobInternal = async (
     maxUrls: job.data.request.maxUrls,
     showFullText: job.data.request.showFullText,
     subId: job.data.subId,
+    cache: job.data.request.cache,
   });
   if (result.success) {


@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.24.0",
+  "version": "1.25.0",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",


@@ -520,6 +520,11 @@ export interface GenerateLLMsTextParams {
    * @default false
    */
   showFullText?: boolean;
+  /**
+   * Whether to use cached content if available
+   * @default true
+   */
+  cache?: boolean;
   /**
    * Experimental flag for streaming
    */
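From the SDK side the new field is passed like any other option. A hedged usage sketch; the generateLLMsText method name and exact signature are assumptions, only the GenerateLLMsTextParams fields above come from this change:

import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });
const result = await app.generateLLMsText("https://firecrawl.dev", {
  maxUrls: 10,
  showFullText: false,
  cache: false, // bypass the cached LLMs.txt and regenerate
});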


@@ -13,7 +13,7 @@ import os
 from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
-__version__ = "2.5.4"
+__version__ = "2.6.0"
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")


@@ -347,6 +347,7 @@ class GenerateLLMsTextParams(pydantic.BaseModel):
     """
     maxUrls: Optional[int] = 10
     showFullText: Optional[bool] = False
+    cache: Optional[bool] = True
     __experimental_stream: Optional[bool] = None
 class DeepResearchParams(pydantic.BaseModel):
@@ -1870,6 +1871,7 @@ class FirecrawlApp:
         *,
         max_urls: Optional[int] = None,
         show_full_text: Optional[bool] = None,
+        cache: Optional[bool] = None,
         experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
         """
         Generate LLMs.txt for a given URL and poll until completion.
@@ -1878,6 +1880,7 @@
             url (str): Target URL to generate LLMs.txt from
             max_urls (Optional[int]): Maximum URLs to process (default: 10)
             show_full_text (Optional[bool]): Include full text in output (default: False)
+            cache (Optional[bool]): Whether to use cached content if available (default: True)
             experimental_stream (Optional[bool]): Enable experimental streaming
         Returns:
@@ -1893,6 +1896,7 @@
         params = GenerateLLMsTextParams(
             maxUrls=max_urls,
             showFullText=show_full_text,
+            cache=cache,
             __experimental_stream=experimental_stream
         )
@@ -1900,6 +1904,7 @@
             url,
             max_urls=max_urls,
             show_full_text=show_full_text,
+            cache=cache,
             experimental_stream=experimental_stream
         )
@@ -1935,6 +1940,7 @@
         *,
         max_urls: Optional[int] = None,
         show_full_text: Optional[bool] = None,
+        cache: Optional[bool] = None,
         experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
         """
         Initiate an asynchronous LLMs.txt generation operation.
@@ -1943,6 +1949,7 @@
             url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
             max_urls (Optional[int]): Maximum URLs to process (default: 10)
             show_full_text (Optional[bool]): Include full text in output (default: False)
+            cache (Optional[bool]): Whether to use cached content if available (default: True)
             experimental_stream (Optional[bool]): Enable experimental streaming
         Returns:
@@ -1957,6 +1964,7 @@
         params = GenerateLLMsTextParams(
             maxUrls=max_urls,
             showFullText=show_full_text,
+            cache=cache,
             __experimental_stream=experimental_stream
         )
@@ -4001,6 +4009,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             url,
             max_urls=max_urls,
             show_full_text=show_full_text,
+            cache=cache,
             experimental_stream=experimental_stream
         )
         if not response.get('success') or 'id' not in response:
@@ -4027,6 +4036,7 @@
         *,
         max_urls: Optional[int] = None,
         show_full_text: Optional[bool] = None,
+        cache: Optional[bool] = None,
         experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
         """
         Initiate an asynchronous LLMs.txt generation job without waiting for completion.
@@ -4035,6 +4045,7 @@
             url (str): Target URL to generate LLMs.txt from
             max_urls (Optional[int]): Maximum URLs to process (default: 10)
             show_full_text (Optional[bool]): Include full text in output (default: False)
+            cache (Optional[bool]): Whether to use cached content if available (default: True)
             experimental_stream (Optional[bool]): Enable experimental streaming
         Returns:
@@ -4057,6 +4068,7 @@
         params = GenerateLLMsTextParams(
             maxUrls=max_urls,
             showFullText=show_full_text,
+            cache=cache,
             __experimental_stream=experimental_stream
         )