mirror of https://github.com/mendableai/firecrawl.git
synced 2025-06-27 00:41:33 +00:00
Fix LLMs.txt cache bug with subdomains and add bypass option (#1557)
* Fix LLMs.txt cache bug with subdomains and add bypass option (#1519)

Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>

* Nick:

* Update LLMs.txt test file to use helper functions and concurrent tests

Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>

* Remove LLMs.txt test file as requested

Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>

* Change parameter name to 'cache' and keep 7-day expiration

Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>

* Update generate-llmstxt-supabase.ts

* Update JS and Python SDKs to include cache parameter

Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>

* Fix LLMs.txt cache implementation to use normalizeUrl and exact matching

Co-Authored-By: hello@sideguide.dev <hello+firecrawl@sideguide.dev>

* Revert "Fix LLMs.txt cache implementation to use normalizeUrl and exact matching"

This reverts commit d05b9964677b7b2384453329d2ac99d841467053.

* Nick:

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: hello@sideguide.dev <hello+firecrawl@sideguide.dev>
Co-authored-by: Nicolas <nicolascamara29@gmail.com>
This commit is contained in:
parent
ab30c8e4ac
commit
7ccbbec488
@@ -104,7 +104,8 @@ content-type: application/json
 {
     "url": "https://firecrawl.dev",
     "maxUrls": 1,
-    "showFullText": false
+    "showFullText": false,
+    "cache": true
 }
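The request body above is all this hunk shows of the endpoint. For completeness, a minimal TypeScript sketch of calling it with the cache bypassed; the endpoint path and auth header are assumptions (only the body fields "url", "maxUrls", "showFullText", and "cache" come from the diff):

```ts
// Sketch: request a fresh LLMs.txt generation, skipping the 7-day cache.
// Endpoint path and Authorization header are assumed, not taken from this diff.
const response = await fetch("https://api.firecrawl.dev/v1/llmstxt", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
  },
  body: JSON.stringify({
    url: "https://firecrawl.dev",
    maxUrls: 1,
    showFullText: false,
    cache: false, // bypass cached results and regenerate
  }),
});
console.log(await response.json());
```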
@@ -42,6 +42,7 @@ export async function generateLLMsTextController(
     url: req.body.url,
     maxUrls: req.body.maxUrls,
     showFullText: req.body.showFullText,
+    cache: req.body.cache,
     generatedText: "",
     fullText: "",
   });
@@ -1211,6 +1211,10 @@ export const generateLLMsTextRequestSchema = z.object({
     .boolean()
     .default(false)
     .describe("Whether to show the full LLMs-full.txt in the response"),
+  cache: z
+    .boolean()
+    .default(true)
+    .describe("Whether to use cached content if available"),
   __experimental_stream: z.boolean().optional(),
 });
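Because the new field uses zod's `.default(true)`, requests that omit `cache` still parse with caching enabled; only an explicit `false` bypasses it. A small illustration of that behavior (standard zod semantics, reduced to the relevant fields, not code from this commit):

```ts
import { z } from "zod";

// Reduced mirror of the schema fields touched by this hunk.
const schema = z.object({
  showFullText: z.boolean().default(false),
  cache: z.boolean().default(true),
});

schema.parse({});               // => { showFullText: false, cache: true }
schema.parse({ cache: false }); // => { showFullText: false, cache: false }
```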
@@ -36,6 +36,18 @@ describe("normalizeUrlOnlyHostname", () => {
     const expected = "not a valid url";
     expect(normalizeUrlOnlyHostname(url)).toBe(expected);
   });
+
+  it("should handle URLs with subdomains", () => {
+    const url = "https://blog.example.com";
+    const expected = "blog.example.com";
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
+
+  it("should handle URLs with multiple subdomains", () => {
+    const url = "https://dev.blog.example.com";
+    const expected = "dev.blog.example.com";
+    expect(normalizeUrlOnlyHostname(url)).toBe(expected);
+  });
 });

 describe("normalizeUrl", () => {
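The new cases pin down the subdomain behavior behind the original bug: the full hostname must survive normalization rather than being collapsed. A minimal normalizer consistent with these tests might look like the sketch below; it is an assumption for illustration, not the repository's actual implementation:

```ts
// Sketch only: hostname-only normalization that preserves subdomains.
// Stripping a leading "www." is an assumed convention; the subdomain tests
// only require that "blog." and "dev.blog." remain intact.
export function normalizeUrlOnlyHostname(url: string): string {
  try {
    return new URL(url).hostname.replace(/^www\./, "");
  } catch {
    // Invalid input is returned unchanged ("not a valid url" test case).
    return url;
  }
}
```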
@@ -9,6 +9,7 @@ export interface GenerationData {
   url: string;
   maxUrls: number;
   showFullText: boolean;
+  cache?: boolean;
   generatedText: string;
   fullText: string;
   error?: string;
@@ -66,4 +67,4 @@ export async function updateGeneratedLlmsTxtStatus(
   if (error !== undefined) updates.error = error;

   await updateGeneratedLlmsTxt(id, updates);
 }
@@ -19,6 +19,7 @@ interface GenerateLLMsTextServiceOptions {
   url: string;
   maxUrls: number;
   showFullText: boolean;
+  cache?: boolean;
   subId?: string;
 }

@@ -63,7 +64,7 @@ function limitLlmsTxtEntries(llmstxt: string, maxEntries: number): string {
 export async function performGenerateLlmsTxt(
   options: GenerateLLMsTextServiceOptions,
 ) {
-  const { generationId, teamId, url, maxUrls = 100, showFullText, subId } =
+  const { generationId, teamId, url, maxUrls = 100, showFullText, cache = true, subId } =
     options;
   const startTime = Date.now();
   const logger = _logger.child({
@@ -79,8 +80,8 @@ export async function performGenerateLlmsTxt(
   // Enforce max URL limit
   const effectiveMaxUrls = Math.min(maxUrls, 5000);

-  // Check cache first
-  const cachedResult = await getLlmsTextFromCache(url, effectiveMaxUrls);
+  // Check cache first, unless cache is set to false
+  const cachedResult = cache ? await getLlmsTextFromCache(url, effectiveMaxUrls) : null;
   if (cachedResult) {
     logger.info("Found cached LLMs text", { url });
@@ -1,6 +1,6 @@
 import { supabase_service } from "../../services/supabase";
 import { logger } from "../logger";
-import { normalizeUrlOnlyHostname } from "../canonical-url";
+import { normalizeUrl, normalizeUrlOnlyHostname } from "../canonical-url";

 interface LlmsTextCache {
   origin_url: string;
@@ -41,7 +41,7 @@ export async function getLlmsTextFromCache(
       return null;
     }

-    return data;
+    return data
   } catch (error) {
     logger.error("Failed to fetch LLMs text from cache", { error, originUrl });
     return null;
@@ -653,6 +653,7 @@ const processGenerateLlmsTxtJobInternal = async (
     maxUrls: job.data.request.maxUrls,
     showFullText: job.data.request.showFullText,
     subId: job.data.subId,
+    cache: job.data.request.cache,
   });

   if (result.success) {
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.24.0",
+  "version": "1.25.0",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
@@ -520,6 +520,11 @@ export interface GenerateLLMsTextParams {
    * @default false
    */
   showFullText?: boolean;
+  /**
+   * Whether to use cached content if available
+   * @default true
+   */
+  cache?: boolean;
   /**
    * Experimental flag for streaming
    */
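With the SDK bumped to 1.25.0, the new option is just another field on `GenerateLLMsTextParams`. A usage sketch, assuming the v1 SDK's default-export `FirecrawlApp` constructor and a `generateLLMsText` method; the method name is an assumption, since only the params interface appears in this diff:

```ts
import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });

// Force a fresh generation instead of serving a cached LLMs.txt
// (generateLLMsText is an assumed method name).
const result = await app.generateLLMsText("https://firecrawl.dev", {
  maxUrls: 10,
  showFullText: false,
  cache: false,
});
console.log(result);
```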
@@ -13,7 +13,7 @@ import os

 from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa

-__version__ = "2.5.4"
+__version__ = "2.6.0"

 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
@@ -347,6 +347,7 @@ class GenerateLLMsTextParams(pydantic.BaseModel):
     """
     maxUrls: Optional[int] = 10
     showFullText: Optional[bool] = False
+    cache: Optional[bool] = True
     __experimental_stream: Optional[bool] = None

 class DeepResearchParams(pydantic.BaseModel):
@@ -1870,6 +1871,7 @@ class FirecrawlApp:
         *,
         max_urls: Optional[int] = None,
         show_full_text: Optional[bool] = None,
+        cache: Optional[bool] = None,
         experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
         """
         Generate LLMs.txt for a given URL and poll until completion.
@@ -1878,6 +1880,7 @@ class FirecrawlApp:
             url (str): Target URL to generate LLMs.txt from
             max_urls (Optional[int]): Maximum URLs to process (default: 10)
             show_full_text (Optional[bool]): Include full text in output (default: False)
+            cache (Optional[bool]): Whether to use cached content if available (default: True)
             experimental_stream (Optional[bool]): Enable experimental streaming

         Returns:
@@ -1893,6 +1896,7 @@ class FirecrawlApp:
         params = GenerateLLMsTextParams(
             maxUrls=max_urls,
             showFullText=show_full_text,
+            cache=cache,
             __experimental_stream=experimental_stream
         )
@@ -1900,6 +1904,7 @@ class FirecrawlApp:
             url,
             max_urls=max_urls,
             show_full_text=show_full_text,
+            cache=cache,
             experimental_stream=experimental_stream
         )
@@ -1935,6 +1940,7 @@ class FirecrawlApp:
         *,
         max_urls: Optional[int] = None,
         show_full_text: Optional[bool] = None,
+        cache: Optional[bool] = None,
         experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
         """
         Initiate an asynchronous LLMs.txt generation operation.
@@ -1943,6 +1949,7 @@ class FirecrawlApp:
             url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
             max_urls (Optional[int]): Maximum URLs to process (default: 10)
             show_full_text (Optional[bool]): Include full text in output (default: False)
+            cache (Optional[bool]): Whether to use cached content if available (default: True)
             experimental_stream (Optional[bool]): Enable experimental streaming

         Returns:
@@ -1957,6 +1964,7 @@ class FirecrawlApp:
         params = GenerateLLMsTextParams(
             maxUrls=max_urls,
             showFullText=show_full_text,
+            cache=cache,
             __experimental_stream=experimental_stream
         )
@@ -4001,6 +4009,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             url,
             max_urls=max_urls,
             show_full_text=show_full_text,
+            cache=cache,
             experimental_stream=experimental_stream
         )
         if not response.get('success') or 'id' not in response:
@@ -4027,6 +4036,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         *,
         max_urls: Optional[int] = None,
         show_full_text: Optional[bool] = None,
+        cache: Optional[bool] = None,
         experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
         """
         Initiate an asynchronous LLMs.txt generation job without waiting for completion.
@@ -4035,6 +4045,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             url (str): Target URL to generate LLMs.txt from
             max_urls (Optional[int]): Maximum URLs to process (default: 10)
             show_full_text (Optional[bool]): Include full text in output (default: False)
+            cache (Optional[bool]): Whether to use cached content if available (default: True)
             experimental_stream (Optional[bool]): Enable experimental streaming

         Returns:
@@ -4057,6 +4068,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         params = GenerateLLMsTextParams(
             maxUrls=max_urls,
             showFullText=show_full_text,
+            cache=cache,
             __experimental_stream=experimental_stream
         )