mirror of
https://github.com/mendableai/firecrawl.git
synced 2025-06-27 00:41:33 +00:00
fix: implement per-document cost tracking architecture
- Create DocumentWithCostTracking interface for proper return type
- Modify scrapeSearchResult to return individual costTracking instances
- Update billing logic to use separate costTracking per document
- Fix deep research service to handle new interface structure
- Add stealth mode and PDF billing test cases
- Prevents shared state accumulation in billing calculations

Addresses GitHub comments from mogery and micahstairs:
- Reworks cost tracking to return separate instances per document
- Restores bypassBilling: false flag as requested
- Implements proper per-document billing calculation

Co-Authored-By: Micah Stairs <micah@sideguide.dev>
This commit is contained in:
parent
9336e275f4
commit
90469270e0
@ -24,6 +24,11 @@ import type { Logger } from "winston";
|
|||||||
import { CostTracking } from "../../lib/extract/extraction-service";
|
import { CostTracking } from "../../lib/extract/extraction-service";
|
||||||
import { calculateCreditsToBeBilled } from "../../lib/scrape-billing";
|
import { calculateCreditsToBeBilled } from "../../lib/scrape-billing";
|
||||||
|
|
||||||
|
interface DocumentWithCostTracking {
|
||||||
|
document: Document;
|
||||||
|
costTracking: CostTracking;
|
||||||
|
}
|
||||||
|
|
||||||
// Used for deep research
|
// Used for deep research
|
||||||
export async function searchAndScrapeSearchResult(
|
export async function searchAndScrapeSearchResult(
|
||||||
query: string,
|
query: string,
|
||||||
@ -34,16 +39,15 @@ export async function searchAndScrapeSearchResult(
|
|||||||
scrapeOptions: ScrapeOptions;
|
scrapeOptions: ScrapeOptions;
|
||||||
},
|
},
|
||||||
logger: Logger,
|
logger: Logger,
|
||||||
costTracking: CostTracking,
|
|
||||||
flags: TeamFlags,
|
flags: TeamFlags,
|
||||||
): Promise<Document[]> {
|
): Promise<DocumentWithCostTracking[]> {
|
||||||
try {
|
try {
|
||||||
const searchResults = await search({
|
const searchResults = await search({
|
||||||
query,
|
query,
|
||||||
num_results: 5,
|
num_results: 5,
|
||||||
});
|
});
|
||||||
|
|
||||||
const documents = await Promise.all(
|
const documentsWithCostTracking = await Promise.all(
|
||||||
searchResults.map((result) =>
|
searchResults.map((result) =>
|
||||||
scrapeSearchResult(
|
scrapeSearchResult(
|
||||||
{
|
{
|
||||||
@ -53,13 +57,12 @@ export async function searchAndScrapeSearchResult(
|
|||||||
},
|
},
|
||||||
options,
|
options,
|
||||||
logger,
|
logger,
|
||||||
costTracking,
|
|
||||||
flags,
|
flags,
|
||||||
),
|
),
|
||||||
),
|
),
|
||||||
);
|
);
|
||||||
|
|
||||||
return documents;
|
return documentsWithCostTracking;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
@ -74,17 +77,18 @@ async function scrapeSearchResult(
|
|||||||
scrapeOptions: ScrapeOptions;
|
scrapeOptions: ScrapeOptions;
|
||||||
},
|
},
|
||||||
logger: Logger,
|
logger: Logger,
|
||||||
costTracking: CostTracking,
|
|
||||||
flags: TeamFlags,
|
flags: TeamFlags,
|
||||||
directToBullMQ: boolean = false,
|
directToBullMQ: boolean = false,
|
||||||
isSearchPreview: boolean = false,
|
isSearchPreview: boolean = false,
|
||||||
): Promise<Document> {
|
): Promise<DocumentWithCostTracking> {
|
||||||
const jobId = uuidv4();
|
const jobId = uuidv4();
|
||||||
const jobPriority = await getJobPriority({
|
const jobPriority = await getJobPriority({
|
||||||
team_id: options.teamId,
|
team_id: options.teamId,
|
||||||
basePriority: 10,
|
basePriority: 10,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
const costTracking = new CostTracking();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
if (isUrlBlocked(searchResult.url, flags)) {
|
if (isUrlBlocked(searchResult.url, flags)) {
|
||||||
throw new Error("Could not scrape url: " + BLOCKLISTED_URL_MESSAGE);
|
throw new Error("Could not scrape url: " + BLOCKLISTED_URL_MESSAGE);
|
||||||
@ -104,7 +108,7 @@ async function scrapeSearchResult(
|
|||||||
...options.scrapeOptions,
|
...options.scrapeOptions,
|
||||||
maxAge: 4 * 60 * 60 * 1000,
|
maxAge: 4 * 60 * 60 * 1000,
|
||||||
},
|
},
|
||||||
internalOptions: { teamId: options.teamId, bypassBilling: true },
|
internalOptions: { teamId: options.teamId, bypassBilling: false },
|
||||||
origin: options.origin,
|
origin: options.origin,
|
||||||
is_scrape: true,
|
is_scrape: true,
|
||||||
startTime: Date.now(),
|
startTime: Date.now(),
|
||||||
@ -117,6 +121,19 @@ async function scrapeSearchResult(
|
|||||||
|
|
||||||
const doc: Document = await waitForJob(jobId, options.timeout);
|
const doc: Document = await waitForJob(jobId, options.timeout);
|
||||||
|
|
||||||
|
const actualCostTracking = new CostTracking();
|
||||||
|
const credits = await calculateCreditsToBeBilled(options.scrapeOptions, doc, actualCostTracking);
|
||||||
|
actualCostTracking.addCall({
|
||||||
|
type: "other",
|
||||||
|
metadata: {
|
||||||
|
module: "search",
|
||||||
|
operation: "scrape",
|
||||||
|
url: searchResult.url
|
||||||
|
},
|
||||||
|
cost: credits,
|
||||||
|
model: "search-scrape"
|
||||||
|
});
|
||||||
|
|
||||||
logger.info("Scrape job completed", {
|
logger.info("Scrape job completed", {
|
||||||
scrapeId: jobId,
|
scrapeId: jobId,
|
||||||
url: searchResult.url,
|
url: searchResult.url,
|
||||||
@ -125,13 +142,17 @@ async function scrapeSearchResult(
|
|||||||
});
|
});
|
||||||
await getScrapeQueue().remove(jobId);
|
await getScrapeQueue().remove(jobId);
|
||||||
|
|
||||||
// Move SERP results to top level
|
const document = {
|
||||||
return {
|
|
||||||
title: searchResult.title,
|
title: searchResult.title,
|
||||||
description: searchResult.description,
|
description: searchResult.description,
|
||||||
url: searchResult.url,
|
url: searchResult.url,
|
||||||
...doc,
|
...doc,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
return {
|
||||||
|
document,
|
||||||
|
costTracking: actualCostTracking,
|
||||||
|
};
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(`Error in scrapeSearchResult: ${error}`, {
|
logger.error(`Error in scrapeSearchResult: ${error}`, {
|
||||||
scrapeId: jobId,
|
scrapeId: jobId,
|
||||||
@ -143,8 +164,8 @@ async function scrapeSearchResult(
|
|||||||
if (error?.message?.includes("Could not scrape url")) {
|
if (error?.message?.includes("Could not scrape url")) {
|
||||||
statusCode = 403;
|
statusCode = 403;
|
||||||
}
|
}
|
||||||
// Return a minimal document with SERP results at top level
|
|
||||||
return {
|
const document: Document = {
|
||||||
title: searchResult.title,
|
title: searchResult.title,
|
||||||
description: searchResult.description,
|
description: searchResult.description,
|
||||||
url: searchResult.url,
|
url: searchResult.url,
|
||||||
@ -154,6 +175,11 @@ async function scrapeSearchResult(
|
|||||||
proxyUsed: "basic",
|
proxyUsed: "basic",
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
return {
|
||||||
|
document,
|
||||||
|
costTracking: new CostTracking(),
|
||||||
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -174,9 +200,11 @@ export async function searchController(
|
|||||||
data: [],
|
data: [],
|
||||||
};
|
};
|
||||||
const startTime = new Date().getTime();
|
const startTime = new Date().getTime();
|
||||||
const costTracking = new CostTracking();
|
|
||||||
const isSearchPreview = process.env.SEARCH_PREVIEW_TOKEN !== undefined && process.env.SEARCH_PREVIEW_TOKEN === req.body.__searchPreviewToken;
|
const isSearchPreview = process.env.SEARCH_PREVIEW_TOKEN !== undefined && process.env.SEARCH_PREVIEW_TOKEN === req.body.__searchPreviewToken;
|
||||||
|
|
||||||
|
let credits_billed = 0;
|
||||||
|
let allDocsWithCostTracking: DocumentWithCostTracking[] = [];
|
||||||
|
|
||||||
try {
|
try {
|
||||||
req.body = searchRequestSchema.parse(req.body);
|
req.body = searchRequestSchema.parse(req.body);
|
||||||
|
|
||||||
@ -242,18 +270,18 @@ export async function searchController(
|
|||||||
scrapeOptions: req.body.scrapeOptions,
|
scrapeOptions: req.body.scrapeOptions,
|
||||||
},
|
},
|
||||||
logger,
|
logger,
|
||||||
costTracking,
|
|
||||||
req.acuc?.flags ?? null,
|
req.acuc?.flags ?? null,
|
||||||
(req.acuc?.price_credits ?? 0) <= 3000,
|
(req.acuc?.price_credits ?? 0) <= 3000,
|
||||||
isSearchPreview,
|
isSearchPreview,
|
||||||
),
|
),
|
||||||
);
|
);
|
||||||
|
|
||||||
const docs = await Promise.all(scrapePromises);
|
const docsWithCostTracking = await Promise.all(scrapePromises);
|
||||||
logger.info("Scraping completed", {
|
logger.info("Scraping completed", {
|
||||||
num_docs: docs.length,
|
num_docs: docsWithCostTracking.length,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
const docs = docsWithCostTracking.map(item => item.document);
|
||||||
const filteredDocs = docs.filter(
|
const filteredDocs = docs.filter(
|
||||||
(doc) =>
|
(doc) =>
|
||||||
doc.serpResults || (doc.markdown && doc.markdown.trim().length > 0),
|
doc.serpResults || (doc.markdown && doc.markdown.trim().length > 0),
|
||||||
@ -269,18 +297,34 @@ export async function searchController(
|
|||||||
} else {
|
} else {
|
||||||
responseData.data = filteredDocs;
|
responseData.data = filteredDocs;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
let credits_billed = 0;
|
const finalDocsForBilling = responseData.data;
|
||||||
try {
|
|
||||||
credits_billed = await Promise.all(
|
const creditPromises = finalDocsForBilling.map(async (finalDoc) => {
|
||||||
responseData.data.map(async (document) => {
|
const matchingDocWithCost = docsWithCostTracking.find(item =>
|
||||||
return await calculateCreditsToBeBilled(req.body.scrapeOptions, document, costTracking);
|
item.document.url === finalDoc.url
|
||||||
})
|
);
|
||||||
).then(credits => credits.reduce((sum, credit) => sum + credit, 0));
|
|
||||||
} catch (error) {
|
if (matchingDocWithCost) {
|
||||||
logger.error("Error calculating credits for billing", { error });
|
return await calculateCreditsToBeBilled(
|
||||||
credits_billed = responseData.data.length;
|
req.body.scrapeOptions,
|
||||||
|
matchingDocWithCost.document,
|
||||||
|
matchingDocWithCost.costTracking
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
try {
|
||||||
|
const individualCredits = await Promise.all(creditPromises);
|
||||||
|
credits_billed = individualCredits.reduce((sum, credit) => sum + credit, 0);
|
||||||
|
} catch (error) {
|
||||||
|
logger.error("Error calculating credits for billing", { error });
|
||||||
|
credits_billed = responseData.data.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
allDocsWithCostTracking = docsWithCostTracking;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Bill team once for all successful results
|
// Bill team once for all successful results
|
||||||
@ -317,7 +361,7 @@ export async function searchController(
|
|||||||
scrapeOptions: req.body.scrapeOptions,
|
scrapeOptions: req.body.scrapeOptions,
|
||||||
origin: req.body.origin,
|
origin: req.body.origin,
|
||||||
integration: req.body.integration,
|
integration: req.body.integration,
|
||||||
cost_tracking: costTracking,
|
cost_tracking: allDocsWithCostTracking.length > 0 ? allDocsWithCostTracking[0].costTracking : new CostTracking(),
|
||||||
credits_billed,
|
credits_billed,
|
||||||
},
|
},
|
||||||
false,
|
false,
|
||||||
|
@ -134,7 +134,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
|
|||||||
maxAge: 4 * 60 * 60 * 1000,
|
maxAge: 4 * 60 * 60 * 1000,
|
||||||
storeInCache: true,
|
storeInCache: true,
|
||||||
},
|
},
|
||||||
}, logger, costTracking, acuc?.flags ?? null);
|
}, logger, acuc?.flags ?? null);
|
||||||
return response.length > 0 ? response : [];
|
return response.length > 0 ? response : [];
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -164,10 +164,10 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
|
|||||||
// Filter out already seen URLs and track new ones
|
// Filter out already seen URLs and track new ones
|
||||||
const newSearchResults: typeof searchResults = [];
|
const newSearchResults: typeof searchResults = [];
|
||||||
for (const result of searchResults) {
|
for (const result of searchResults) {
|
||||||
if (!result.url || state.hasSeenUrl(result.url)) {
|
if (!result.document.url || state.hasSeenUrl(result.document.url)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
state.addSeenUrl(result.url);
|
state.addSeenUrl(result.document.url);
|
||||||
|
|
||||||
urlsAnalyzed++;
|
urlsAnalyzed++;
|
||||||
if (urlsAnalyzed >= maxUrls) {
|
if (urlsAnalyzed >= maxUrls) {
|
||||||
@ -183,10 +183,10 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
await state.addSources(newSearchResults.map((result) => ({
|
await state.addSources(newSearchResults.map((result) => ({
|
||||||
url: result.url ?? "",
|
url: result.document.url ?? "",
|
||||||
title: result.title ?? "",
|
title: result.document.title ?? "",
|
||||||
description: result.description ?? "",
|
description: result.document.description ?? "",
|
||||||
icon: result.metadata?.favicon ?? "",
|
icon: result.document.metadata?.favicon ?? "",
|
||||||
})));
|
})));
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"[Deep Research] New unique results count:",
|
"[Deep Research] New unique results count:",
|
||||||
@ -218,8 +218,8 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
|
|||||||
|
|
||||||
await state.addFindings(
|
await state.addFindings(
|
||||||
newSearchResults.map((result) => ({
|
newSearchResults.map((result) => ({
|
||||||
text: result.markdown ?? "",
|
text: result.document.markdown ?? "",
|
||||||
source: result.url ?? "",
|
source: result.document.url ?? "",
|
||||||
})),
|
})),
|
||||||
);
|
);
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user