mirror of https://github.com/mendableai/firecrawl.git
Nick: __experimental_streamSteps

parent 558a7f4c08
commit 033e9bbf29
@@ -36,5 +36,6 @@ export async function extractStatusController(
     status: extract.status,
     error: extract?.error ?? undefined,
     expiresAt: (await getExtractExpiry(req.params.jobId)).toISOString(),
+    steps: extract.showSteps ? extract.steps : undefined,
   });
 }
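With __experimental_streamSteps enabled, the status controller above returns the recorded steps alongside status, error, and expiresAt. A minimal sketch of how a client might consume this; the API_URL and API_KEY environment variables and the exact /v1/extract/{jobId} route shape are assumptions for illustration, not part of this commit:

// Sketch: poll the extract status endpoint and print streamed steps.
async function pollExtractSteps(jobId: string): Promise<void> {
  const res = await fetch(`${process.env.API_URL}/v1/extract/${jobId}`, {
    headers: { Authorization: `Bearer ${process.env.API_KEY}` },
  });
  const body = await res.json();
  // `steps` is only present when the job was created with
  // __experimental_streamSteps: true (i.e. showSteps was stored).
  for (const s of body.steps ?? []) {
    console.log(s.step, s.discoveredLinks?.length ?? 0, "links discovered");
  }
}
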
@@ -70,6 +70,7 @@ export async function extractController(
     plan: req.auth.plan,
     createdAt: Date.now(),
     status: "processing",
+    showSteps: req.body.__experimental_streamSteps,
   });
 
   if (Sentry.isInitialized()) {
@@ -221,6 +221,7 @@ export const extractV1Options = z
     allowExternalLinks: z.boolean().default(false),
     origin: z.string().optional().default("api"),
     urlTrace: z.boolean().default(false),
+    __experimental_streamSteps: z.boolean().default(false),
     timeout: z.number().int().positive().finite().safe().default(60000),
   })
   .strict(strictMessage);
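Because the flag defaults to false and the schema is .strict(), existing clients are unaffected and a misspelled key is rejected rather than silently ignored. For illustration, a request body exercising the new flag, assuming the surrounding extractV1Options schema also accepts urls and prompt as in the v1 extract API:

// Sketch: the new flag is an ordinary boolean with a default.
const parsed = extractV1Options.parse({
  urls: ["https://example.com"],
  prompt: "Extract the page title",
  __experimental_streamSteps: true, // opt in to step recording
});
// parsed.__experimental_streamSteps === true; when omitted it is false.
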
@@ -1,6 +1,25 @@
 import { redisConnection } from "../../services/queue-service";
 import { logger as _logger } from "../logger";
 
+export enum ExtractStep {
+  INITIAL = "initial",
+  MULTI_ENTITY = "multi-entity",
+  MULTI_ENTITY_SCRAPE = "multi-entity-scrape",
+  MULTI_ENTITY_EXTRACT = "multi-entity-extract",
+  SCRAPE = "scrape",
+  MAP = "map",
+  EXTRACT = "extract",
+  COMPLETE = "complete",
+}
+
+export type ExtractedStep = {
+  step: ExtractStep;
+  startedAt: number;
+  finishedAt: number;
+  error?: any;
+  discoveredLinks?: string[];
+};
+
 export type StoredExtract = {
   id: string;
   team_id: string;
@@ -8,6 +27,8 @@ export type StoredExtract = {
   createdAt: number;
   status: "processing" | "completed" | "failed" | "cancelled";
   error?: any;
+  showSteps?: boolean;
+  steps?: ExtractedStep[];
 };
 
 export async function saveExtract(id: string, extract: StoredExtract) {
@@ -27,6 +48,12 @@ export async function updateExtract(
 ) {
   const current = await getExtract(id);
   if (!current) return;
+
+  // Handle steps aggregation
+  if (extract.steps && current.steps) {
+    extract.steps = [...current.steps, ...extract.steps];
+  }
+
   await redisConnection.set(
     "extract:" + id,
     JSON.stringify({ ...current, ...extract }),
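Note that updateExtract appends incoming steps to the stored ones before the shallow merge, so each partial update extends the timeline instead of overwriting it; if current.steps is still undefined, the incoming array is taken as-is. A self-contained sketch of that merge rule, with plain objects standing in for the Redis payloads:

// Sketch of the append semantics used by updateExtract.
type Step = { step: string; startedAt: number; finishedAt: number };
type Stored = { status?: string; steps?: Step[] };

function mergeExtract(current: Stored, update: Stored): Stored {
  if (update.steps && current.steps) {
    // Preserve the existing timeline, then concatenate the new entries.
    update = { ...update, steps: [...current.steps, ...update.steps] };
  }
  return { ...current, ...update };
}

const merged = mergeExtract(
  { steps: [{ step: "initial", startedAt: 1, finishedAt: 1 }] },
  { steps: [{ step: "map", startedAt: 2, finishedAt: 2 }] },
);
// merged.steps now holds both entries, in order: "initial", then "map".
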
@@ -24,7 +24,7 @@ import Ajv from "ajv";
 const ajv = new Ajv();
 
 const openai = new OpenAI();
-import { updateExtract } from "./extract-redis";
+import { ExtractStep, updateExtract } from "./extract-redis";
 import { deduplicateObjectsArray } from "./helpers/deduplicate-objs-array";
 import { mergeNullValObjs } from "./helpers/merge-null-val-objs";
 import { CUSTOM_U_TEAMS } from "./config";
@@ -157,6 +157,19 @@ export async function performExtraction(
   let multiEntityCompletions: completions[] = [];
   let multiEntityResult: any = {};
   let singleAnswerResult: any = {};
+
+  await updateExtract(extractId, {
+    status: "processing",
+    steps: [
+      {
+        step: ExtractStep.INITIAL,
+        startedAt: Date.now(),
+        finishedAt: Date.now(),
+        discoveredLinks: request.urls,
+      },
+    ],
+  });
+
   // Process URLs
   const urlPromises = request.urls.map((url) =>
     processUrl(
@@ -188,6 +201,18 @@
     };
   }
 
+  await updateExtract(extractId, {
+    status: "processing",
+    steps: [
+      {
+        step: ExtractStep.MAP,
+        startedAt: Date.now(),
+        finishedAt: Date.now(),
+        discoveredLinks: links,
+      },
+    ],
+  });
+
   let reqSchema = request.schema;
   reqSchema = await dereferenceSchema(reqSchema);
 
@@ -209,8 +234,32 @@
     const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(reqSchema, multiEntityKeys)
     rSchema = singleAnswerSchema;
 
+    await updateExtract(extractId, {
+      status: "processing",
+      steps: [
+        {
+          step: ExtractStep.MULTI_ENTITY,
+          startedAt: Date.now(),
+          finishedAt: Date.now(),
+          discoveredLinks: [],
+        },
+      ],
+    });
+
+
     const timeout = Math.floor((request.timeout || 40000) * 0.7) || 30000;
 
+    await updateExtract(extractId, {
+      status: "processing",
+      steps: [
+        {
+          step: ExtractStep.MULTI_ENTITY_SCRAPE,
+          startedAt: Date.now(),
+          finishedAt: Date.now(),
+          discoveredLinks: [],
+        },
+      ],
+    });
     const scrapePromises = links.map((url) => {
       if (!docsMap.has(url)) {
         return scrapeDocument(
@@ -298,6 +347,18 @@
           };
           // console.log("schemaWithConfidence", schemaWithConfidence);
 
+          await updateExtract(extractId, {
+            status: "processing",
+            steps: [
+              {
+                step: ExtractStep.MULTI_ENTITY_EXTRACT,
+                startedAt: Date.now(),
+                finishedAt: Date.now(),
+                discoveredLinks: [doc.metadata.url || doc.metadata.sourceURL || ""],
+              },
+            ],
+          });
+
           const completionPromise = generateOpenAICompletions(
             logger.child({ method: "extractService/generateOpenAICompletions" }),
             {
@@ -386,6 +447,17 @@
 
     // let rerank = await rerankLinks(links.map((url) => ({ url })), request.prompt ?? JSON.stringify(request.schema), urlTraces);
 
+    await updateExtract(extractId, {
+      status: "processing",
+      steps: [
+        {
+          step: ExtractStep.SCRAPE,
+          startedAt: Date.now(),
+          finishedAt: Date.now(),
+          discoveredLinks: links,
+        },
+      ],
+    });
     const scrapePromises = links.map((url) => {
       if (!docsMap.has(url)) {
         return scrapeDocument(
@@ -431,6 +503,18 @@
       };
     }
 
+    await updateExtract(extractId, {
+      status: "processing",
+      steps: [
+        {
+          step: ExtractStep.EXTRACT,
+          startedAt: Date.now(),
+          finishedAt: Date.now(),
+          discoveredLinks: [],
+        },
+      ],
+    });
+
     // Generate completions
     singleAnswerCompletions = await generateOpenAICompletions(
       logger.child({ method: "extractService/generateOpenAICompletions" }),
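Every phase of performExtraction records its progress with the same shape of updateExtract call: status "processing" plus a single-element steps array that the Redis layer appends. The commit inlines each call; purely as an illustration, the repetition could be collapsed into a small wrapper (logStep is a hypothetical name, not in the codebase):

// Hypothetical convenience wrapper around updateExtract; not part of the commit.
import { ExtractStep, updateExtract } from "./extract-redis";

async function logStep(
  extractId: string,
  step: ExtractStep,
  discoveredLinks: string[] = [],
): Promise<void> {
  const now = Date.now();
  await updateExtract(extractId, {
    status: "processing",
    steps: [{ step, startedAt: now, finishedAt: now, discoveredLinks }],
  });
}

// Equivalent to the inlined calls above, e.g.:
// await logStep(extractId, ExtractStep.MAP, links);
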
@@ -7,6 +7,8 @@ import { generateBasicCompletion } from "../LLM-extraction";
 import { buildRefrasedPrompt } from "./build-prompts";
 import { rerankLinksWithLLM } from "./reranker";
 import { extractConfig } from "./config";
+import { updateExtract } from "./extract-redis";
+import { ExtractStep } from "./extract-redis";
 
 interface ProcessUrlOptions {
   url: string;
@@ -157,6 +159,8 @@ export async function processUrl(
       extractConfig.RERANKING.MAX_INITIAL_RANKING_LIMIT,
     );
 
+
+
     // Perform reranking using either prompt or schema
     let searchQuery = "";
     if (options.prompt) {
@@ -6,7 +6,7 @@ const saveMocksDirPath = path.join(__dirname, "../mocks/").replace("dist/", "");
 const loadMocksDirPath = path.join(__dirname, "../../../__tests__/snips/mocks");
 
 export async function saveMock(options: unknown, result: unknown) {
-    if (!process.env.FIRECRAWL_SAVE_MOCKS) return;
+    if (process.env.FIRECRAWL_SAVE_MOCKS !== "true") return;
 
     await fs.mkdir(saveMocksDirPath, { recursive: true });
 
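This tightens the mock-capture guard from truthiness to an exact string match, so setting FIRECRAWL_SAVE_MOCKS to "false" or "0" (non-empty, hence truthy as a string) no longer turns mock saving on:

// Behavior sketch for FIRECRAWL_SAVE_MOCKS="false":
// old guard: !process.env.FIRECRAWL_SAVE_MOCKS is false -> mocks written (unintended)
// new guard: process.env.FIRECRAWL_SAVE_MOCKS !== "true" -> returns early, nothing written
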
@@ -930,12 +930,12 @@ export default class FirecrawlApp {
    * @returns The response from the extract operation.
    */
   async asyncExtract(
-    url: string,
+    urls: string[],
     params?: ExtractParams,
     idempotencyKey?: string
   ): Promise<ExtractResponse | ErrorResponse> {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData: any = { url, ...params };
+    let jsonData: any = { urls, ...params };
     let jsonSchema: any;
 
     try {
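The SDK fix aligns asyncExtract with the v1 extract endpoint, which takes an array of URLs; the request body key changes from url to urls accordingly. A usage sketch, with a placeholder apiKey:

// Sketch: starting an async extract job with the corrected signature.
import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-KEY" });
const started = await app.asyncExtract(
  ["https://example.com", "https://example.com/about"],
  { prompt: "Extract the company mission statement" },
);
// On success, the response carries the job id to poll for status (and steps,
// when __experimental_streamSteps was set).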