2024-12-26 12:41:37 -03:00
import { MapDocument , URLTrace } from "../../controllers/v1/types" ;
import { performRanking } from "../ranker" ;
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist" ;
import { logger } from "../logger" ;
2024-10-28 16:02:07 -03:00
import { CohereClient } from "cohere-ai" ;
2024-12-31 18:06:07 -03:00
import { extractConfig } from "./config" ;
2025-01-13 22:30:15 -03:00
import { searchSimilarPages } from "./index/pinecone" ;
2025-02-20 18:48:58 -03:00
import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract" ;
2025-01-22 17:26:32 -03:00
import { buildRerankerUserPrompt } from "./build-prompts" ;
import { buildRerankerSystemPrompt } from "./build-prompts" ;
2025-01-24 18:09:25 -03:00
import { dumpToFile } from "./helpers/dump-to-file" ;
2025-04-15 00:19:45 -07:00
import { getModel } from "../generic-ai" ;
import fs from "fs/promises" ;
2025-04-17 09:23:53 -07:00
import { CostTracking } from "./extraction-service" ;
2025-04-15 00:19:45 -07:00
// Relevance-score cutoffs applied by rerankLinksWithLLM to the scores returned
// by the LLM. A link is kept only when its score is strictly greater than the
// cutoff for the current mode.
// Cutoff used when the request is not a multi-entity extraction.
const THRESHOLD_FOR_SINGLEPAGE = 0.6 ;
// Cutoff used when the request is a multi-entity extraction (lower, so more links pass).
const THRESHOLD_FOR_MULTIENTITY = 0.45 ;
2024-12-26 12:41:37 -03:00
2024-10-28 16:02:07 -03:00
/**
 * Module-wide Cohere client used by `rerankDocuments`.
 * The API token is read from the `COHERE_API_KEY` environment variable at load time.
 */
const cohere = new CohereClient({ token: process.env.COHERE_API_KEY });
2024-12-26 12:41:37 -03:00
// Pairing of candidate documents with their per-link ranking scores.
// NOTE(review): this interface is not referenced anywhere in the visible part
// of the file; the same element shape is spelled inline in filterAndProcessLinks.
interface RankingResult {
// Candidate documents that were ranked.
mappedLinks : MapDocument [ ] ;
// One entry per scored link.
linksAndScores : {
// URL of the scored link; elsewhere in this file it is matched against MapDocument.url.
link : string ;
// presumably the text that was ranked for this link (URL plus title/description) — TODO confirm against performRanking
linkWithContext : string ;
// Relevance score; elsewhere in this file it is compared against score thresholds.
score : number ;
// presumably the link's position in the input list before ranking — TODO confirm against performRanking
originalIndex : number ;
} [ ] ;
}
2024-10-28 16:02:07 -03:00
/**
 * Reranks `documents` against `query` using the Cohere v2 rerank endpoint.
 *
 * @param documents Texts (or string-keyed records) to be ranked.
 * @param query Query the documents are ranked against.
 * @param topN Number of results requested from Cohere (default 3).
 * @param model Cohere rerank model name (default "rerank-english-v3.0").
 * @returns The results ordered by descending relevance score, each reduced to
 *          its `document`, `index` and `relevanceScore` fields.
 */
export async function rerankDocuments(
  documents: (string | Record<string, string>)[],
  query: string,
  topN = 3,
  model = "rerank-english-v3.0",
) {
  const response = await cohere.v2.rerank({
    documents,
    query,
    topN,
    model,
    returnDocuments: true,
  });

  // Order the response entries from most to least relevant.
  const byRelevanceDesc = response.results.sort(
    (left, right) => right.relevanceScore - left.relevanceScore,
  );

  // Expose only the fields callers consume.
  return byRelevanceDesc.map(({ document, index, relevanceScore }) => ({
    document,
    index,
    relevanceScore,
  }));
}
2024-12-26 12:41:37 -03:00
/**
 * Ranks `mappedLinks` against `searchQuery` and returns the links that should
 * be used for completion, while recording the outcome on `urlTraces`.
 *
 * Selection happens in up to three steps:
 *  1. keep links scoring above the initial relevance threshold;
 *  2. if fewer than the configured minimum remain, retry with the fallback threshold;
 *  3. if nothing passes even then, take the top-scoring links regardless of score.
 * The surviving list is finally capped at the configured ranking limit.
 *
 * Side effects: every trace in `urlTraces` whose URL was scored gets its
 * `relevanceScore` set, and `usedInCompletion` / `warning` are updated to say
 * whether the URL was kept, filtered out, or dropped by the ranking limit.
 *
 * @param mappedLinks Candidate documents to rank.
 * @param searchQuery Query the candidates are ranked against.
 * @param urlTraces Trace records, mutated in place.
 * @returns The links selected for completion, at most the configured ranking limit.
 */
export async function rerankLinks(
  mappedLinks: MapDocument[],
  searchQuery: string,
  urlTraces: URLTrace[],
): Promise<MapDocument[]> {
  const {
    INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE: initialThreshold,
    FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE: fallbackThreshold,
    MIN_REQUIRED_LINKS: minRequiredLinks,
    MAX_RANKING_LIMIT_FOR_RELEVANCE: maxRankedLinks,
  } = extractConfig.RERANKING;

  // Each link is ranked on its URL plus whatever title/description it carries.
  const mappedLinksRerank = mappedLinks.map(
    (x) => `url: ${x.url}, title: ${x.title}, description: ${x.description}`,
  );

  const linksAndScores = await performRanking(
    mappedLinksRerank,
    mappedLinks.map((l) => l.url),
    searchQuery,
  );

  // First try with the high threshold.
  let filteredLinks = filterAndProcessLinks(
    mappedLinks,
    linksAndScores,
    initialThreshold,
  );

  // Not enough high-quality links: retry with the lower threshold.
  if (filteredLinks.length < minRequiredLinks) {
    logger.info(
      `Only found ${filteredLinks.length} links with score > ${initialThreshold}. Trying lower threshold...`,
    );
    filteredLinks = filterAndProcessLinks(
      mappedLinks,
      linksAndScores,
      fallbackThreshold,
    );

    if (filteredLinks.length === 0) {
      // Still nothing: take the top N results regardless of score.
      logger.warn(
        `No links found with score > ${fallbackThreshold}. Taking top ${minRequiredLinks} results.`,
      );
      // Sort a copy so the ranking result itself is left untouched.
      filteredLinks = [...linksAndScores]
        .sort((a, b) => b.score - a.score)
        .slice(0, minRequiredLinks)
        .map((x) => mappedLinks.find((link) => link.url === x.link))
        .filter(
          (x): x is MapDocument =>
            x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
        );
    }
  }

  // Index traces by URL once instead of scanning the trace list for every
  // link. Only the first trace per URL is kept, which is the trace a linear
  // `find` would have returned.
  const traceByUrl = new Map<string, URLTrace>();
  for (const trace of urlTraces) {
    if (!traceByUrl.has(trace.url)) {
      traceByUrl.set(trace.url, trace);
    }
  }
  const keptUrls = new Set(filteredLinks.map((link) => link.url));

  // Record relevance scores and mark URLs that did not survive filtering.
  for (const { link, score } of linksAndScores) {
    const trace = traceByUrl.get(link);
    if (!trace) {
      continue;
    }
    trace.relevanceScore = score;
    if (!keptUrls.has(link)) {
      trace.warning = `Relevance score ${score} below threshold`;
      trace.usedInCompletion = false;
    }
  }

  const rankedLinks = filteredLinks.slice(0, maxRankedLinks);

  // Mark URLs that will be used in completion.
  for (const link of rankedLinks) {
    const trace = traceByUrl.get(link.url);
    if (trace) {
      trace.usedInCompletion = true;
    }
  }

  // Mark URLs that passed filtering but were dropped by the ranking limit.
  for (const link of filteredLinks.slice(maxRankedLinks)) {
    const trace = traceByUrl.get(link.url);
    if (trace) {
      trace.warning = "Excluded due to ranking limit";
      trace.usedInCompletion = false;
    }
  }

  return rankedLinks;
}
/**
 * Resolves scored links back to their documents, keeping only those whose
 * score is strictly greater than `threshold` and whose URL is not blocked.
 *
 * @param mappedLinks Candidate documents; when several share a URL the first one wins.
 * @param linksAndScores Scored links, matched to documents by `link === url`.
 * @param threshold Exclusive lower bound on the score.
 * @returns Matching documents in the order of `linksAndScores`. Scored links
 *          with no matching document are skipped.
 */
function filterAndProcessLinks(
  mappedLinks: MapDocument[],
  linksAndScores: {
    link: string;
    linkWithContext: string;
    score: number;
    originalIndex: number;
  }[],
  threshold: number,
): MapDocument[] {
  // Build the URL index once so each scored link is resolved in O(1) instead
  // of scanning `mappedLinks` every time. First occurrence wins, matching the
  // result of a linear search.
  const firstDocByUrl = new Map<string, MapDocument>();
  for (const doc of mappedLinks) {
    if (doc.url !== undefined && !firstDocByUrl.has(doc.url)) {
      firstDocByUrl.set(doc.url, doc);
    }
  }

  const kept: MapDocument[] = [];
  for (const { link, score } of linksAndScores) {
    // Written as a negated ">" so NaN scores are rejected as well.
    if (!(score > threshold)) {
      continue;
    }
    const doc = firstDocByUrl.get(link);
    if (doc !== undefined && !isUrlBlocked(doc.url)) {
      kept.push(doc);
    }
  }
  return kept;
}
2025-01-13 22:30:15 -03:00
2025-01-19 22:04:12 -03:00
/** Outcome of `rerankLinksWithLLM`. */
export type RerankerResult = {
  /** Links that passed the relevance threshold, each annotated with the LLM's score and reason. */
  mapDocument: Array<MapDocument & { relevanceScore?: number; reason?: string }>;
  /** Sum of the token counts reported by the LLM completions. */
  tokensUsed: number;
  /** Sum of the costs reported by the LLM completions. */
  cost: number;
};
2025-01-13 22:30:15 -03:00
2025-01-24 09:08:16 -03:00
/** Input of `rerankLinksWithLLM`. */
export type RerankerOptions = {
  /** Candidate links to be scored. */
  links: MapDocument[];
  /** User query the links are scored against. */
  searchQuery: string;
  /** Trace records for the candidate URLs. */
  urlTraces: URLTrace[];
  /** Selects the multi-entity prompt and score threshold instead of the single-answer ones. */
  isMultiEntity: boolean;
  /** Free-form reasoning text embedded in the multi-entity scoring prompt. */
  reasoning: string;
  /** Entity names listed in the multi-entity prompt. */
  multiEntityKeys: string[];
  /** Indicator phrases listed in the multi-entity prompt. */
  keyIndicators: string[];
  /** Cost tracker forwarded to the completion call. */
  costTracking: CostTracking;
};
2025-04-15 00:19:45 -07:00
/** One scored entry as requested from the LLM by the schema in `rerankLinksWithLLM`. */
type LLMRankedLink = {
  url: string;
  // Declared as a number in the schema, but model output is untrusted and may be a numeric string.
  relevanceScore: number | string;
  reason?: string;
};

/** Renders a chunk of links as the text block handed to the model (blank line between links). */
function formatLinksForReranker(chunk: MapDocument[]): string {
  return chunk
    .map((link) => {
      const title = link.title ? `\nTitle: ${link.title}` : "";
      const description = link.description
        ? `\nDescription: ${link.description}`
        : "";
      return `URL: ${link.url}${title}${description}`;
    })
    .join("\n\n");
}

/** Builds the system prompt holding the task description and scoring rubric for the current mode. */
function buildLinkScoringSystemPrompt(
  options: Pick<
    RerankerOptions,
    | "searchQuery"
    | "isMultiEntity"
    | "reasoning"
    | "multiEntityKeys"
    | "keyIndicators"
  >,
): string {
  const { searchQuery, isMultiEntity, reasoning, multiEntityKeys, keyIndicators } =
    options;

  // The `?? []` guards keep a missing list from throwing while the prompt is built.
  const taskLines = isMultiEntity
    ? [
        `IMPORTANT: This is a multi-entity extraction task looking for ${(multiEntityKeys ?? []).join(", ")}.`,
        "Score URLs higher if they contain ANY instance of the target entities.",
        `Key indicators to look for: ${(keyIndicators ?? []).join(", ")}`,
      ]
    : [
        "IMPORTANT: This is a specific information task.",
        "Score URLs based on precision and relevance to answering the query.",
      ];

  const rubricLines = isMultiEntity
    ? [
        "- 1.0: Contains ANY instance of target entities, even just one. Give this score if page has any relevant entity. If you are not sure if this page is relevant or not, give it a score of 1.0",
        "- 0.8: Contains entity but may be incomplete information",
        "- 0.6: Mentions entity type but no clear instance",
        "- 0.4: Only tangentially related to entity type",
        "- Below 0.4: No mention of relevant entities, or duplicates",
        `Reason: ${reasoning}`,
      ]
    : [
        "- 1.0: Contains direct, authoritative answer to query. Give this score if unsure about relevance. If you are not sure if this page is relevant or not, give it a score of 1.0",
        "- 0.8: Contains information that directly helps answer the query",
        "- 0.6: Contains related information that partially answers query",
        "- Below 0.6: Information too general or not focused on query",
      ];

  return [
    `You are analyzing URLs for ${isMultiEntity ? "collecting multiple items" : "specific information"}.`,
    `The user's query is: ${searchQuery}`,
    ...taskLines,
    "Scoring guidelines:",
    ...rubricLines,
  ].join("\n");
}

/**
 * Scores candidate links with an LLM and returns those above the relevance
 * threshold for the current mode, ordered by descending score.
 *
 * Links are split into chunks that are scored concurrently. A chunk whose
 * completion call throws or returns nothing is retried (up to MAX_RETRIES
 * extra attempts) and contributes no links if every attempt fails; this
 * function does not reject because of a failed chunk. A completion that
 * comes back without a `relevantLinks` array is not retried.
 *
 * @param options Links, query, prompt inputs and cost tracker; see `RerankerOptions`.
 * @returns The kept links annotated with `relevanceScore` and `reason`, plus
 *          the token count and cost summed over every completion received.
 */
export async function rerankLinksWithLLM(
  options: RerankerOptions,
): Promise<RerankerResult> {
  const { links, searchQuery, isMultiEntity, costTracking } = options;
  const CHUNK_SIZE = 5000;
  const MAX_RETRIES = 2;

  // Split links into chunks of at most CHUNK_SIZE.
  const chunks: MapDocument[][] = [];
  for (let start = 0; start < links.length; start += CHUNK_SIZE) {
    chunks.push(links.slice(start, start + CHUNK_SIZE));
  }

  const schema = {
    type: "object",
    properties: {
      relevantLinks: {
        type: "array",
        items: {
          type: "object",
          properties: {
            url: { type: "string" },
            relevanceScore: { type: "number" },
            reason: {
              type: "string",
              description:
                "The reason why you chose the score for this link given the intent.",
            },
          },
          required: ["url", "relevanceScore", "reason"],
        },
      },
    },
    required: ["relevantLinks"],
  };

  // The system prompt is the same for every chunk and attempt, so build it once.
  const systemPrompt = buildLinkScoringSystemPrompt(options);

  let totalTokensUsed = 0;
  let totalCost = 0;

  const scoreChunk = async (
    chunk: MapDocument[],
    chunkIndex: number,
  ): Promise<LLMRankedLink[]> => {
    const linksContent = formatLinksForReranker(chunk);

    for (let retry = 0; retry <= MAX_RETRIES; retry++) {
      const attemptLogger = logger.child({
        method: "rerankLinksWithLLM",
        chunk: chunkIndex + 1,
        retry,
      });

      try {
        // The completion's shape is declared outside this module; it is
        // narrowed immediately below.
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        const completion: any = await generateCompletions({
          model: getModel("gemini-2.5-pro-preview-03-25", "vertex"),
          retryModel: getModel("gemini-2.5-pro-preview-03-25", "google"),
          logger: attemptLogger,
          options: {
            mode: "llm",
            systemPrompt,
            prompt: buildRerankerUserPrompt(searchQuery),
            schema,
          },
          markdown: linksContent,
          isExtractEndpoint: true,
          costTrackingOptions: {
            costTracking,
            metadata: {
              module: "extract",
              method: "rerankLinksWithLLM",
            },
          },
        });

        if (!completion) {
          continue;
        }

        // Account for every completion received, even one without usable
        // links. Missing or NaN figures count as zero so the totals stay valid.
        totalCost += completion.cost || 0;
        totalTokensUsed += completion.numTokens || 0;

        const ranked: unknown = completion.extract?.relevantLinks;
        return Array.isArray(ranked) ? (ranked as LLMRankedLink[]) : [];
      } catch (error) {
        attemptLogger.warn(
          `Error processing chunk ${chunkIndex + 1} attempt ${retry + 1}:`,
          { error },
        );
      }
    }

    // Every attempt failed: this chunk contributes nothing.
    return [];
  };

  const scoredChunks = await Promise.all(chunks.map(scoreChunk));

  const threshold = isMultiEntity
    ? THRESHOLD_FOR_MULTIENTITY
    : THRESHOLD_FOR_SINGLEPAGE;

  // Index the input links by URL once; the first occurrence wins, matching
  // the result of a linear search.
  const linkByUrl = new Map<string, MapDocument>();
  for (const link of links) {
    if (!linkByUrl.has(link.url)) {
      linkByUrl.set(link.url, link);
    }
  }

  const relevantLinks: RerankerResult["mapDocument"] = [];
  for (const entry of scoredChunks.flat()) {
    // Model output is untrusted: skip anything that is not an object.
    if (entry === null || typeof entry !== "object") {
      continue;
    }
    // Coerce once; NaN fails the comparison and is dropped.
    const score = Number(entry.relevanceScore);
    if (!(score > threshold)) {
      continue;
    }
    const link = linkByUrl.get(entry.url);
    if (link) {
      relevantLinks.push({ ...link, relevanceScore: score, reason: entry.reason });
    }
  }

  // Highest score first. The sort is stable, so ties keep their chunk order.
  relevantLinks.sort(
    (a, b) => (b.relevanceScore ?? 0) - (a.relevanceScore ?? 0),
  );

  return {
    mapDocument: relevantLinks,
    tokensUsed: totalTokensUsed,
    cost: totalCost,
  };
}