Nick: fixed

Nicolas 2025-01-03 22:50:53 -03:00
parent a4f7c38834
commit c655c6859f
2 changed files with 63 additions and 41 deletions

lib/canonical-url.ts (new file)

@@ -0,0 +1,7 @@
+export function normalizeUrl(url: string) {
+  url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
+  if (url.endsWith("/")) {
+    url = url.slice(0, -1);
+  }
+  return url;
+}

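For context, the new helper strips the scheme, a leading "www.", and a single trailing slash, so http/https and www/non-www variants of the same page collapse to one key. A minimal sketch of its behavior; the import path is assumed from the "../lib/canonical-url" import added below, and the snippet itself is not part of the commit:

import { normalizeUrl } from "./lib/canonical-url"; // assumed location of the new file

console.log(normalizeUrl("https://www.example.com/")); // "example.com"
console.log(normalizeUrl("http://example.com/docs/")); // "example.com/docs"
console.log(normalizeUrl("example.com/docs"));         // "example.com/docs" (already canonical)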

@@ -51,6 +51,7 @@ import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
 import { indexPage } from "../lib/extract/index/pinecone";
 import { Document } from "../controllers/v1/types";
 import { supabase_service } from "../services/supabase";
+import { normalizeUrl } from "../lib/canonical-url";
 configDotenv();
@@ -78,44 +79,57 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
 async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
   if (await finishCrawl(job.data.crawl_id)) {
+    (async () => {
+      const originUrl = sc.originUrl ? normalizeUrl(sc.originUrl) : undefined;
       // Get all visited URLs from Redis
-    const visitedUrls = await redisConnection.smembers("crawl:" + job.data.crawl_id + ":visited");
+      const visitedUrls = await redisConnection.smembers(
+        "crawl:" + job.data.crawl_id + ":visited",
+      );
       // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
-    if (visitedUrls.length > 0 && job.data.crawlerOptions !== null) {
+      if (visitedUrls.length > 0 && job.data.crawlerOptions !== null && originUrl) {
+        // Fire and forget the upload to Supabase
         try {
+          // Standardize URLs to canonical form (https, no www)
+          const standardizedUrls = [
+            ...new Set(
+              visitedUrls.map((url) => {
+                return normalizeUrl(url);
+              }),
+            ),
+          ];
           // First check if entry exists for this origin URL
           const { data: existingMap } = await supabase_service
-          .from('crawl_maps')
-          .select('urls')
-          .eq('origin_url', sc.originUrl)
+            .from("crawl_maps")
+            .select("urls")
+            .eq("origin_url", originUrl)
             .single();
           if (existingMap) {
             // Merge URLs, removing duplicates
-          const mergedUrls = [...new Set([...existingMap.urls, ...visitedUrls])];
+            const mergedUrls = [
+              ...new Set([...existingMap.urls, ...standardizedUrls]),
+            ];
             const { error } = await supabase_service
-            .from('crawl_maps')
+              .from("crawl_maps")
               .update({
                 urls: mergedUrls,
                 num_urls: mergedUrls.length,
-              updated_at: new Date().toISOString()
+                updated_at: new Date().toISOString(),
               })
-            .eq('origin_url', sc.originUrl);
+              .eq("origin_url", originUrl);
             if (error) {
               _logger.error("Failed to update crawl map", { error });
             }
           } else {
             // Insert new entry if none exists
-          const { error } = await supabase_service
-            .from('crawl_maps')
-            .insert({
-              origin_url: sc.originUrl,
-              urls: visitedUrls,
-              num_urls: visitedUrls.length,
-              created_at: new Date().toISOString()
+            const { error } = await supabase_service.from("crawl_maps").insert({
+              origin_url: originUrl,
+              urls: standardizedUrls,
+              num_urls: standardizedUrls.length,
+              created_at: new Date().toISOString(),
+              updated_at: new Date().toISOString(),
             });
             if (error) {
@@ -126,6 +140,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
           _logger.error("Error saving crawl map", { error });
         }
       }
+    })();
 
     if (!job.data.v1) {
       const jobIDs = await getCrawlJobs(job.data.crawl_id);
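Taken together, the worker now canonicalizes the crawl's origin URL once, normalizes and de-duplicates every visited URL before it is written, and runs the whole Supabase upload inside an un-awaited async IIFE, so finishing a crawl no longer blocks on the crawl_maps write (the catch shown above turns failures into log entries instead of unhandled rejections). A standalone sketch of the merge step, assuming the same helper; mergeCrawlMapUrls is a hypothetical name used only for illustration and does not exist in the codebase:

import { normalizeUrl } from "./lib/canonical-url"; // assumed location of the new file

// Union the URLs already stored for an origin with the newly visited ones,
// normalizing the new ones so scheme/www/trailing-slash variants collapse.
function mergeCrawlMapUrls(existing: string[], visited: string[]): string[] {
  return [...new Set([...existing, ...visited.map((url) => normalizeUrl(url))])];
}

// mergeCrawlMapUrls(
//   ["example.com/docs"],
//   ["https://www.example.com/docs/", "https://example.com/blog"],
// )
// -> ["example.com/docs", "example.com/blog"]

As in the diff, only the incoming URLs are normalized; rows already stored in crawl_maps are merged as-is.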