Nick: fixed

Nicolas 2025-01-03 22:50:53 -03:00
parent a4f7c38834
commit c655c6859f
2 changed files with 63 additions and 41 deletions

lib/canonical-url.ts (new file)

@@ -0,0 +1,7 @@
+export function normalizeUrl(url: string) {
+  url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
+  if (url.endsWith("/")) {
+    url = url.slice(0, -1);
+  }
+  return url;
+}

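For context, the new helper strips the scheme, a leading "www.", and a single trailing slash, so http/https and www/non-www variants of the same page collapse to one key. A minimal sketch of its behavior; the import path is assumed from the "../lib/canonical-url" import added below, and the snippet itself is not part of the commit:

import { normalizeUrl } from "./lib/canonical-url"; // assumed location of the new file

console.log(normalizeUrl("https://www.example.com/")); // "example.com"
console.log(normalizeUrl("http://example.com/docs/")); // "example.com/docs"
console.log(normalizeUrl("example.com/docs"));         // "example.com/docs" (already canonical)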

@@ -51,6 +51,7 @@ import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
 import { indexPage } from "../lib/extract/index/pinecone";
 import { Document } from "../controllers/v1/types";
 import { supabase_service } from "../services/supabase";
+import { normalizeUrl } from "../lib/canonical-url";
 configDotenv();
@@ -78,44 +79,57 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
 async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
   if (await finishCrawl(job.data.crawl_id)) {
+    (async () => {
+      const originUrl = sc.originUrl ? normalizeUrl(sc.originUrl) : undefined;
       // Get all visited URLs from Redis
-    const visitedUrls = await redisConnection.smembers("crawl:" + job.data.crawl_id + ":visited");
+      const visitedUrls = await redisConnection.smembers(
+        "crawl:" + job.data.crawl_id + ":visited",
+      );
       // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
-    if (visitedUrls.length > 0 && job.data.crawlerOptions !== null) {
+      if (visitedUrls.length > 0 && job.data.crawlerOptions !== null && originUrl) {
+        // Fire and forget the upload to Supabase
         try {
+          // Standardize URLs to canonical form (https, no www)
+          const standardizedUrls = [
+            ...new Set(
+              visitedUrls.map((url) => {
+                return normalizeUrl(url);
+              }),
+            ),
+          ];
           // First check if entry exists for this origin URL
           const { data: existingMap } = await supabase_service
-          .from('crawl_maps')
-          .select('urls')
-          .eq('origin_url', sc.originUrl)
+            .from("crawl_maps")
+            .select("urls")
+            .eq("origin_url", originUrl)
             .single();
           if (existingMap) {
             // Merge URLs, removing duplicates
-          const mergedUrls = [...new Set([...existingMap.urls, ...visitedUrls])];
+            const mergedUrls = [
+              ...new Set([...existingMap.urls, ...standardizedUrls]),
+            ];
             const { error } = await supabase_service
-            .from('crawl_maps')
+              .from("crawl_maps")
               .update({
                 urls: mergedUrls,
                 num_urls: mergedUrls.length,
-              updated_at: new Date().toISOString()
+                updated_at: new Date().toISOString(),
               })
-            .eq('origin_url', sc.originUrl);
+              .eq("origin_url", originUrl);
             if (error) {
               _logger.error("Failed to update crawl map", { error });
             }
           } else {
             // Insert new entry if none exists
-          const { error } = await supabase_service
-            .from('crawl_maps')
-            .insert({
-              origin_url: sc.originUrl,
-              urls: visitedUrls,
-              num_urls: visitedUrls.length,
-              created_at: new Date().toISOString()
+            const { error } = await supabase_service.from("crawl_maps").insert({
+              origin_url: originUrl,
+              urls: standardizedUrls,
+              num_urls: standardizedUrls.length,
+              created_at: new Date().toISOString(),
+              updated_at: new Date().toISOString(),
             });
             if (error) {
@@ -126,6 +140,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
           _logger.error("Error saving crawl map", { error });
         }
       }
+    })();
 
     if (!job.data.v1) {
       const jobIDs = await getCrawlJobs(job.data.crawl_id);
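Taken together, the worker now canonicalizes the crawl's origin URL once, normalizes and de-duplicates every visited URL before it is written, and runs the whole Supabase upload inside an un-awaited async IIFE, so finishing a crawl no longer blocks on the crawl_maps write (the catch shown above turns failures into log entries instead of unhandled rejections). A standalone sketch of the merge step, assuming the same helper; mergeCrawlMapUrls is a hypothetical name used only for illustration and does not exist in the codebase:

import { normalizeUrl } from "./lib/canonical-url"; // assumed location of the new file

// Union the URLs already stored for an origin with the newly visited ones,
// normalizing the new ones so scheme/www/trailing-slash variants collapse.
function mergeCrawlMapUrls(existing: string[], visited: string[]): string[] {
  return [...new Set([...existing, ...visited.map((url) => normalizeUrl(url))])];
}

// mergeCrawlMapUrls(
//   ["example.com/docs"],
//   ["https://www.example.com/docs/", "https://example.com/blog"],
// )
// -> ["example.com/docs", "example.com/blog"]

As in the diff, only the incoming URLs are normalized; rows already stored in crawl_maps are merged as-is.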