From 9a5d40c3cf41bfb1b6c3c0290ab9f7c0755c4f4e Mon Sep 17 00:00:00 2001 From: Micah Stairs Date: Thu, 26 Jun 2025 13:16:42 -0400 Subject: [PATCH] Allow international URLs to pass validation (#1717) --- .../v1/__tests__/urlValidation.test.ts | 5 + apps/api/src/controllers/v1/types.ts | 122 +++++++++--------- 2 files changed, 66 insertions(+), 61 deletions(-) diff --git a/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts b/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts index afa44e58d..9fe1f02fe 100644 --- a/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts +++ b/apps/api/src/controllers/v1/__tests__/urlValidation.test.ts @@ -76,4 +76,9 @@ describe("URL Schema Validation", () => { it("should reject malformed URLs containing multiple 'http://'", () => { expect(() => url.parse("http://ex ample.com/")).toThrow("Invalid URL"); }); + + it("should accept URLs with international domain names", () => { + expect(() => url.parse("http://xn--1lqv92a901a.xn--ses554g/")).not.toThrow(); + }); + }); diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index ab4e39343..3b3a37d13 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -42,7 +42,7 @@ export const url = z.preprocess( if (!protocolIncluded(x as string)) { x = `http://${x}`; } - + // transforming the query parameters is breaking certain sites, so we're not doing it - mogery // try { // const urlObj = new URL(x as string); @@ -52,7 +52,7 @@ export const url = z.preprocess( // } // } catch (e) { // } - + return x; }, z @@ -61,7 +61,7 @@ export const url = z.preprocess( .regex(/^https?:\/\//, "URL uses unsupported protocol") .refine( (x) => - /\.[a-zA-Z\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]{2,}(:\d+)?([\/?#]|$)/i.test( + /\.[a-zA-Z0-9-\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]{2,}(:\d+)?([\/?#]|$)/i.test( x, ), "URL must have a valid top-level domain or be a valid path", @@ -74,7 +74,7 @@ export const url = z.preprocess( return false; } }, "Invalid URL") - // .refine((x) => !isUrlBlocked(x as string), BLOCKLISTED_URL_MESSAGE), + // .refine((x) => !isUrlBlocked(x as string), BLOCKLISTED_URL_MESSAGE), ); const strictMessage = @@ -823,11 +823,11 @@ export type ErrorResponse = { export type ScrapeResponse = | ErrorResponse | { - success: true; - warning?: string; - data: Document; - scrape_id?: string; - }; + success: true; + warning?: string; + data: Document; + scrape_id?: string; + }; export interface ScrapeResponseRequestTest { statusCode: number; @@ -878,27 +878,27 @@ export interface ExtractResponseRequestTest { export type CrawlResponse = | ErrorResponse | { - success: true; - id: string; - url: string; - }; + success: true; + id: string; + url: string; + }; export type BatchScrapeResponse = | ErrorResponse | { - success: true; - id: string; - url: string; - invalidURLs?: string[]; - }; + success: true; + id: string; + url: string; + invalidURLs?: string[]; + }; export type MapResponse = | ErrorResponse | { - success: true; - links: string[]; - scrape_id?: string; - }; + success: true; + links: string[]; + scrape_id?: string; + }; export type CrawlStatusParams = { jobId: string; @@ -911,47 +911,47 @@ export type ConcurrencyCheckParams = { export type ConcurrencyCheckResponse = | ErrorResponse | { - success: true; - concurrency: number; - maxConcurrency: number; - }; + success: true; + concurrency: number; + maxConcurrency: number; + }; export type CrawlStatusResponse = | ErrorResponse | { - success: true; - status: "scraping" | "completed" | "failed" | "cancelled"; - completed: number; - total: number; - creditsUsed: number; - expiresAt: string; - next?: string; - data: Document[]; - }; + success: true; + status: "scraping" | "completed" | "failed" | "cancelled"; + completed: number; + total: number; + creditsUsed: number; + expiresAt: string; + next?: string; + data: Document[]; + }; export type OngoingCrawlsResponse = | ErrorResponse | { - success: true; - crawls: { - id: string; - teamId: string; - url: string; - options: CrawlerOptions; - }[]; + success: true; + crawls: { + id: string; + teamId: string; + url: string; + options: CrawlerOptions; + }[]; }; - + export type CrawlErrorsResponse = | ErrorResponse | { - errors: { - id: string; - timestamp?: string; - url: string; - error: string; - }[]; - robotsBlocked: string[]; - }; + errors: { + id: string; + timestamp?: string; + url: string; + error: string; + }[]; + robotsBlocked: string[]; + }; type AuthObject = { team_id: string; @@ -1146,7 +1146,7 @@ export function fromLegacyScrapeOptions( ? ("screenshot@fullPage" as const) : null, extractorOptions !== undefined && - extractorOptions.mode.includes("llm-extraction") + extractorOptions.mode.includes("llm-extraction") ? ("extract" as const) : null, "links", @@ -1170,12 +1170,12 @@ export function fromLegacyScrapeOptions( removeBase64Images: pageOptions.removeBase64Images, extract: extractorOptions !== undefined && - extractorOptions.mode.includes("llm-extraction") + extractorOptions.mode.includes("llm-extraction") ? { - systemPrompt: extractorOptions.extractionPrompt, - prompt: extractorOptions.userPrompt, - schema: extractorOptions.extractionSchema, - } + systemPrompt: extractorOptions.extractionPrompt, + prompt: extractorOptions.userPrompt, + schema: extractorOptions.extractionSchema, + } : undefined, mobile: pageOptions.mobile, fastMode: pageOptions.useFastMode, @@ -1290,10 +1290,10 @@ export type SearchRequestInput = z.input; export type SearchResponse = | ErrorResponse | { - success: true; - warning?: string; - data: Document[]; - }; + success: true; + warning?: string; + data: Document[]; + }; export type TokenUsage = { promptTokens: number;