Allow international URLs to pass validation (#1717)

This commit is contained in:
Micah Stairs 2025-06-26 13:16:42 -04:00 committed by GitHub
parent 1919799bed
commit 9a5d40c3cf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 66 additions and 61 deletions

View File

@ -76,4 +76,9 @@ describe("URL Schema Validation", () => {
it("should reject malformed URLs containing multiple 'http://'", () => { it("should reject malformed URLs containing multiple 'http://'", () => {
expect(() => url.parse("http://ex ample.com/")).toThrow("Invalid URL"); expect(() => url.parse("http://ex ample.com/")).toThrow("Invalid URL");
}); });
it("should accept URLs with international domain names", () => {
expect(() => url.parse("http://xn--1lqv92a901a.xn--ses554g/")).not.toThrow();
});
}); });

View File

@ -42,7 +42,7 @@ export const url = z.preprocess(
if (!protocolIncluded(x as string)) { if (!protocolIncluded(x as string)) {
x = `http://${x}`; x = `http://${x}`;
} }
// transforming the query parameters is breaking certain sites, so we're not doing it - mogery // transforming the query parameters is breaking certain sites, so we're not doing it - mogery
// try { // try {
// const urlObj = new URL(x as string); // const urlObj = new URL(x as string);
@ -52,7 +52,7 @@ export const url = z.preprocess(
// } // }
// } catch (e) { // } catch (e) {
// } // }
return x; return x;
}, },
z z
@ -61,7 +61,7 @@ export const url = z.preprocess(
.regex(/^https?:\/\//, "URL uses unsupported protocol") .regex(/^https?:\/\//, "URL uses unsupported protocol")
.refine( .refine(
(x) => (x) =>
/\.[a-zA-Z\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]{2,}(:\d+)?([\/?#]|$)/i.test( /\.[a-zA-Z0-9-\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]{2,}(:\d+)?([\/?#]|$)/i.test(
x, x,
), ),
"URL must have a valid top-level domain or be a valid path", "URL must have a valid top-level domain or be a valid path",
@ -74,7 +74,7 @@ export const url = z.preprocess(
return false; return false;
} }
}, "Invalid URL") }, "Invalid URL")
// .refine((x) => !isUrlBlocked(x as string), BLOCKLISTED_URL_MESSAGE), // .refine((x) => !isUrlBlocked(x as string), BLOCKLISTED_URL_MESSAGE),
); );
const strictMessage = const strictMessage =
@ -823,11 +823,11 @@ export type ErrorResponse = {
export type ScrapeResponse = export type ScrapeResponse =
| ErrorResponse | ErrorResponse
| { | {
success: true; success: true;
warning?: string; warning?: string;
data: Document; data: Document;
scrape_id?: string; scrape_id?: string;
}; };
export interface ScrapeResponseRequestTest { export interface ScrapeResponseRequestTest {
statusCode: number; statusCode: number;
@ -878,27 +878,27 @@ export interface ExtractResponseRequestTest {
export type CrawlResponse = export type CrawlResponse =
| ErrorResponse | ErrorResponse
| { | {
success: true; success: true;
id: string; id: string;
url: string; url: string;
}; };
export type BatchScrapeResponse = export type BatchScrapeResponse =
| ErrorResponse | ErrorResponse
| { | {
success: true; success: true;
id: string; id: string;
url: string; url: string;
invalidURLs?: string[]; invalidURLs?: string[];
}; };
export type MapResponse = export type MapResponse =
| ErrorResponse | ErrorResponse
| { | {
success: true; success: true;
links: string[]; links: string[];
scrape_id?: string; scrape_id?: string;
}; };
export type CrawlStatusParams = { export type CrawlStatusParams = {
jobId: string; jobId: string;
@ -911,47 +911,47 @@ export type ConcurrencyCheckParams = {
export type ConcurrencyCheckResponse = export type ConcurrencyCheckResponse =
| ErrorResponse | ErrorResponse
| { | {
success: true; success: true;
concurrency: number; concurrency: number;
maxConcurrency: number; maxConcurrency: number;
}; };
export type CrawlStatusResponse = export type CrawlStatusResponse =
| ErrorResponse | ErrorResponse
| { | {
success: true; success: true;
status: "scraping" | "completed" | "failed" | "cancelled"; status: "scraping" | "completed" | "failed" | "cancelled";
completed: number; completed: number;
total: number; total: number;
creditsUsed: number; creditsUsed: number;
expiresAt: string; expiresAt: string;
next?: string; next?: string;
data: Document[]; data: Document[];
}; };
export type OngoingCrawlsResponse = export type OngoingCrawlsResponse =
| ErrorResponse | ErrorResponse
| { | {
success: true; success: true;
crawls: { crawls: {
id: string; id: string;
teamId: string; teamId: string;
url: string; url: string;
options: CrawlerOptions; options: CrawlerOptions;
}[]; }[];
}; };
export type CrawlErrorsResponse = export type CrawlErrorsResponse =
| ErrorResponse | ErrorResponse
| { | {
errors: { errors: {
id: string; id: string;
timestamp?: string; timestamp?: string;
url: string; url: string;
error: string; error: string;
}[]; }[];
robotsBlocked: string[]; robotsBlocked: string[];
}; };
type AuthObject = { type AuthObject = {
team_id: string; team_id: string;
@ -1146,7 +1146,7 @@ export function fromLegacyScrapeOptions(
? ("screenshot@fullPage" as const) ? ("screenshot@fullPage" as const)
: null, : null,
extractorOptions !== undefined && extractorOptions !== undefined &&
extractorOptions.mode.includes("llm-extraction") extractorOptions.mode.includes("llm-extraction")
? ("extract" as const) ? ("extract" as const)
: null, : null,
"links", "links",
@ -1170,12 +1170,12 @@ export function fromLegacyScrapeOptions(
removeBase64Images: pageOptions.removeBase64Images, removeBase64Images: pageOptions.removeBase64Images,
extract: extract:
extractorOptions !== undefined && extractorOptions !== undefined &&
extractorOptions.mode.includes("llm-extraction") extractorOptions.mode.includes("llm-extraction")
? { ? {
systemPrompt: extractorOptions.extractionPrompt, systemPrompt: extractorOptions.extractionPrompt,
prompt: extractorOptions.userPrompt, prompt: extractorOptions.userPrompt,
schema: extractorOptions.extractionSchema, schema: extractorOptions.extractionSchema,
} }
: undefined, : undefined,
mobile: pageOptions.mobile, mobile: pageOptions.mobile,
fastMode: pageOptions.useFastMode, fastMode: pageOptions.useFastMode,
@ -1290,10 +1290,10 @@ export type SearchRequestInput = z.input<typeof searchRequestSchema>;
export type SearchResponse = export type SearchResponse =
| ErrorResponse | ErrorResponse
| { | {
success: true; success: true;
warning?: string; warning?: string;
data: Document[]; data: Document[];
}; };
export type TokenUsage = { export type TokenUsage = {
promptTokens: number; promptTokens: number;