mirror of
https://github.com/mendableai/firecrawl.git
synced 2025-06-27 00:41:33 +00:00
Allow international URLs to pass validation (#1717)
This commit is contained in:
parent
1919799bed
commit
9a5d40c3cf
@ -76,4 +76,9 @@ describe("URL Schema Validation", () => {
|
|||||||
it("should reject malformed URLs containing multiple 'http://'", () => {
|
it("should reject malformed URLs containing multiple 'http://'", () => {
|
||||||
expect(() => url.parse("http://ex ample.com/")).toThrow("Invalid URL");
|
expect(() => url.parse("http://ex ample.com/")).toThrow("Invalid URL");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("should accept URLs with international domain names", () => {
|
||||||
|
expect(() => url.parse("http://xn--1lqv92a901a.xn--ses554g/")).not.toThrow();
|
||||||
|
});
|
||||||
|
|
||||||
});
|
});
|
||||||
|
@ -42,7 +42,7 @@ export const url = z.preprocess(
|
|||||||
if (!protocolIncluded(x as string)) {
|
if (!protocolIncluded(x as string)) {
|
||||||
x = `http://${x}`;
|
x = `http://${x}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
// transforming the query parameters is breaking certain sites, so we're not doing it - mogery
|
// transforming the query parameters is breaking certain sites, so we're not doing it - mogery
|
||||||
// try {
|
// try {
|
||||||
// const urlObj = new URL(x as string);
|
// const urlObj = new URL(x as string);
|
||||||
@ -52,7 +52,7 @@ export const url = z.preprocess(
|
|||||||
// }
|
// }
|
||||||
// } catch (e) {
|
// } catch (e) {
|
||||||
// }
|
// }
|
||||||
|
|
||||||
return x;
|
return x;
|
||||||
},
|
},
|
||||||
z
|
z
|
||||||
@ -61,7 +61,7 @@ export const url = z.preprocess(
|
|||||||
.regex(/^https?:\/\//, "URL uses unsupported protocol")
|
.regex(/^https?:\/\//, "URL uses unsupported protocol")
|
||||||
.refine(
|
.refine(
|
||||||
(x) =>
|
(x) =>
|
||||||
/\.[a-zA-Z\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]{2,}(:\d+)?([\/?#]|$)/i.test(
|
/\.[a-zA-Z0-9-\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]{2,}(:\d+)?([\/?#]|$)/i.test(
|
||||||
x,
|
x,
|
||||||
),
|
),
|
||||||
"URL must have a valid top-level domain or be a valid path",
|
"URL must have a valid top-level domain or be a valid path",
|
||||||
@ -74,7 +74,7 @@ export const url = z.preprocess(
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}, "Invalid URL")
|
}, "Invalid URL")
|
||||||
// .refine((x) => !isUrlBlocked(x as string), BLOCKLISTED_URL_MESSAGE),
|
// .refine((x) => !isUrlBlocked(x as string), BLOCKLISTED_URL_MESSAGE),
|
||||||
);
|
);
|
||||||
|
|
||||||
const strictMessage =
|
const strictMessage =
|
||||||
@ -823,11 +823,11 @@ export type ErrorResponse = {
|
|||||||
export type ScrapeResponse =
|
export type ScrapeResponse =
|
||||||
| ErrorResponse
|
| ErrorResponse
|
||||||
| {
|
| {
|
||||||
success: true;
|
success: true;
|
||||||
warning?: string;
|
warning?: string;
|
||||||
data: Document;
|
data: Document;
|
||||||
scrape_id?: string;
|
scrape_id?: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
export interface ScrapeResponseRequestTest {
|
export interface ScrapeResponseRequestTest {
|
||||||
statusCode: number;
|
statusCode: number;
|
||||||
@ -878,27 +878,27 @@ export interface ExtractResponseRequestTest {
|
|||||||
export type CrawlResponse =
|
export type CrawlResponse =
|
||||||
| ErrorResponse
|
| ErrorResponse
|
||||||
| {
|
| {
|
||||||
success: true;
|
success: true;
|
||||||
id: string;
|
id: string;
|
||||||
url: string;
|
url: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type BatchScrapeResponse =
|
export type BatchScrapeResponse =
|
||||||
| ErrorResponse
|
| ErrorResponse
|
||||||
| {
|
| {
|
||||||
success: true;
|
success: true;
|
||||||
id: string;
|
id: string;
|
||||||
url: string;
|
url: string;
|
||||||
invalidURLs?: string[];
|
invalidURLs?: string[];
|
||||||
};
|
};
|
||||||
|
|
||||||
export type MapResponse =
|
export type MapResponse =
|
||||||
| ErrorResponse
|
| ErrorResponse
|
||||||
| {
|
| {
|
||||||
success: true;
|
success: true;
|
||||||
links: string[];
|
links: string[];
|
||||||
scrape_id?: string;
|
scrape_id?: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type CrawlStatusParams = {
|
export type CrawlStatusParams = {
|
||||||
jobId: string;
|
jobId: string;
|
||||||
@ -911,47 +911,47 @@ export type ConcurrencyCheckParams = {
|
|||||||
export type ConcurrencyCheckResponse =
|
export type ConcurrencyCheckResponse =
|
||||||
| ErrorResponse
|
| ErrorResponse
|
||||||
| {
|
| {
|
||||||
success: true;
|
success: true;
|
||||||
concurrency: number;
|
concurrency: number;
|
||||||
maxConcurrency: number;
|
maxConcurrency: number;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type CrawlStatusResponse =
|
export type CrawlStatusResponse =
|
||||||
| ErrorResponse
|
| ErrorResponse
|
||||||
| {
|
| {
|
||||||
success: true;
|
success: true;
|
||||||
status: "scraping" | "completed" | "failed" | "cancelled";
|
status: "scraping" | "completed" | "failed" | "cancelled";
|
||||||
completed: number;
|
completed: number;
|
||||||
total: number;
|
total: number;
|
||||||
creditsUsed: number;
|
creditsUsed: number;
|
||||||
expiresAt: string;
|
expiresAt: string;
|
||||||
next?: string;
|
next?: string;
|
||||||
data: Document[];
|
data: Document[];
|
||||||
};
|
};
|
||||||
|
|
||||||
export type OngoingCrawlsResponse =
|
export type OngoingCrawlsResponse =
|
||||||
| ErrorResponse
|
| ErrorResponse
|
||||||
| {
|
| {
|
||||||
success: true;
|
success: true;
|
||||||
crawls: {
|
crawls: {
|
||||||
id: string;
|
id: string;
|
||||||
teamId: string;
|
teamId: string;
|
||||||
url: string;
|
url: string;
|
||||||
options: CrawlerOptions;
|
options: CrawlerOptions;
|
||||||
}[];
|
}[];
|
||||||
};
|
};
|
||||||
|
|
||||||
export type CrawlErrorsResponse =
|
export type CrawlErrorsResponse =
|
||||||
| ErrorResponse
|
| ErrorResponse
|
||||||
| {
|
| {
|
||||||
errors: {
|
errors: {
|
||||||
id: string;
|
id: string;
|
||||||
timestamp?: string;
|
timestamp?: string;
|
||||||
url: string;
|
url: string;
|
||||||
error: string;
|
error: string;
|
||||||
}[];
|
}[];
|
||||||
robotsBlocked: string[];
|
robotsBlocked: string[];
|
||||||
};
|
};
|
||||||
|
|
||||||
type AuthObject = {
|
type AuthObject = {
|
||||||
team_id: string;
|
team_id: string;
|
||||||
@ -1146,7 +1146,7 @@ export function fromLegacyScrapeOptions(
|
|||||||
? ("screenshot@fullPage" as const)
|
? ("screenshot@fullPage" as const)
|
||||||
: null,
|
: null,
|
||||||
extractorOptions !== undefined &&
|
extractorOptions !== undefined &&
|
||||||
extractorOptions.mode.includes("llm-extraction")
|
extractorOptions.mode.includes("llm-extraction")
|
||||||
? ("extract" as const)
|
? ("extract" as const)
|
||||||
: null,
|
: null,
|
||||||
"links",
|
"links",
|
||||||
@ -1170,12 +1170,12 @@ export function fromLegacyScrapeOptions(
|
|||||||
removeBase64Images: pageOptions.removeBase64Images,
|
removeBase64Images: pageOptions.removeBase64Images,
|
||||||
extract:
|
extract:
|
||||||
extractorOptions !== undefined &&
|
extractorOptions !== undefined &&
|
||||||
extractorOptions.mode.includes("llm-extraction")
|
extractorOptions.mode.includes("llm-extraction")
|
||||||
? {
|
? {
|
||||||
systemPrompt: extractorOptions.extractionPrompt,
|
systemPrompt: extractorOptions.extractionPrompt,
|
||||||
prompt: extractorOptions.userPrompt,
|
prompt: extractorOptions.userPrompt,
|
||||||
schema: extractorOptions.extractionSchema,
|
schema: extractorOptions.extractionSchema,
|
||||||
}
|
}
|
||||||
: undefined,
|
: undefined,
|
||||||
mobile: pageOptions.mobile,
|
mobile: pageOptions.mobile,
|
||||||
fastMode: pageOptions.useFastMode,
|
fastMode: pageOptions.useFastMode,
|
||||||
@ -1290,10 +1290,10 @@ export type SearchRequestInput = z.input<typeof searchRequestSchema>;
|
|||||||
export type SearchResponse =
|
export type SearchResponse =
|
||||||
| ErrorResponse
|
| ErrorResponse
|
||||||
| {
|
| {
|
||||||
success: true;
|
success: true;
|
||||||
warning?: string;
|
warning?: string;
|
||||||
data: Document[];
|
data: Document[];
|
||||||
};
|
};
|
||||||
|
|
||||||
export type TokenUsage = {
|
export type TokenUsage = {
|
||||||
promptTokens: number;
|
promptTokens: number;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user