mirror of
https://github.com/mendableai/firecrawl.git
synced 2025-12-30 00:37:31 +00:00
feat(crawl): ensure url trimming
This commit is contained in:
parent
977a3e13c5
commit
363021ea78
@ -60,7 +60,7 @@ export async function getLinksFromSitemap(
|
||||
// Handle sitemap index files
|
||||
const sitemapUrls = root.sitemap
|
||||
.filter((sitemap) => sitemap.loc && sitemap.loc.length > 0)
|
||||
.map((sitemap) => sitemap.loc[0]);
|
||||
.map((sitemap) => sitemap.loc[0].trim());
|
||||
|
||||
const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
|
||||
getLinksFromSitemap(
|
||||
@ -78,9 +78,9 @@ export async function getLinksFromSitemap(
|
||||
(url) =>
|
||||
url.loc &&
|
||||
url.loc.length > 0 &&
|
||||
url.loc[0].toLowerCase().endsWith('.xml')
|
||||
url.loc[0].trim().toLowerCase().endsWith('.xml')
|
||||
)
|
||||
.map((url) => url.loc[0]);
|
||||
.map((url) => url.loc[0].trim());
|
||||
|
||||
if (xmlSitemaps.length > 0) {
|
||||
// Recursively fetch links from additional sitemaps
|
||||
@ -98,10 +98,10 @@ export async function getLinksFromSitemap(
|
||||
(url) =>
|
||||
url.loc &&
|
||||
url.loc.length > 0 &&
|
||||
!url.loc[0].toLowerCase().endsWith('.xml') &&
|
||||
!WebCrawler.prototype.isFile(url.loc[0]),
|
||||
!url.loc[0].trim().toLowerCase().endsWith('.xml') &&
|
||||
!WebCrawler.prototype.isFile(url.loc[0].trim()),
|
||||
)
|
||||
.map((url) => url.loc[0]);
|
||||
.map((url) => url.loc[0].trim());
|
||||
count += validUrls.length;
|
||||
|
||||
const h = urlsHandler(validUrls);
|
||||
|
||||
@ -7,8 +7,9 @@ export function extractLinks(html: string, baseUrl: string): string[] {
|
||||
const links: string[] = [];
|
||||
|
||||
$("a").each((_, element) => {
|
||||
const href = $(element).attr("href");
|
||||
let href = $(element).attr("href");
|
||||
if (href) {
|
||||
href = href.trim();
|
||||
try {
|
||||
if (href.startsWith("http://") || href.startsWith("https://")) {
|
||||
// Absolute URL, add as is
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user