feat(crawl): ensure url trimming

This commit is contained in:
Móricz Gergő 2025-01-08 12:35:42 +01:00
parent 977a3e13c5
commit 363021ea78
2 changed files with 8 additions and 7 deletions

View File

@@ -60,7 +60,7 @@ export async function getLinksFromSitemap(
// Handle sitemap index files
const sitemapUrls = root.sitemap
.filter((sitemap) => sitemap.loc && sitemap.loc.length > 0)
.map((sitemap) => sitemap.loc[0]);
.map((sitemap) => sitemap.loc[0].trim());
const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
getLinksFromSitemap(
@@ -78,9 +78,9 @@ export async function getLinksFromSitemap(
(url) =>
url.loc &&
url.loc.length > 0 &&
url.loc[0].toLowerCase().endsWith('.xml')
url.loc[0].trim().toLowerCase().endsWith('.xml')
)
.map((url) => url.loc[0]);
.map((url) => url.loc[0].trim());
if (xmlSitemaps.length > 0) {
// Recursively fetch links from additional sitemaps
@@ -98,10 +98,10 @@ export async function getLinksFromSitemap(
(url) =>
url.loc &&
url.loc.length > 0 &&
!url.loc[0].toLowerCase().endsWith('.xml') &&
!WebCrawler.prototype.isFile(url.loc[0]),
!url.loc[0].trim().toLowerCase().endsWith('.xml') &&
!WebCrawler.prototype.isFile(url.loc[0].trim()),
)
.map((url) => url.loc[0]);
.map((url) => url.loc[0].trim());
count += validUrls.length;
const h = urlsHandler(validUrls);

View File

@@ -7,8 +7,9 @@ export function extractLinks(html: string, baseUrl: string): string[] {
const links: string[] = [];
$("a").each((_, element) => {
const href = $(element).attr("href");
let href = $(element).attr("href");
if (href) {
href = href.trim();
try {
if (href.startsWith("http://") || href.startsWith("https://")) {
// Absolute URL, add as is