Merge pull request #869 from mendableai/fix/new-url-on-utils-extract-links

[BUG] Added try/catch and removed redundancy
This commit is contained in:
Nicolas 2024-11-05 11:26:26 -05:00 committed by GitHub
commit ae5ba74e2d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -29,27 +29,28 @@ export function extractLinks(html: string, baseUrl: string): string[] {
const $ = cheerio.load(html); const $ = cheerio.load(html);
const links: string[] = []; const links: string[] = [];
// Parse the base URL to get the origin
const urlObject = new URL(baseUrl);
const origin = urlObject.origin;
$('a').each((_, element) => { $('a').each((_, element) => {
const href = $(element).attr('href'); const href = $(element).attr('href');
if (href) { if (href) {
if (href.startsWith('http://') || href.startsWith('https://')) { try {
// Absolute URL, add as is if (href.startsWith('http://') || href.startsWith('https://')) {
links.push(href); // Absolute URL, add as is
} else if (href.startsWith('/')) { links.push(href);
// Relative URL starting with '/', append to origin } else if (href.startsWith('/')) {
links.push(new URL(href, baseUrl).href); // Relative URL starting with '/', append to base URL
} else if (!href.startsWith('#') && !href.startsWith('mailto:')) { links.push(new URL(href, baseUrl).href);
// Relative URL not starting with '/', append to base URL } else if (!href.startsWith('#') && !href.startsWith('mailto:')) {
links.push(new URL(href, baseUrl).href); // Relative URL not starting with '/', append to base URL
} else if (href.startsWith('mailto:')) { links.push(new URL(href, baseUrl).href);
// mailto: links, add as is } else if (href.startsWith('mailto:')) {
links.push(href); // mailto: links, add as is
links.push(href);
}
// Fragment-only links (#) are ignored
} catch (error) {
// Log the error and continue
console.error(`Failed to construct URL for href: ${href} with base: ${baseUrl}`, error);
} }
// Fragment-only links (#) are ignored
} }
}); });