firecrawl/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts
Gergő Móricz 5e760aacbb fixes
2025-06-20 11:28:22 +02:00

204 lines
7.3 KiB
TypeScript

import { load } from "cheerio"; // rustified
import { Document } from "../../../controllers/v1/types";
import { Meta } from "..";
import { extractMetadata as _extractMetadata } from "../../../lib/html-transformer";
/**
 * Extracts page metadata through the html-transformer binding
 * (`_extractMetadata`) and post-processes the result.
 *
 * - A truthy `favicon` reported by the transformer is resolved against
 *   `meta.rewrittenUrl ?? meta.url` and returned as an absolute URL string.
 * - `scrapeId` is always set to `meta.id`, overriding any value of the same
 *   name coming from the transformer output.
 *
 * @param meta - Scrape context; `id`, `url`, `rewrittenUrl` and `logger` are read.
 * @param html - Raw HTML handed to the transformer.
 * @returns The transformer's metadata with `favicon` and `scrapeId` adjusted.
 * @throws Rejects when `_extractMetadata` rejects (callers may fall back to
 *   another extractor).
 */
export async function extractMetadataRust(
  meta: Meta,
  html: string,
): Promise<Partial<Document["metadata"]>> {
  const fromRust = await _extractMetadata(html);

  const metadata: Partial<Document["metadata"]> = {
    ...fromRust,
    scrapeId: meta.id,
  };

  if (fromRust.favicon) {
    try {
      // Use `.href` so callers get a plain string (as the cheerio fallback
      // produces) instead of a `URL` instance.
      metadata.favicon = new URL(
        fromRust.favicon,
        meta.rewrittenUrl ?? meta.url,
      ).href;
    } catch (error) {
      // A malformed favicon link must not discard the rest of the metadata:
      // drop only the favicon and keep everything else.
      delete metadata.favicon;
      meta.logger.warn("Failed to resolve favicon URL, omitting it", {
        error,
        favicon: fromRust.favicon,
        module: "scrapeURL", method: "extractMetadataRust"
      });
    }
  }

  return metadata;
}
/**
 * Records one `<meta>` name/content pair in `store`.
 *
 * - First occurrence of a name: stored as a plain string.
 * - Repeated `description`: contents are joined into one string with `", "`.
 * - Any other repeated name: contents are collected into an array.
 */
function recordCustomMetadata(
  store: Record<string, string | string[]>,
  name: string,
  content: string,
): void {
  // Only own properties count as "already seen". Without this check, names
  // such as "constructor" or "toString" would read members inherited from
  // Object.prototype and end up stored next to a function value.
  const existing = Object.prototype.hasOwnProperty.call(store, name)
    ? store[name]
    : undefined;

  if (existing === undefined) {
    store[name] = content;
  } else if (name === "description") {
    store[name] = Array.isArray(existing)
      ? [...existing, content].join(", ")
      : `${existing}, ${content}`;
  } else if (Array.isArray(existing)) {
    existing.push(content);
  } else {
    store[name] = [existing, content];
  }
}

/**
 * Extracts metadata (title, description, favicon, Open Graph, Dublin Core,
 * article fields and every other `<meta>` tag) from an HTML document.
 *
 * `extractMetadataRust` is tried first; if it throws, a warning is logged and
 * the same extraction is performed with cheerio. The cheerio path is
 * best-effort: extraction errors are logged and whatever was collected up to
 * that point is returned.
 *
 * Entries collected from `<meta>` tags are spread over the named fields, so a
 * tag such as `<meta name="description">` determines the final `description`.
 * `scrapeId` is applied last and always equals `meta.id`.
 *
 * @param meta - Scrape context; `id`, `url`, `rewrittenUrl` and `logger` are read.
 * @param html - Raw HTML of the page.
 * @returns The extracted metadata; fields that were not found are `undefined`.
 */
export async function extractMetadata(
  meta: Meta,
  html: string,
): Promise<Partial<Document["metadata"]>> {
  try {
    return await extractMetadataRust(meta, html);
  } catch (error) {
    meta.logger.warn("Failed to call html-transformer! Falling back to cheerio...", {
      error,
      module: "scrapeURL", method: "extractMetadata"
    });
  }

  let title: string | undefined;
  let description: string | undefined;
  let favicon: string | undefined;
  let language: string | undefined;
  let keywords: string | undefined;
  let robots: string | undefined;
  let ogTitle: string | undefined;
  let ogDescription: string | undefined;
  let ogUrl: string | undefined;
  let ogImage: string | undefined;
  let ogAudio: string | undefined;
  let ogDeterminer: string | undefined;
  let ogLocale: string | undefined;
  let ogLocaleAlternate: string[] | undefined;
  let ogSiteName: string | undefined;
  let ogVideo: string | undefined;
  let dcTermsCreated: string | undefined;
  let dcDateCreated: string | undefined;
  let dcDate: string | undefined;
  let dcTermsType: string | undefined;
  let dcType: string | undefined;
  let dcTermsAudience: string | undefined;
  let dcTermsSubject: string | undefined;
  let dcSubject: string | undefined;
  let dcDescription: string | undefined;
  let dcTermsKeywords: string | undefined;
  let modifiedTime: string | undefined;
  let publishedTime: string | undefined;
  let articleTag: string | undefined;
  let articleSection: string | undefined;
  const customMetadata: Record<string, string | string[]> = {};

  const soup = load(html);

  // `content` attribute of the first element matching `selector`;
  // a missing or empty attribute yields `undefined`.
  const metaContent = (selector: string): string | undefined =>
    soup(selector).attr("content") || undefined;

  try {
    title = soup("title").first().text().trim() || undefined;
    description = metaContent('meta[name="description"]');

    // Prefer an exact rel="icon" link, then any rel containing "icon"
    // (e.g. "shortcut icon", "apple-touch-icon").
    const faviconLink =
      soup('link[rel="icon"]').attr("href") ||
      soup('link[rel*="icon"]').first().attr("href") ||
      undefined;
    if (faviconLink) {
      try {
        // Resolve against the page URL so that relative ("favicon.ico"),
        // root-relative ("/favicon.ico"), protocol-relative ("//cdn/x.ico")
        // and absolute links all produce a valid absolute URL.
        favicon = new URL(faviconLink, meta.rewrittenUrl ?? meta.url).href;
      } catch (error) {
        // An unresolvable favicon must not abort the remaining extraction.
        meta.logger.warn("Failed to resolve favicon URL, omitting it", {
          error,
          favicon: faviconLink,
          module: "scrapeURL", method: "extractMetadata"
        });
      }
    }

    // The document language is read from the `lang` attribute of <html>.
    language = soup("html").attr("lang") || undefined;
    keywords = metaContent('meta[name="keywords"]');
    robots = metaContent('meta[name="robots"]');

    ogTitle = metaContent('meta[property="og:title"]');
    ogDescription = metaContent('meta[property="og:description"]');
    ogUrl = metaContent('meta[property="og:url"]');
    ogImage = metaContent('meta[property="og:image"]');
    ogAudio = metaContent('meta[property="og:audio"]');
    ogDeterminer = metaContent('meta[property="og:determiner"]');
    ogLocale = metaContent('meta[property="og:locale"]');
    // `.get()` always returns an array, so this is `[]` (never `undefined`)
    // when the page declares no alternate locales.
    ogLocaleAlternate = soup('meta[property="og:locale:alternate"]')
      .map((i, el) => soup(el).attr("content"))
      .get();
    ogSiteName = metaContent('meta[property="og:site_name"]');
    ogVideo = metaContent('meta[property="og:video"]');

    articleSection = metaContent('meta[name="article:section"]');
    articleTag = metaContent('meta[name="article:tag"]');
    publishedTime = metaContent('meta[property="article:published_time"]');
    modifiedTime = metaContent('meta[property="article:modified_time"]');

    dcTermsKeywords = metaContent('meta[name="dcterms.keywords"]');
    dcDescription = metaContent('meta[name="dc.description"]');
    dcSubject = metaContent('meta[name="dc.subject"]');
    dcTermsSubject = metaContent('meta[name="dcterms.subject"]');
    dcTermsAudience = metaContent('meta[name="dcterms.audience"]');
    dcType = metaContent('meta[name="dc.type"]');
    dcTermsType = metaContent('meta[name="dcterms.type"]');
    dcDate = metaContent('meta[name="dc.date"]');
    dcDateCreated = metaContent('meta[name="dc.date.created"]');
    dcTermsCreated = metaContent('meta[name="dcterms.created"]');

    try {
      // Extract all meta tags for custom metadata
      soup("meta").each((i, elem) => {
        try {
          const tag = soup(elem);
          const name =
            tag.attr("name") || tag.attr("property") || tag.attr("itemprop");
          const content = tag.attr("content");
          if (name && content) {
            recordCustomMetadata(customMetadata, name, content);
          }
        } catch (error) {
          meta.logger.error(`Error extracting custom metadata (in)`, { error });
        }
      });
    } catch (error) {
      meta.logger.error(`Error extracting custom metadata`, { error });
    }
  } catch (error) {
    meta.logger.error(`Error extracting metadata`, { error });
  }

  return {
    title,
    description,
    favicon,
    language,
    keywords,
    robots,
    ogTitle,
    ogDescription,
    ogUrl,
    ogImage,
    ogAudio,
    ogDeterminer,
    ogLocale,
    ogLocaleAlternate,
    ogSiteName,
    ogVideo,
    dcTermsCreated,
    dcDateCreated,
    dcDate,
    dcTermsType,
    dcType,
    dcTermsAudience,
    dcTermsSubject,
    dcSubject,
    dcDescription,
    dcTermsKeywords,
    modifiedTime,
    publishedTime,
    articleTag,
    articleSection,
    ...customMetadata,
    // Applied after the page-supplied entries so that a
    // `<meta name="scrapeId">` tag cannot overwrite the real scrape id.
    scrapeId: meta.id,
  };
}