mirror of
https://github.com/mendableai/firecrawl.git
synced 2025-12-29 16:16:46 +00:00
Fix: Concatenate metadata arrays into strings with exceptions (#1574)
* Fix: Concatenate metadata arrays into strings except for ogLocaleAlternate
  Co-Authored-By: Nicolas Camara <nicolascamara29@gmail.com>
* Fix: Only concatenate description field, preserve other metadata arrays
  Co-Authored-By: Nicolas Camara <nicolascamara29@gmail.com>
* Fix: Only concatenate description field, keep other metadata fields in original format
  Co-Authored-By: Nicolas Camara <nicolascamara29@gmail.com>
---------
Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Nicolas Camara <nicolascamara29@gmail.com>
This commit is contained in:
parent
f838190ba6
commit
a5a915d639
@ -128,17 +128,30 @@ pub unsafe extern "C" fn extract_metadata(html: *const libc::c_char) -> *mut lib
|
||||
if let Some(content) = attrs.get("content") {
|
||||
if let Some(v) = out.get(name) {
|
||||
match v {
|
||||
Value::String(_) => {
|
||||
if name != "title" { // preserve title tag in metadata
|
||||
out.insert(name.to_string(), Value::Array(vec! [v.clone(), Value::String(content.to_string())]));
|
||||
Value::String(existing) => {
|
||||
if name == "description" {
|
||||
out.insert(name.to_string(), Value::String(format!("{}, {}", existing, content)));
|
||||
} else if name != "title" { // preserve title tag in metadata
|
||||
out.insert(name.to_string(), Value::Array(vec! [Value::String(existing.clone()), Value::String(content.to_string())]));
|
||||
}
|
||||
},
|
||||
Value::Array(_) => {
|
||||
match out.get_mut(name) {
|
||||
Some(Value::Array(x)) => {
|
||||
x.push(Value::String(content.to_string()));
|
||||
},
|
||||
_ => unreachable!(),
|
||||
Value::Array(existing_array) => {
|
||||
if name == "description" {
|
||||
let mut values: Vec<String> = existing_array.iter()
|
||||
.filter_map(|v| match v {
|
||||
Value::String(s) => Some(s.clone()),
|
||||
_ => None,
|
||||
})
|
||||
.collect();
|
||||
values.push(content.to_string());
|
||||
out.insert(name.to_string(), Value::String(values.join(", ")));
|
||||
} else {
|
||||
match out.get_mut(name) {
|
||||
Some(Value::Array(x)) => {
|
||||
x.push(Value::String(content.to_string()));
|
||||
},
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
},
|
||||
_ => unreachable!(),
|
||||
|
||||
44
apps/api/src/__tests__/snips/metadata-concat.test.ts
Normal file
44
apps/api/src/__tests__/snips/metadata-concat.test.ts
Normal file
@ -0,0 +1,44 @@
|
||||
import { extractMetadata } from "../../scraper/scrapeURL/lib/extractMetadata";
|
||||
import { jest, describe, it, expect } from "@jest/globals";
|
||||
|
||||
// Regression test for duplicate <meta> tag handling in extractMetadata:
// repeated `description` tags must collapse into a single ", "-joined string,
// while repeated `og:locale:alternate` and `keywords` tags must stay arrays.
describe("Metadata concatenation", () => {
|
||||
it("should concatenate description field into a string while preserving arrays for other metadata fields", async () => {
|
||||
const html = `
|
||||
<html>
|
||||
<head>
|
||||
<meta name="description" content="First description">
|
||||
<meta name="description" content="Second description">
|
||||
<meta property="og:locale:alternate" content="en_US">
|
||||
<meta property="og:locale:alternate" content="fr_FR">
|
||||
<meta name="keywords" content="first keyword">
|
||||
<meta name="keywords" content="second keyword">
|
||||
</head>
|
||||
<body></body>
|
||||
</html>
|
||||
`;
|
||||
|
||||
// Hand-built stand-in for the first argument of extractMetadata: a url, an id,
// and a logger whose warn/error are jest mocks. Typed `any` so the partial
// object is accepted.
// NOTE(review): presumably these are the only fields extractMetadata reads — confirm.
const meta: any = {
|
||||
url: "https://example.com",
|
||||
id: "test-id",
|
||||
logger: {
|
||||
warn: jest.fn(),
|
||||
error: jest.fn()
|
||||
}
|
||||
};
|
||||
|
||||
const metadata = await extractMetadata(meta, html);
|
||||
|
||||
// description: the two tags are merged, in document order, into one string.
expect(metadata.description).toBeDefined();
|
||||
expect(Array.isArray(metadata.description)).toBe(false);
|
||||
expect(typeof metadata.description).toBe("string");
|
||||
expect(metadata.description).toBe("First description, Second description");
|
||||
|
||||
// og:locale:alternate (read back as `ogLocaleAlternate`): values stay an array.
expect(metadata.ogLocaleAlternate).toBeDefined();
|
||||
expect(Array.isArray(metadata.ogLocaleAlternate)).toBe(true);
|
||||
expect(metadata.ogLocaleAlternate).toEqual(["en_US", "fr_FR"]);
|
||||
|
||||
// keywords: duplicate tags are collected into an array, not joined.
expect(metadata.keywords).toBeDefined();
|
||||
expect(Array.isArray(metadata.keywords)).toBe(true);
|
||||
expect(metadata.keywords).toEqual(["first keyword", "second keyword"]);
|
||||
});
|
||||
});
|
||||
@ -137,12 +137,22 @@ export async function extractMetadata(
|
||||
const content = soup(elem).attr("content");
|
||||
|
||||
if (name && content) {
|
||||
if (customMetadata[name] === undefined) {
|
||||
customMetadata[name] = content;
|
||||
} else if (Array.isArray(customMetadata[name])) {
|
||||
(customMetadata[name] as string[]).push(content);
|
||||
if (name === "description") {
|
||||
if (customMetadata[name] === undefined) {
|
||||
customMetadata[name] = content;
|
||||
} else {
|
||||
customMetadata[name] = Array.isArray(customMetadata[name])
|
||||
? [...customMetadata[name] as string[], content].join(", ")
|
||||
: `${customMetadata[name]}, ${content}`;
|
||||
}
|
||||
} else {
|
||||
customMetadata[name] = [customMetadata[name] as string, content];
|
||||
if (customMetadata[name] === undefined) {
|
||||
customMetadata[name] = content;
|
||||
} else if (Array.isArray(customMetadata[name])) {
|
||||
(customMetadata[name] as string[]).push(content);
|
||||
} else {
|
||||
customMetadata[name] = [customMetadata[name] as string, content];
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user