Fix: Concatenate metadata arrays into strings with exceptions (#1574)

* Fix: Concatenate metadata arrays into strings except for ogLocaleAlternate

Co-Authored-By: Nicolas Camara <nicolascamara29@gmail.com>

* Fix: Only concatenate description field, preserve other metadata arrays

Co-Authored-By: Nicolas Camara <nicolascamara29@gmail.com>

* Fix: Only concatenate description field, keep other metadata fields in original format

Co-Authored-By: Nicolas Camara <nicolascamara29@gmail.com>

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Nicolas Camara <nicolascamara29@gmail.com>
This commit is contained in:
devin-ai-integration[bot] 2025-05-20 12:40:53 -03:00 committed by GitHub
parent f838190ba6
commit a5a915d639
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 81 additions and 14 deletions

View File

@ -128,17 +128,30 @@ pub unsafe extern "C" fn extract_metadata(html: *const libc::c_char) -> *mut lib
if let Some(content) = attrs.get("content") {
if let Some(v) = out.get(name) {
match v {
Value::String(_) => {
if name != "title" { // preserve title tag in metadata
out.insert(name.to_string(), Value::Array(vec! [v.clone(), Value::String(content.to_string())]));
Value::String(existing) => {
if name == "description" {
out.insert(name.to_string(), Value::String(format!("{}, {}", existing, content)));
} else if name != "title" { // preserve title tag in metadata
out.insert(name.to_string(), Value::Array(vec! [Value::String(existing.clone()), Value::String(content.to_string())]));
}
},
Value::Array(_) => {
match out.get_mut(name) {
Some(Value::Array(x)) => {
x.push(Value::String(content.to_string()));
},
_ => unreachable!(),
Value::Array(existing_array) => {
if name == "description" {
let mut values: Vec<String> = existing_array.iter()
.filter_map(|v| match v {
Value::String(s) => Some(s.clone()),
_ => None,
})
.collect();
values.push(content.to_string());
out.insert(name.to_string(), Value::String(values.join(", ")));
} else {
match out.get_mut(name) {
Some(Value::Array(x)) => {
x.push(Value::String(content.to_string()));
},
_ => unreachable!(),
}
}
},
_ => unreachable!(),

View File

@ -0,0 +1,44 @@
import { extractMetadata } from "../../scraper/scrapeURL/lib/extractMetadata";
import { jest, describe, it, expect } from "@jest/globals";
/**
 * Checks how extractMetadata reports <meta> tags that repeat within one document:
 * repeated "description" values are expected as a single ", "-joined string, while
 * repeated "og:locale:alternate" and "keywords" values are expected as arrays.
 */
describe("Metadata concatenation", () => {
  // Asserts that a field is present, is an array, and holds exactly `expected`.
  const expectArrayField = (actual: unknown, expected: string[]): void => {
    expect(actual).toBeDefined();
    expect(Array.isArray(actual)).toBe(true);
    expect(actual).toEqual(expected);
  };

  it("should concatenate description field into a string while preserving arrays for other metadata fields", async () => {
    // Fixture: each meta tag of interest appears exactly twice.
    const html = `
<html>
<head>
<meta name="description" content="First description">
<meta name="description" content="Second description">
<meta property="og:locale:alternate" content="en_US">
<meta property="og:locale:alternate" content="fr_FR">
<meta name="keywords" content="first keyword">
<meta name="keywords" content="second keyword">
</head>
<body></body>
</html>
`;

    // Stubbed logger so warn/error calls made during extraction are swallowed.
    const logger = {
      warn: jest.fn(),
      error: jest.fn(),
    };
    const scrapeMeta: any = {
      url: "https://example.com",
      id: "test-id",
      logger,
    };

    const extracted = await extractMetadata(scrapeMeta, html);
    const { description, ogLocaleAlternate, keywords } = extracted;

    // description: must be one plain string, joined with ", ".
    expect(description).toBeDefined();
    expect(Array.isArray(description)).toBe(false);
    expect(typeof description).toBe("string");
    expect(description).toBe("First description, Second description");

    // Every other repeated field keeps its values as an array.
    expectArrayField(ogLocaleAlternate, ["en_US", "fr_FR"]);
    expectArrayField(keywords, ["first keyword", "second keyword"]);
  });
});

View File

@ -137,12 +137,22 @@ export async function extractMetadata(
const content = soup(elem).attr("content");
if (name && content) {
if (customMetadata[name] === undefined) {
customMetadata[name] = content;
} else if (Array.isArray(customMetadata[name])) {
(customMetadata[name] as string[]).push(content);
if (name === "description") {
if (customMetadata[name] === undefined) {
customMetadata[name] = content;
} else {
customMetadata[name] = Array.isArray(customMetadata[name])
? [...customMetadata[name] as string[], content].join(", ")
: `${customMetadata[name]}, ${content}`;
}
} else {
customMetadata[name] = [customMetadata[name] as string, content];
if (customMetadata[name] === undefined) {
customMetadata[name] = content;
} else if (Array.isArray(customMetadata[name])) {
(customMetadata[name] as string[]).push(content);
} else {
customMetadata[name] = [customMetadata[name] as string, content];
}
}
}
} catch (error) {