// Mirror of https://github.com/mendableai/firecrawl.git (synced 2026-02-06 23:39:32 +00:00).
// 361 lines, 12 KiB, Rust.
use std::{collections::HashMap, ffi::{CStr, CString}};

use kuchikiki::{parse_html, traits::TendrilSink};
use serde::Deserialize;
use serde_json::Value;
use url::Url;

/// Extracts links from HTML
|
|
///
|
|
/// # Safety
|
|
/// Input options must be a C HTML string. Output will be a JSON string array. Output string must be freed with free_string.
|
|
#[no_mangle]
|
|
pub unsafe extern "C" fn extract_links(html: *const libc::c_char) -> *mut i8 {
|
|
let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();
|
|
|
|
let document = parse_html().one(html);
|
|
|
|
let mut out: Vec<String> = Vec::new();
|
|
|
|
let anchors: Vec<_> = document.select("a[href]").unwrap().collect();
|
|
for anchor in anchors {
|
|
let mut href = anchor.attributes.borrow().get("href").unwrap().to_string();
|
|
|
|
if href.starts_with("http:/") && !href.starts_with("http://") {
|
|
href = format!("http://{}", &href[6..]);
|
|
} else if href.starts_with("https:/") && !href.starts_with("https://") {
|
|
href = format!("https://{}", &href[7..]);
|
|
}
|
|
|
|
out.push(href);
|
|
}
|
|
|
|
CString::new(serde_json::ser::to_string(&out).unwrap()).unwrap().into_raw()
|
|
}
|
|
|
|
/// Looks up the first `<meta name="$meta_name">` element in `$document` and, when that element
/// has a `content` attribute, stores the content as a `Value::String` in `$out` under `$out_name`.
///
/// Only the first matching element is considered: if it has no `content` attribute nothing is
/// inserted, even when later matches do have one.
macro_rules! insert_meta_name {
    ($out:ident, $document:ident, $meta_name:expr, $out_name:expr) => {{
        let selector = format!("meta[name=\"{}\"]", $meta_name);
        let content = $document
            .select(&selector)
            .unwrap()
            .next()
            .and_then(|meta| meta.attributes.borrow().get("content").map(|value| value.to_string()));

        if let Some(content) = content {
            $out.insert(($out_name).to_string(), Value::String(content));
        }
    }};
}

/// Looks up the first `<meta property="$meta_property">` element in `$document` and, when that
/// element has a `content` attribute, stores the content as a `Value::String` in `$out` under
/// `$out_name`.
///
/// Only the first matching element is considered: if it has no `content` attribute nothing is
/// inserted, even when later matches do have one.
macro_rules! insert_meta_property {
    ($out:ident, $document:ident, $meta_property:expr, $out_name:expr) => {{
        let selector = format!("meta[property=\"{}\"]", $meta_property);
        let content = $document
            .select(&selector)
            .unwrap()
            .next()
            .and_then(|meta| meta.attributes.borrow().get("content").map(|value| value.to_string()));

        if let Some(content) = content {
            $out.insert(($out_name).to_string(), Value::String(content));
        }
    }};
}

/// Extracts metadata from HTML
|
|
///
|
|
/// # Safety
|
|
/// Input options must be a C HTML string. Output will be a JSON object. Output string must be freed with free_string.
|
|
#[no_mangle]
|
|
pub unsafe extern "C" fn extract_metadata(html: *const libc::c_char) -> *mut i8 {
|
|
let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();
|
|
|
|
let document = parse_html().one(html);
|
|
let mut out = HashMap::<String, Value>::new();
|
|
|
|
if let Some(title) = document.select("title").unwrap().next() {
|
|
out.insert("title".to_string(), Value::String(title.text_contents()));
|
|
}
|
|
// insert_meta_name!(out, document, "description", "description");
|
|
|
|
if let Some(favicon_link) = document.select("link[rel=\"icon\"]").unwrap().next()
|
|
.and_then(|x| x.attributes.borrow().get("href").map(|x| x.to_string()))
|
|
.or_else(|| document.select("link[rel*=\"icon\"]").unwrap().next()
|
|
.and_then(|x| x.attributes.borrow().get("href").map(|x| x.to_string()))) {
|
|
out.insert("favicon".to_string(), Value::String(favicon_link));
|
|
}
|
|
|
|
if let Some(lang) = document.select("html[lang]").unwrap().next().and_then(|x| x.attributes.borrow().get("lang").map(|x| x.to_string())) {
|
|
out.insert("language".to_string(), Value::String(lang));
|
|
}
|
|
|
|
// insert_meta_name!(out, document, "keywords", "keywords");
|
|
// insert_meta_name!(out, document, "robots", "robots");
|
|
insert_meta_property!(out, document, "og:title", "ogTitle");
|
|
insert_meta_property!(out, document, "og:description", "ogDescription");
|
|
insert_meta_property!(out, document, "og:url", "ogUrl");
|
|
insert_meta_property!(out, document, "og:image", "ogImage");
|
|
insert_meta_property!(out, document, "og:audio", "ogAudio");
|
|
insert_meta_property!(out, document, "og:determiner", "ogDeterminer");
|
|
insert_meta_property!(out, document, "og:locale", "ogLocale");
|
|
|
|
for meta in document.select("meta[property=\"og:locale:alternate\"]").unwrap() {
|
|
let attrs = meta.attributes.borrow();
|
|
|
|
if let Some(content) = attrs.get("content") {
|
|
if let Some(v) = out.get_mut("og:locale:alternate") {
|
|
match v {
|
|
Value::Array(x) => {
|
|
x.push(Value::String(content.to_string()));
|
|
},
|
|
_ => unreachable!(),
|
|
}
|
|
} else {
|
|
out.insert("og:locale:alternate".to_string(), Value::Array(vec! [Value::String(content.to_string())]));
|
|
}
|
|
}
|
|
}
|
|
|
|
insert_meta_property!(out, document, "og:site_name", "ogSiteName");
|
|
insert_meta_property!(out, document, "og:video", "ogVideo");
|
|
insert_meta_name!(out, document, "article:section", "articleSection");
|
|
insert_meta_name!(out, document, "article:tag", "articleTag");
|
|
insert_meta_property!(out, document, "article:published_time", "publishedTime");
|
|
insert_meta_property!(out, document, "article:modified_time", "modifiedTime");
|
|
insert_meta_name!(out, document, "dcterms.keywords", "dcTermsKeywords");
|
|
insert_meta_name!(out, document, "dc.description", "dcDescription");
|
|
insert_meta_name!(out, document, "dc.subject", "dcSubject");
|
|
insert_meta_name!(out, document, "dcterms.subject", "dcTermsSubject");
|
|
insert_meta_name!(out, document, "dcterms.audience", "dcTermsAudience");
|
|
insert_meta_name!(out, document, "dc.type", "dcType");
|
|
insert_meta_name!(out, document, "dcterms.type", "dcTermsType");
|
|
insert_meta_name!(out, document, "dc.date", "dcDate");
|
|
insert_meta_name!(out, document, "dc.date.created", "dcDateCreated");
|
|
insert_meta_name!(out, document, "dcterms.created", "dcTermsCreated");
|
|
|
|
for meta in document.select("meta").unwrap() {
|
|
let meta = meta.as_node().as_element().unwrap();
|
|
let attrs = meta.attributes.borrow();
|
|
|
|
if let Some(name) = attrs.get("name").or_else(|| attrs.get("property")) {
|
|
if let Some(content) = attrs.get("content") {
|
|
if let Some(v) = out.get(name) {
|
|
match v {
|
|
Value::String(_) => {
|
|
if name != "title" { // preserve title tag in metadata
|
|
out.insert(name.to_string(), Value::Array(vec! [v.clone(), Value::String(content.to_string())]));
|
|
}
|
|
},
|
|
Value::Array(_) => {
|
|
match out.get_mut(name) {
|
|
Some(Value::Array(x)) => {
|
|
x.push(Value::String(content.to_string()));
|
|
},
|
|
_ => unreachable!(),
|
|
}
|
|
},
|
|
_ => unreachable!(),
|
|
}
|
|
} else {
|
|
out.insert(name.to_string(), Value::String(content.to_string()));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
CString::new(serde_json::ser::to_string(&out).unwrap()).unwrap().into_raw()
|
|
}
|
|
|
|
/// CSS selectors for page chrome that `_transform_html_inner` removes when `only_main_content`
/// is set, unless the matched element contains a `FORCE_INCLUDE_MAIN_TAGS` match.
const EXCLUDE_NON_MAIN_TAGS: [&str; 41] = [
    // Structural elements
    "header",
    "footer",
    "nav",
    "aside",
    // Headers and top bars
    ".header",
    ".top",
    ".navbar",
    "#header",
    // Footers
    ".footer",
    ".bottom",
    "#footer",
    // Sidebars
    ".sidebar",
    ".side",
    ".aside",
    "#sidebar",
    // Modals and overlays
    ".modal",
    ".popup",
    "#modal",
    ".overlay",
    // Advertising
    ".ad",
    ".ads",
    ".advert",
    "#ad",
    // Language selectors
    ".lang-selector",
    ".language",
    "#language-selector",
    // Social widgets
    ".social",
    ".social-media",
    ".social-links",
    "#social",
    // Menus and navigation
    ".menu",
    ".navigation",
    "#nav",
    // Breadcrumbs
    ".breadcrumbs",
    "#breadcrumbs",
    // Share buttons
    ".share",
    "#share",
    // Generic widgets
    ".widget",
    "#widget",
    // Cookie banners
    ".cookie",
    "#cookie",
];

/// CSS selectors that protect an element from `only_main_content` removal: an element matching
/// `EXCLUDE_NON_MAIN_TAGS` is kept when a match for one of these selectors is found within it.
const FORCE_INCLUDE_MAIN_TAGS: [&str; 1] = ["#main"];

#[derive(Deserialize)]
|
|
struct TranformHTMLOptions {
|
|
html: String,
|
|
url: String,
|
|
include_tags: Vec<String>,
|
|
exclude_tags: Vec<String>,
|
|
only_main_content: bool,
|
|
}
|
|
|
|
/// One candidate image from a `srcset` attribute (or the element's own `src`).
#[derive(Debug, Clone, PartialEq, Eq)]
struct ImageSource {
    /// Candidate URL exactly as written in the attribute.
    url: String,
    /// Numeric part of the descriptor, e.g. `2` for `2x` or `300` for `300w`.
    size: i32,
    /// Whether the descriptor ends in `x` (a pixel-density descriptor).
    is_x: bool,
}

fn _transform_html_inner(opts: TranformHTMLOptions) -> Result<String, ()> {
|
|
let mut document = parse_html().one(opts.html);
|
|
|
|
if !opts.include_tags.is_empty() {
|
|
let new_document = parse_html().one("<div></div>");
|
|
let root = new_document.select_first("div")?;
|
|
|
|
for x in opts.include_tags.iter() {
|
|
let matching_nodes: Vec<_> = document.select(x)?.collect();
|
|
for tag in matching_nodes {
|
|
root.as_node().append(tag.as_node().clone());
|
|
}
|
|
}
|
|
|
|
document = new_document;
|
|
}
|
|
|
|
while let Ok(x) = document.select_first("head") {
|
|
x.as_node().detach();
|
|
}
|
|
|
|
while let Ok(x) = document.select_first("meta") {
|
|
x.as_node().detach();
|
|
}
|
|
|
|
while let Ok(x) = document.select_first("noscript") {
|
|
x.as_node().detach();
|
|
}
|
|
|
|
while let Ok(x) = document.select_first("style") {
|
|
x.as_node().detach();
|
|
}
|
|
|
|
while let Ok(x) = document.select_first("script") {
|
|
x.as_node().detach();
|
|
}
|
|
|
|
for x in opts.exclude_tags.iter() {
|
|
// TODO: implement weird version
|
|
while let Ok(x) = document.select_first(x) {
|
|
x.as_node().detach();
|
|
}
|
|
}
|
|
|
|
if opts.only_main_content {
|
|
for x in EXCLUDE_NON_MAIN_TAGS.iter() {
|
|
let x: Vec<_> = document.select(x)?.collect();
|
|
for tag in x {
|
|
if !FORCE_INCLUDE_MAIN_TAGS.iter().any(|x| tag.as_node().select(x).is_ok_and(|mut x| x.next().is_some())) {
|
|
tag.as_node().detach();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
let srcset_images: Vec<_> = document.select("img[srcset]")?.collect();
|
|
for img in srcset_images {
|
|
let mut sizes: Vec<ImageSource> = img.attributes.borrow().get("srcset").ok_or(())?.split(",").filter_map(|x| {
|
|
let tok: Vec<&str> = x.trim().split(" ").collect();
|
|
let tok_1 = if tok.len() > 1 && !tok[1].is_empty() {
|
|
tok[1]
|
|
} else {
|
|
"1x"
|
|
};
|
|
if let Ok(parsed_size) = tok_1[..tok_1.len()-1].parse() {
|
|
Some(ImageSource {
|
|
url: tok[0].to_string(),
|
|
size: parsed_size,
|
|
is_x: tok_1.ends_with("x")
|
|
})
|
|
} else {
|
|
None
|
|
}
|
|
}).collect();
|
|
|
|
if sizes.iter().all(|x| x.is_x) {
|
|
if let Some(src) = img.attributes.borrow().get("src").map(|x| x.to_string()) {
|
|
sizes.push(ImageSource {
|
|
url: src,
|
|
size: 1,
|
|
is_x: true,
|
|
});
|
|
}
|
|
}
|
|
|
|
sizes.sort_by(|a, b| b.size.cmp(&a.size));
|
|
|
|
if let Some(biggest) = sizes.first() {
|
|
img.attributes.borrow_mut().insert("src", biggest.url.clone());
|
|
}
|
|
}
|
|
|
|
let url = Url::parse(&opts.url).map_err(|_| ())?;
|
|
|
|
let src_images: Vec<_> = document.select("img[src]")?.collect();
|
|
for img in src_images {
|
|
let old = img.attributes.borrow().get("src").map(|x| x.to_string()).ok_or(())?;
|
|
if let Ok(new) = url.join(&old) {
|
|
img.attributes.borrow_mut().insert("src", new.to_string());
|
|
}
|
|
}
|
|
|
|
let href_anchors: Vec<_> = document.select("a[href]")?.collect();
|
|
for anchor in href_anchors {
|
|
let old = anchor.attributes.borrow().get("href").map(|x| x.to_string()).ok_or(())?;
|
|
if let Ok(new) = url.join(&old) {
|
|
anchor.attributes.borrow_mut().insert("href", new.to_string());
|
|
}
|
|
}
|
|
|
|
Ok(document.to_string())
|
|
}
|
|
|
|
/// Transforms rawHtml to html (formerly removeUnwantedElements)
|
|
///
|
|
/// # Safety
|
|
/// Input options must be a C JSON string. Output will be an HTML string. Output string must be freed with free_string.
|
|
#[no_mangle]
|
|
pub unsafe extern "C" fn transform_html(opts: *const libc::c_char) -> *mut i8 {
|
|
let opts: TranformHTMLOptions = match unsafe { CStr::from_ptr(opts) }.to_str().map_err(|_| ()).and_then(|x| serde_json::de::from_str(x).map_err(|_| ())) {
|
|
Ok(x) => x,
|
|
Err(_) => {
|
|
return CString::new("RUSTFC:ERROR").unwrap().into_raw();
|
|
}
|
|
};
|
|
|
|
let out = match _transform_html_inner(opts) {
|
|
Ok(x) => x,
|
|
Err(_) => "RUSTFC:ERROR".to_string(),
|
|
};
|
|
|
|
CString::new(out).unwrap().into_raw()
|
|
}
|
|
|
|
/// Frees a string allocated in Rust-land.
///
/// Passing a null pointer is a no-op.
///
/// # Safety
/// ptr must be null or a non-freed string pointer returned by Rust code.
#[no_mangle]
pub unsafe extern "C" fn free_string(ptr: *mut i8) {
    // `CString::from_raw` must never be given a null pointer.
    if ptr.is_null() {
        return;
    }

    // SAFETY: `ptr` is non-null, and the caller guarantees it came from `CString::into_raw` in
    // this library and has not been freed yet. `cast` keeps this compiling on targets where
    // `c_char` is `u8`.
    drop(unsafe { CString::from_raw(ptr.cast()) })
}