firecrawl/apps/api/src/scraper/WebScraper/utils/removeUnwantedElements.ts

74 lines
2.4 KiB
TypeScript
Raw Normal View History

import cheerio, { AnyNode, Cheerio } from "cheerio";
import { PageOptions } from "../../../lib/entities";
import { excludeNonMainTags } from "./excludeTags";
2024-06-26 21:15:42 -03:00
// Normalizes a tag option (one selector, a list of selectors, or unset/empty)
// into a list, without modifying the value owned by the caller.
const toSelectorList = (value?: string | string[]): string[] => {
  if (typeof value === "string") {
    return value ? [value] : [];
  }
  return Array.isArray(value) ? value : [];
};

/**
 * Strips unwanted markup from an HTML document according to `pageOptions`.
 *
 * Processing order:
 * 1. If `pageOptions.onlyIncludeTags` names at least one selector, every
 *    element matching any of those selectors is cloned into a fresh `<div>`
 *    and the inner HTML of that `<div>` is returned immediately; none of the
 *    later steps run.
 * 2. `script`, `style`, `iframe`, `noscript`, `meta` and `head` elements are
 *    always removed.
 * 3. Each entry of `pageOptions.removeTags` is removed. An entry that both
 *    starts and ends with `*` is treated as a case-insensitive regular
 *    expression (the text between the asterisks) and removes every element
 *    whose tag name or any `name="value"` attribute string matches it. When
 *    the entry starts with `*.`, each attribute value is additionally tested
 *    in the form `class="value"`. Any other entry is used as a selector.
 * 4. If `pageOptions.onlyMainContent` is truthy, every selector listed in
 *    `excludeNonMainTags` is removed.
 *
 * `pageOptions` is only read; a single string given for `onlyIncludeTags` or
 * `removeTags` is handled as a one-element list without being written back.
 *
 * @param html - The HTML markup to clean.
 * @param pageOptions - Options selecting which elements to keep or drop.
 * @returns The cleaned HTML, or the HTML of the collected elements when
 *   `onlyIncludeTags` is in effect.
 * @throws SyntaxError if a `*pattern*` entry is not a valid regular expression.
 */
export const removeUnwantedElements = (
  html: string,
  pageOptions: PageOptions
) => {
  const soup = cheerio.load(html);

  const includeSelectors = toSelectorList(pageOptions.onlyIncludeTags);
  if (includeSelectors.length !== 0) {
    // Create a new root element to hold the tags to keep
    const newRoot = cheerio.load("<div></div>")("div");
    includeSelectors.forEach((selector) => {
      soup(selector).each((_index, element) => {
        newRoot.append(soup(element).clone());
      });
    });
    return newRoot.html();
  }

  soup("script, style, iframe, noscript, meta, head").remove();

  toSelectorList(pageOptions.removeTags).forEach((tag) => {
    let elementsToRemove: Cheerio<AnyNode>;

    if (tag.startsWith("*") && tag.endsWith("*")) {
      // Wildcard form: the text between the asterisks is a regex source.
      const regexPattern = new RegExp(tag.slice(1, -1), "i");
      // Loop-invariant, so decide once per entry rather than once per element.
      const alsoTestAsClass = tag.startsWith("*.");

      elementsToRemove = soup("*").filter((_index, element) => {
        if (element.type === "tag") {
          const attributes = element.attribs;
          const attributeNames = Object.keys(attributes);

          const tagNameMatches = regexPattern.test(element.name);
          const attributesMatch = attributeNames.some((attr) =>
            regexPattern.test(`${attr}="${attributes[attr]}"`)
          );
          // Scoped per element so one element's result can never leak into
          // the decision for another.
          const classMatch =
            alsoTestAsClass &&
            attributeNames.some((attr) =>
              regexPattern.test(`class="${attributes[attr]}"`)
            );

          return tagNameMatches || attributesMatch || classMatch;
        }
        return false;
      });
    } else {
      elementsToRemove = soup(tag);
    }

    elementsToRemove.remove();
  });

  if (pageOptions.onlyMainContent) {
    excludeNonMainTags.forEach((tag) => {
      soup(tag).remove();
    });
  }

  return soup.html();
};