Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Fetch any URL via Chrome CDP and convert the rendered page to clean markdown with YouTube transcript support.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/lib/extract/html-cleaner.ts
import { JSDOM } from "jsdom";

/** Options controlling which cleanup passes {@link cleanHtml} applies. */
export interface CleanHtmlOptions {
  /** Remove elements matching the ad/tracking selectors. Defaults to `true`. */
  removeAds?: boolean;
  /** Remove `data:` images and inline `data:image` backgrounds. Defaults to `true`. */
  removeBase64Images?: boolean;
  /** Strip navigation chrome and reduce the body to the detected main content. Defaults to `true`. */
  onlyMainContent?: boolean;
  /** When at least one element matches, the body is replaced by clones of all matches. */
  includeSelectors?: string[];
  /** Elements matching any of these selectors are removed. */
  excludeSelectors?: string[];
}

/** Non-content elements that are removed unconditionally. */
const ALWAYS_REMOVE_SELECTORS = [
  "script",
  "style",
  "noscript",
  "link[rel='stylesheet']",
  "[hidden]",
  "[aria-hidden='true']",
  "[style*='display: none']",
  "[style*='display:none']",
  "[style*='visibility: hidden']",
  "[style*='visibility:hidden']",
  "svg[aria-hidden='true']",
  "svg.icon",
  "svg[class*='icon']",
  "template",
  "meta",
  "iframe",
  "canvas",
  "object",
  "embed",
  "form",
  "input",
  "select",
  "textarea",
  "button",
];

/** Modals, consent banners and fixed/sticky elements; removed unconditionally. */
const OVERLAY_SELECTORS = [
  "[class*='modal']",
  "[class*='popup']",
  "[class*='overlay']",
  "[class*='dialog']",
  "[role='dialog']",
  "[role='alertdialog']",
  "[class*='cookie']",
  "[class*='consent']",
  "[class*='gdpr']",
  "[class*='privacy-banner']",
  "[class*='notification-bar']",
  "[id*='cookie']",
  "[id*='consent']",
  "[id*='gdpr']",
  "[style*='position: fixed']",
  "[style*='position:fixed']",
  "[style*='position: sticky']",
  "[style*='position:sticky']",
];

/** Page chrome removed when `onlyMainContent` is enabled (subject to protection). */
const NAVIGATION_SELECTORS = [
  "header",
  "footer",
  "nav",
  "aside",
  ".header",
  ".top",
  ".navbar",
  "#header",
  ".footer",
  ".bottom",
  "#footer",
  ".sidebar",
  ".side",
  ".aside",
  "#sidebar",
  ".modal",
  ".popup",
  "#modal",
  ".overlay",
  ".ad",
  ".ads",
  ".advert",
  "#ad",
  ".lang-selector",
  ".language",
  "#language-selector",
  ".social",
  ".social-media",
  ".social-links",
  "#social",
  ".menu",
  ".navigation",
  "#nav",
  ".breadcrumbs",
  "#breadcrumbs",
  ".share",
  "#share",
  ".widget",
  "#widget",
  ".cookie",
  "#cookie",
];

/** Elements that match (or contain a match for) these are never removed as navigation. */
const FORCE_INCLUDE_SELECTORS = [
  "#main",
  "#content",
  "#main-content",
  "#article",
  "#post",
  "#page-content",
  "main",
  "article",
  "[role='main']",
  ".main-content",
  ".content",
  ".post-content",
  ".article-content",
  ".entry-content",
  ".page-content",
  ".article-body",
  ".post-body",
  ".story-content",
  ".blog-content",
];

/** Ad containers and tracking pixels removed when `removeAds` is enabled. */
const AD_SELECTORS = [
  "ins.adsbygoogle",
  ".google-ad",
  ".adsense",
  "[data-ad]",
  "[data-ads]",
  "[data-ad-slot]",
  "[data-ad-client]",
  ".ad-container",
  ".ad-wrapper",
  ".advertisement",
  ".sponsored-content",
  "img[width='1'][height='1']",
  "img[src*='pixel']",
  "img[src*='tracking']",
  "img[src*='analytics']",
];

/** Well-known content containers probed, in this order, by `findMainContent`. */
const MAIN_CONTENT_SELECTORS = [
  "#content",
  "#main-content",
  "#main",
  ".content",
  ".main-content",
  ".post-content",
  ".article-content",
  ".entry-content",
  ".page-content",
  ".article-body",
  ".post-body",
  ".story-content",
  ".blog-content",
];

/** Class/id fragments that suggest an element holds primary content. */
const POSITIVE_HINT_PATTERN = /article|content|post|body|main|entry/i;

/**
 * Class/id fragments that suggest boilerplate. The ad alternative only matches
 * when it is not embedded in a longer run of letters, so names such as `lead`,
 * `thread`, `download` or `shadow` are not mistaken for advertising.
 */
const NEGATIVE_HINT_PATTERN =
  /comment|sidebar|footer|nav|menu|header|widget|(?<![a-z])ad(?:s|sense|vert[a-z]*)?(?![a-z])/i;

/** Matches values that already carry an explicit http(s) scheme. */
const HTTP_URL_PATTERN = /^https?:\/\//i;

/** Numeric value of `NodeFilter.SHOW_COMMENT`, used when the window is unavailable. */
const SHOW_COMMENT_FALLBACK = 128;

/**
 * Ratio of the trimmed text length inside `<a>` descendants to the trimmed text
 * length of the whole element.
 *
 * @returns `1` when the element has no text at all, so that empty elements are
 * treated as pure navigation by callers.
 */
function getLinkDensity(element: Element): number {
  const totalLength = (element.textContent || "").trim().length;
  if (totalLength === 0) {
    return 1;
  }

  let linkLength = 0;
  element.querySelectorAll("a").forEach((link) => {
    linkLength += (link.textContent || "").trim().length;
  });

  return linkLength / totalLength;
}

/**
 * Heuristic score estimating how likely an element is to hold the page's main
 * content. Text volume, paragraphs, headings and images add to the score;
 * links, list items, high link density and boilerplate-looking class/id names
 * subtract from it.
 */
function getContentScore(element: Element): number {
  const textLength = (element.textContent || "").trim().length;

  // Text contributes at most 50 points so very long pages do not dominate.
  let score = Math.min(textLength / 100, 50);
  score += element.querySelectorAll("p").length * 3;
  score += element.querySelectorAll("h1, h2, h3, h4, h5, h6").length * 2;
  score += element.querySelectorAll("img").length;

  score -= element.querySelectorAll("a").length * 0.5;
  score -= element.querySelectorAll("li").length * 0.2;

  const linkDensity = getLinkDensity(element);
  if (linkDensity > 0.5) {
    score -= 30;
  } else if (linkDensity > 0.3) {
    score -= 15;
  }

  // `className` is not a plain string on SVG elements, hence the typeof check.
  const className = typeof element.className === "string" ? element.className : "";
  const classAndId = `${className} ${element.id || ""}`;
  if (POSITIVE_HINT_PATTERN.test(classAndId)) {
    score += 25;
  }
  if (NEGATIVE_HINT_PATTERN.test(classAndId)) {
    score -= 25;
  }

  return score;
}

/**
 * Reports whether an element looks like a navigation block: either more than
 * half of its text is link text, or it has more than five list items and the
 * number of links exceeds 80% of the number of list items.
 */
function looksLikeNavigation(element: Element): boolean {
  if (getLinkDensity(element) > 0.5) {
    return true;
  }

  const listItemCount = element.querySelectorAll("li").length;
  const linkCount = element.querySelectorAll("a").length;
  return listItemCount > 5 && linkCount > listItemCount * 0.8;
}

/**
 * Reports whether `element` is the document's `<html>` or `<body>` element.
 * Those must never be removed: the broad substring selectors used by this
 * module can match them (for example `<body class="modal-open">`), and removing
 * them would discard the entire page.
 */
function isRootElement(document: Document, element: Element): boolean {
  return element === document.documentElement || element === document.body;
}

/**
 * Removes every element matching any of the given selectors, except the
 * document's `<html>` and `<body>` elements. Selectors that the DOM
 * implementation rejects are skipped.
 */
function removeElements(document: Document, selectors: readonly string[]): void {
  for (const selector of selectors) {
    try {
      document.querySelectorAll(selector).forEach((element) => {
        if (!isRootElement(document, element)) {
          element.remove();
        }
      });
    } catch {
      // Ignore unsupported selectors.
    }
  }
}

/**
 * Reports whether `element` matches, or has a descendant matching, any of the
 * given selectors. Selectors that throw are treated as non-matching.
 */
function matchesOrContainsAny(element: Element, selectors: readonly string[]): boolean {
  return selectors.some((selector) => {
    try {
      return element.matches(selector) || element.querySelector(selector) !== null;
    } catch {
      return false;
    }
  });
}

/**
 * Removes elements matching `selectorsToRemove` unless the element is the
 * document's `<html>`/`<body>`, or it matches or contains a match for one of
 * `protectedSelectors`.
 */
function removeWithProtection(
  document: Document,
  selectorsToRemove: readonly string[],
  protectedSelectors: readonly string[],
): void {
  for (const selector of selectorsToRemove) {
    try {
      document.querySelectorAll(selector).forEach((element) => {
        if (isRootElement(document, element)) {
          return;
        }
        if (matchesOrContainsAny(element, protectedSelectors)) {
          return;
        }
        element.remove();
      });
    } catch {
      // Ignore unsupported selectors.
    }
  }
}

/**
 * Type guard: the element exists, has at least 100 characters of trimmed text
 * and does not look like a navigation block.
 */
function isValidContent(element: Element | null): element is Element {
  if (!element) {
    return false;
  }
  if ((element.textContent || "").trim().length < 100) {
    return false;
  }
  return !looksLikeNavigation(element);
}

/**
 * Locates the element most likely to hold the page's main content.
 *
 * Probes, in order: `<main>`, `[role="main"]`, a lone `<article>`, and the
 * well-known content selectors. If none qualifies, every `div`, `section` and
 * `article` with at least 200 characters of text is scored and the best one is
 * returned, provided its score exceeds 20.
 *
 * @returns The chosen element, or `null` when nothing qualifies.
 */
function findMainContent(document: Document): Element | null {
  const main = document.querySelector("main");
  if (isValidContent(main) && getLinkDensity(main) < 0.4) {
    return main;
  }

  const roleMain = document.querySelector('[role="main"]');
  if (isValidContent(roleMain) && getLinkDensity(roleMain) < 0.4) {
    return roleMain;
  }

  // Several <article> elements usually indicate a listing page, so only a
  // single article is accepted as the main content.
  const articles = document.querySelectorAll("article");
  const onlyArticle = articles.length === 1 ? (articles[0] ?? null) : null;
  if (isValidContent(onlyArticle)) {
    return onlyArticle;
  }

  for (const selector of MAIN_CONTENT_SELECTORS) {
    try {
      const element = document.querySelector(selector);
      if (isValidContent(element) && getLinkDensity(element) < 0.4) {
        return element;
      }
    } catch {
      // Ignore invalid selectors.
    }
  }

  // Single pass keeping the first element with the highest positive score.
  let best: Element | null = null;
  let bestScore = 0;
  for (const element of Array.from(document.querySelectorAll("div, section, article"))) {
    if ((element.textContent || "").trim().length < 200) {
      continue;
    }
    const score = getContentScore(element);
    if (score > bestScore) {
      best = element;
      bestScore = score;
    }
  }

  return bestScore > 20 ? best : null;
}

/**
 * Strips inline base64 payloads: `<img>` elements whose `src` is a `data:` URL,
 * `background`/`background-image` declarations with a `data:image` URL inside
 * `style` attributes, and `<source>` elements whose `src`/`srcset` use `data:`.
 */
function removeBase64ImagesFromDocument(document: Document): void {
  document.querySelectorAll("img[src^='data:']").forEach((element) => element.remove());

  document.querySelectorAll("[style*='data:image']").forEach((element) => {
    const style = element.getAttribute("style");
    if (!style) {
      return;
    }

    const cleanedStyle = style.replace(
      /background(-image)?:\s*url\([^)]*data:image[^)]*\)[^;]*;?/gi,
      "",
    );

    // Drop the attribute entirely when nothing but the background was declared.
    if (cleanedStyle.trim()) {
      element.setAttribute("style", cleanedStyle);
    } else {
      element.removeAttribute("style");
    }
  });

  document
    .querySelectorAll("source[src^='data:'], source[srcset*='data:']")
    .forEach((element) => element.remove());
}

/**
 * Resolves `value` against `baseUrl`.
 *
 * @returns The absolute URL string, or `null` when the URL cannot be parsed.
 */
function makeAbsoluteUrl(value: string, baseUrl: string): string | null {
  try {
    return new URL(value, baseUrl).toString();
  } catch {
    return null;
  }
}

/**
 * Rewrites relative `src` and `href` attributes to absolute URLs.
 *
 * Values that already start with `http://`/`https://`, protocol-relative
 * values (`//…`), `data:` sources, and fragment, `mailto:`, `tel:` and
 * `javascript:` hrefs are left untouched. Values that fail to resolve are also
 * left as they are.
 */
function convertRelativeUrls(document: Document, baseUrl: string): void {
  document.querySelectorAll("[src]").forEach((element) => {
    const src = element.getAttribute("src");
    if (!src || HTTP_URL_PATTERN.test(src) || src.startsWith("//") || src.startsWith("data:")) {
      return;
    }

    const absolute = makeAbsoluteUrl(src, baseUrl);
    if (absolute) {
      element.setAttribute("src", absolute);
    }
  });

  document.querySelectorAll("[href]").forEach((element) => {
    const href = element.getAttribute("href");
    if (
      !href ||
      HTTP_URL_PATTERN.test(href) ||
      href.startsWith("//") ||
      href.startsWith("#") ||
      href.startsWith("mailto:") ||
      href.startsWith("tel:") ||
      href.startsWith("javascript:")
    ) {
      return;
    }

    const absolute = makeAbsoluteUrl(href, baseUrl);
    if (absolute) {
      element.setAttribute("href", absolute);
    }
  });
}

/** Removes every comment node from the document. */
function removeComments(document: Document): void {
  const walker = document.createTreeWalker(
    document,
    document.defaultView?.NodeFilter.SHOW_COMMENT ?? SHOW_COMMENT_FALLBACK,
  );

  // Collect first, then detach, so the walker is not disturbed mid-traversal.
  const comments: Comment[] = [];
  while (walker.nextNode()) {
    comments.push(walker.currentNode as Comment);
  }
  comments.forEach((comment) => comment.parentNode?.removeChild(comment));
}

/**
 * Parses `html` and returns a cleaned serialization of the document.
 *
 * Passes run in this order: unconditional removals, overlay removals, optional
 * ad removal, caller-supplied exclusions, optional reduction to the main
 * content, optional reduction to `includeSelectors` matches, optional base64
 * image stripping, comment removal, and relative URL resolution. Because
 * `includeSelectors` runs after the main-content reduction, it only sees what
 * that step kept.
 *
 * @param html - Markup to clean.
 * @param baseUrl - Document URL; used as the JSDOM `url` and for resolving
 * relative `src`/`href` values.
 * @param options - Cleanup toggles; see {@link CleanHtmlOptions}.
 * @returns The outer HTML of the cleaned document, or the original `html` if
 * serialization produced an empty string.
 */
export function cleanHtml(
  html: string,
  baseUrl: string,
  options: CleanHtmlOptions = {},
): string {
  const {
    removeAds = true,
    removeBase64Images = true,
    onlyMainContent = true,
    includeSelectors,
    excludeSelectors,
  } = options;

  const dom = new JSDOM(html, { url: baseUrl });
  try {
    const { document } = dom.window;

    removeElements(document, ALWAYS_REMOVE_SELECTORS);
    removeElements(document, OVERLAY_SELECTORS);

    if (removeAds) {
      removeElements(document, AD_SELECTORS);
    }

    if (excludeSelectors?.length) {
      removeElements(document, excludeSelectors);
    }

    if (onlyMainContent) {
      removeWithProtection(document, NAVIGATION_SELECTORS, FORCE_INCLUDE_SELECTORS);

      const mainContent = findMainContent(document);
      // When the detected content is <body>/<html> itself (e.g. <body id="content">)
      // there is nothing to narrow down, and cloning it into the body would nest it.
      if (mainContent && document.body && !isRootElement(document, mainContent)) {
        const clone = mainContent.cloneNode(true);
        document.body.innerHTML = "";
        document.body.appendChild(clone);
      }
    }

    const body = document.body;
    if (includeSelectors?.length && body) {
      const matchedElements: Element[] = [];
      for (const selector of includeSelectors) {
        try {
          document.querySelectorAll(selector).forEach((element) => {
            matchedElements.push(element.cloneNode(true) as Element);
          });
        } catch {
          // Ignore invalid selectors.
        }
      }

      // Leave the body untouched when nothing matched.
      if (matchedElements.length > 0) {
        body.innerHTML = "";
        matchedElements.forEach((element) => body.appendChild(element));
      }
    }

    if (removeBase64Images) {
      removeBase64ImagesFromDocument(document);
    }

    removeComments(document);
    convertRelativeUrls(document, baseUrl);

    return document.documentElement.outerHTML || html;
  } finally {
    // Release the resources held by the JSDOM window once the result is serialized.
    dom.window.close();
  }
}