Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Fetch any URL via Chrome CDP and convert the rendered page to clean markdown with YouTube transcript support.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/lib/extract/html-to-markdown.ts
import { Readability } from "@mozilla/readability";
import { Defuddle } from "defuddle/node";
import { JSDOM, VirtualConsole } from "jsdom";
import TurndownService from "turndown";
import { gfm } from "turndown-plugin-gfm";
import { collectMediaFromMarkdown } from "../media/markdown-media";
import type { MediaAsset } from "../media/types";
import { cleanHtml } from "./html-cleaner";

/** Page-level metadata gathered from the HTML document and refined by the extractor that wins. */
export interface HtmlConversionMetadata {
  url: string;
  canonicalUrl?: string;
  siteName?: string;
  title?: string;
  summary?: string;
  author?: string;
  publishedAt?: string;
  coverImage?: string;
  language?: string;
  /** ISO-8601 timestamp taken when `convertHtmlToMarkdown` started. */
  capturedAt: string;
}

/** Caller-facing switches for `convertHtmlToMarkdown`. */
export interface ConvertHtmlToMarkdownOptions {
  /** When true, the defuddle.md HTTP service may be queried if local output looks weak. */
  enableRemoteMarkdownFallback?: boolean;
  /** When true, `cleanHtml` is asked not to strip base64 images. */
  preserveBase64Images?: boolean;
}

/** Outcome of a conversion, including which extractor produced the markdown. */
export interface HtmlToMarkdownResult {
  metadata: HtmlConversionMetadata;
  markdown: string;
  rawHtml: string;
  cleanedHtml: string;
  media: MediaAsset[];
  /** One of "defuddle", "defuddle-api", "legacy:readability", "legacy:body". */
  conversionMethod: string;
  fallbackReason?: string;
}

type JsonObject = Record<string, unknown>;

type ConversionAttempt =
  | { ok: true; result: HtmlToMarkdownResult }
  | { ok: false; reason: string };

const MIN_CONTENT_LENGTH = 120;
const DEFUDDLE_API_ORIGIN = "https://defuddle.md";
const LOCAL_FALLBACK_SCORE_DELTA = 120;
const REMOTE_FALLBACK_SCORE_DELTA = 20;

// Phrases whose presence lowers the quality score of extracted markdown.
const LOW_QUALITY_MARKERS = [
  /Join The Conversation/i,
  /One Community\. Many Voices/i,
  /Read our community guidelines/i,
  /Create a free account to share your thoughts/i,
  /Become a Forbes Member/i,
  /Subscribe to trusted journalism/i,
  /\bComments\b/i,
];

const ARTICLE_TYPES = new Set([
  "Article",
  "NewsArticle",
  "BlogPosting",
  "WebPage",
  "ReportageNewsArticle",
]);

// Elements that never contribute readable content. Shared by the Turndown
// configuration and both HTML helpers so the three lists cannot drift apart.
const NON_CONTENT_SELECTORS = ["script", "style", "iframe", "noscript", "template", "svg", "path"];

// Scripts written without spaces between words: each character counts as one token.
const CJK_CHARACTER = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}]/gu;
// A run of letters/digits (apostrophes allowed after the first character).
const WORD_RUN = /[\p{L}\p{N}][\p{L}\p{N}']*/gu;

const turndown = new TurndownService({
  headingStyle: "atx",
  bulletListMarker: "-",
  codeBlockStyle: "fenced",
}) as TurndownService & {
  remove(selectors: string[]): void;
  addRule(
    key: string,
    rule: {
      filter: string | ((node: Node) => boolean);
      replacement: (content: string) => string;
    },
  ): void;
};

turndown.use(gfm);
turndown.remove([...NON_CONTENT_SELECTORS]);
turndown.addRule("collapseFigure", {
  filter: "figure",
  replacement(content: string) {
    return `\n\n${content.trim()}\n\n`;
  },
});
turndown.addRule("dropInvisibleAnchors", {
  // Anchors with no text and no media child would render as empty links.
  filter(node: Node) {
    return (
      node.nodeName === "A" &&
      !(node as Element).textContent?.trim() &&
      !(node as Element).querySelector("img, video, picture, source")
    );
  },
  replacement() {
    return "";
  },
});

/** Returns the first argument that is a string with non-whitespace content, trimmed. */
function pickString(...values: unknown[]): string | undefined {
  for (const value of values) {
    if (typeof value !== "string") {
      continue;
    }
    const trimmed = value.trim();
    if (trimmed) {
      return trimmed;
    }
  }
  return undefined;
}

/** Normalizes line endings, strips trailing spaces/tabs, and collapses 3+ newlines to 2. */
function normalizeMarkdown(markdown: string): string {
  return markdown
    .replace(/\r\n/g, "\n")
    .replace(/[ \t]+\n/g, "\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}

/**
 * Counts word-like tokens in `text`.
 *
 * The regex word boundary `\b` only recognises ASCII word characters, so a
 * pattern such as `\b[\p{L}]+\b` finds nothing in Cyrillic, Greek or CJK text.
 * This helper instead counts maximal letter/digit runs, and counts every
 * Han/Hiragana/Katakana character as its own token because those scripts do
 * not separate words with spaces.
 */
function countWords(text: string): number {
  const cjkCount = text.match(CJK_CHARACTER)?.length ?? 0;
  // Replace CJK characters with spaces so they are not counted a second time
  // as part of an adjacent letter run.
  const remainder = cjkCount > 0 ? text.replace(CJK_CHARACTER, " ") : text;
  return cjkCount + (remainder.match(WORD_RUN)?.length ?? 0);
}

/** Removes one pair of matching single or double quotes wrapping the trimmed value. */
function stripWrappingQuotes(value: string): string {
  const trimmed = value.trim();
  if (
    (trimmed.startsWith('"') && trimmed.endsWith('"')) ||
    (trimmed.startsWith("'") && trimmed.endsWith("'"))
  ) {
    return trimmed.slice(1, -1).trim();
  }
  return trimmed;
}

/** Drops a leading `---`-delimited frontmatter block (expects `\n` line endings). */
function stripMarkdownFrontmatter(markdown: string): string {
  return markdown.replace(/^\uFEFF?---\n[\s\S]*?\n---(?:\n|$)/, "").trim();
}

/** Strips trailing `#`s, images, link syntax, emphasis markers and wrapping quotes from a title. */
function cleanMarkdownTitle(value: string): string | undefined {
  const cleaned = stripWrappingQuotes(
    value
      .replace(/\s+#+\s*$/, "")
      .replace(/!\[[^\]]*\]\([^)]+\)/g, "")
      .replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
      .replace(/[*_`~]/g, "")
      .trim(),
  );

  return cleaned || undefined;
}

/**
 * Finds a title in a markdown document.
 *
 * Looks for a `title:` line in the frontmatter first, then for the first ATX
 * heading in the body.
 *
 * @returns The cleaned title, or `undefined` when none is found.
 */
export function extractTitleFromMarkdownDocument(markdown: string): string | undefined {
  const normalized = markdown.replace(/\r\n/g, "\n").trim();
  if (!normalized) {
    return undefined;
  }

  const frontmatterMatch = normalized.match(/^\uFEFF?---\n([\s\S]*?)\n---(?:\n|$)/);
  if (frontmatterMatch) {
    for (const line of frontmatterMatch[1].split("\n")) {
      const match = line.match(/^title:\s*(.+?)\s*$/i);
      if (!match) {
        continue;
      }

      const title = cleanMarkdownTitle(match[1]);
      if (title) {
        return title;
      }
    }
  }

  const body = stripMarkdownFrontmatter(normalized);
  const headingMatch = body.match(/^#{1,6}\s+(.+)$/m);
  if (!headingMatch) {
    return undefined;
  }

  return cleanMarkdownTitle(headingMatch[1]);
}

/** Removes blank lines and two known trailing boilerplate lines from the end of the markdown. */
function trimKnownBoilerplate(markdown: string): string {
  const normalized = normalizeMarkdown(markdown);
  const lines = normalized.split("\n");

  while (lines.length > 0) {
    const lastLine = lines[lines.length - 1]?.trim();
    if (!lastLine) {
      lines.pop();
      continue;
    }

    if (/^继续滑动看下一个$/.test(lastLine) || /^轻触阅读原文$/.test(lastLine)) {
      lines.pop();
      continue;
    }

    break;
  }

  return normalizeMarkdown(lines.join("\n"));
}

/** Builds the defuddle.md request URL with the target URL as one encoded path segment. */
function buildDefuddleApiUrl(targetUrl: string): string {
  return `${DEFUDDLE_API_ORIGIN}/${encodeURIComponent(targetUrl)}`;
}

/**
 * Requests markdown for `targetUrl` from defuddle.md.
 *
 * @returns The markdown body (frontmatter and trailing boilerplate removed) and any title found.
 * @throws Error when the response is not OK or the markdown is empty.
 */
async function fetchDefuddleApiMarkdown(
  targetUrl: string,
): Promise<{ markdown: string; title?: string }> {
  const response = await fetch(buildDefuddleApiUrl(targetUrl), {
    headers: {
      accept: "text/markdown,text/plain;q=0.9,*/*;q=0.1",
    },
    redirect: "follow",
  });

  if (!response.ok) {
    throw new Error(`defuddle.md returned ${response.status} ${response.statusText}`);
  }

  const rawMarkdown = (await response.text()).replace(/\r\n/g, "\n").trim();
  if (!rawMarkdown) {
    throw new Error("defuddle.md returned empty markdown");
  }

  // Read the title before the frontmatter is stripped; it may only exist there.
  const title = extractTitleFromMarkdownDocument(rawMarkdown);
  const markdown = trimKnownBoilerplate(stripMarkdownFrontmatter(rawMarkdown));
  if (!markdown) {
    throw new Error("defuddle.md returned empty markdown");
  }

  return {
    markdown,
    title,
  };
}

/** Parses an HTML fragment, removes non-content elements, and returns the remaining HTML. */
function sanitizeHtmlFragment(html: string): string {
  const dom = new JSDOM(`<div id="__root">${html}</div>`);
  const root = dom.window.document.querySelector("#__root");
  if (!root) {
    return html;
  }

  for (const selector of NON_CONTENT_SELECTORS) {
    root.querySelectorAll(selector).forEach((element) => element.remove());
  }

  return root.innerHTML;
}

/** Returns the whitespace-collapsed text content of `html` with non-content elements removed. */
function extractTextFromHtml(html: string): string {
  const dom = new JSDOM(`<!doctype html><html><body>${html}</body></html>`);
  const { document } = dom.window;
  for (const selector of NON_CONTENT_SELECTORS) {
    document.querySelectorAll(selector).forEach((element) => element.remove());
  }
  return document.body?.textContent?.replace(/\s+/g, " ").trim() ?? "";
}

/** Returns the first non-empty `content` of a `<meta>` matched by `name` or `property`. */
function getMetaContent(document: Document, names: string[]): string | undefined {
  for (const name of names) {
    const element =
      document.querySelector(`meta[name="${name}"]`) ??
      document.querySelector(`meta[property="${name}"]`);
    const content = element?.getAttribute("content")?.trim();
    if (content) {
      return content;
    }
  }
  return undefined;
}

/** Keeps the first tag of a language list and replaces `_` with `-` (e.g. `en_US` -> `en-US`). */
function normalizeLanguageTag(value: string | null | undefined): string | undefined {
  if (!value) {
    return undefined;
  }

  const trimmed = value.trim();
  if (!trimmed) {
    return undefined;
  }

  const primary = trimmed.split(/[,\s;]/, 1)[0]?.trim();
  if (!primary) {
    return undefined;
  }

  return primary.replace(/_/g, "-");
}

/** Flattens JSON-LD data (arrays and `@graph` containers) into a list of plain objects. */
function flattenJsonLdItems(data: unknown): JsonObject[] {
  if (!data || typeof data !== "object") {
    return [];
  }

  if (Array.isArray(data)) {
    return data.flatMap(flattenJsonLdItems);
  }

  const item = data as JsonObject;
  if (Array.isArray(item["@graph"])) {
    return (item["@graph"] as unknown[]).flatMap(flattenJsonLdItems);
  }

  return [item];
}

/** Parses every `application/ld+json` script in the document, skipping malformed blocks. */
function parseJsonLdScripts(document: Document): JsonObject[] {
  const results: JsonObject[] = [];
  document.querySelectorAll("script[type='application/ld+json']").forEach((script) => {
    try {
      const data: unknown = JSON.parse(script.textContent ?? "");
      results.push(...flattenJsonLdItems(data));
    } catch {
      // Ignore malformed json-ld blocks.
    }
  });
  return results;
}

/** Resolves a JSON-LD `author` value (string, object with `name`, or array of either). */
function extractAuthorFromJsonLd(authorData: unknown): string | undefined {
  if (typeof authorData === "string") {
    return authorData.trim() || undefined;
  }

  if (!authorData || typeof authorData !== "object") {
    return undefined;
  }

  if (Array.isArray(authorData)) {
    return (
      authorData
        .map((author) => extractAuthorFromJsonLd(author))
        .filter((value): value is string => Boolean(value))
        .join(", ") || undefined
    );
  }

  const author = authorData as JsonObject;
  return pickString(author.name);
}

/**
 * Resolves a JSON-LD `image` value to a URL string.
 *
 * Accepts a URL string, an object with a `url` property, or an array mixing
 * both; the first entry that yields a non-empty URL wins.
 */
function extractImageFromJsonLd(imageData: unknown): string | undefined {
  if (typeof imageData === "string") {
    return pickString(imageData);
  }

  if (Array.isArray(imageData)) {
    for (const entry of imageData) {
      const resolved = extractImageFromJsonLd(entry);
      if (resolved) {
        return resolved;
      }
    }
    return undefined;
  }

  if (imageData && typeof imageData === "object") {
    return pickString((imageData as JsonObject).url);
  }

  return undefined;
}

/** Returns metadata from the first JSON-LD item whose `@type` includes an article-like type. */
function extractPrimaryJsonLdMeta(document: Document): Partial<HtmlConversionMetadata> {
  for (const item of parseJsonLdScripts(document)) {
    const rawType = item["@type"];
    // `@type` may be a single string or a list; accept the item if any entry matches.
    const types: unknown[] = Array.isArray(rawType) ? rawType : [rawType];
    const isArticle = types.some((type) => typeof type === "string" && ARTICLE_TYPES.has(type));
    if (!isArticle) {
      continue;
    }

    return {
      title: pickString(item.headline, item.name),
      summary: pickString(item.description),
      author: extractAuthorFromJsonLd(item.author),
      publishedAt: pickString(item.datePublished, item.dateCreated),
      coverImage: extractImageFromJsonLd(item.image),
    };
  }

  return {};
}

/**
 * Collects page metadata from `<link>`, `<meta>`, `<time>`, `<h1>`, `<title>` and JSON-LD.
 *
 * @param html Full page HTML.
 * @param url Page URL, passed to jsdom as the document URL.
 * @param capturedAt Timestamp copied verbatim into the result.
 */
function extractPageMetadata(
  html: string,
  url: string,
  capturedAt: string,
): HtmlConversionMetadata {
  const dom = new JSDOM(html, { url });
  const { document } = dom.window;
  const jsonLd = extractPrimaryJsonLdMeta(document);

  return {
    url,
    // pickString skips an empty/whitespace-only canonical href so og:url can still apply.
    canonicalUrl: pickString(
      document.querySelector('link[rel="canonical"]')?.getAttribute("href"),
      getMetaContent(document, ["og:url"]),
    ),
    siteName: pickString(
      getMetaContent(document, ["og:site_name"]),
      document.querySelector('meta[name="application-name"]')?.getAttribute("content"),
    ),
    title: pickString(
      getMetaContent(document, ["og:title", "twitter:title"]),
      jsonLd.title,
      document.querySelector("h1")?.textContent,
      document.title,
    ),
    summary: pickString(
      getMetaContent(document, ["description", "og:description", "twitter:description"]),
      jsonLd.summary,
    ),
    author: pickString(
      getMetaContent(document, ["author", "article:author", "twitter:creator"]),
      jsonLd.author,
    ),
    publishedAt: pickString(
      document.querySelector("time[datetime]")?.getAttribute("datetime"),
      getMetaContent(document, ["article:published_time", "datePublished", "publishdate", "date"]),
      jsonLd.publishedAt,
    ),
    coverImage: pickString(
      getMetaContent(document, ["og:image", "twitter:image", "twitter:image:src"]),
      jsonLd.coverImage,
    ),
    language: pickString(
      normalizeLanguageTag(document.documentElement.getAttribute("lang")),
      normalizeLanguageTag(
        pickString(
          getMetaContent(document, ["language", "content-language", "og:locale"]),
          document.querySelector("meta[http-equiv='content-language']")?.getAttribute("content"),
        ),
      ),
    ),
    capturedAt,
  };
}

/**
 * Decides whether `markdown` is a plausible extraction of `html`.
 *
 * Empty markdown is never usable. Pages with fewer than `MIN_CONTENT_LENGTH`
 * characters of text accept any non-empty markdown; otherwise the markdown
 * must reach 80 characters or `min(200, 20% of the page text length)`.
 */
function isMarkdownUsable(markdown: string, html: string): boolean {
  const normalized = normalizeMarkdown(markdown);
  if (!normalized) {
    return false;
  }

  const htmlTextLength = extractTextFromHtml(html).length;
  if (htmlTextLength < MIN_CONTENT_LENGTH) {
    return true;
  }

  if (normalized.length >= 80) {
    return true;
  }

  return normalized.length >= Math.min(200, Math.floor(htmlTextLength * 0.2));
}

/** Counts how many of the given patterns match `markdown` at least once. */
function countMarkerHits(markdown: string, markers: RegExp[]): number {
  let hits = 0;
  for (const marker of markers) {
    if (marker.test(markdown)) {
      hits += 1;
    }
  }
  return hits;
}

/** Counts paragraphs that are not a lone link/image, not a heading, and have at least 8 words. */
function countUsefulParagraphs(markdown: string): number {
  const paragraphs = normalizeMarkdown(markdown).split(/\n{2,}/);
  let count = 0;

  for (const paragraph of paragraphs) {
    const trimmed = paragraph.trim();
    if (!trimmed) {
      continue;
    }
    if (/^!?\[[^\]]*\]\([^)]+\)$/.test(trimmed)) {
      continue;
    }
    if (/^#{1,6}\s+/.test(trimmed)) {
      continue;
    }
    if (countWords(trimmed) < 8) {
      continue;
    }
    count += 1;
  }

  return count;
}

/**
 * Heuristic quality score: words (capped at 4000) + 40 per useful paragraph
 * + 10 per heading - 180 per low-quality marker that appears.
 */
function scoreMarkdownQuality(markdown: string): number {
  const normalized = normalizeMarkdown(markdown);
  const wordCount = countWords(normalized);
  const usefulParagraphs = countUsefulParagraphs(normalized);
  const headingCount = (normalized.match(/^#{1,6}\s+/gm) || []).length;
  const markerHits = countMarkerHits(normalized, LOW_QUALITY_MARKERS);
  return Math.min(wordCount, 4000) + usefulParagraphs * 40 + headingCount * 10 - markerHits * 180;
}

/** True when the markdown hits a low-quality marker or has fewer than 6 useful paragraphs. */
function shouldCompareWithFallback(markdown: string): boolean {
  const normalized = normalizeMarkdown(markdown);
  return (
    countMarkerHits(normalized, LOW_QUALITY_MARKERS) > 0 || countUsefulParagraphs(normalized) < 6
  );
}

/** True when the markdown has a useful paragraph, a heading, a list item, or an image. */
function hasMeaningfulMarkdownStructure(markdown: string): boolean {
  const normalized = normalizeMarkdown(markdown);
  if (!normalized) {
    return false;
  }

  return (
    countUsefulParagraphs(normalized) > 0 ||
    /^#{1,6}\s+/m.test(normalized) ||
    /^[-*]\s+/m.test(normalized) ||
    /^\d+\.\s+/m.test(normalized) ||
    /!\[[^\]]*\]\([^)]+\)/.test(normalized)
  );
}

/** Remote fallback is attempted only when enabled and the local markdown is unusable or weak. */
function shouldTryRemoteMarkdownFallback(
  markdown: string,
  html: string,
  options: ConvertHtmlToMarkdownOptions,
): boolean {
  if (!options.enableRemoteMarkdownFallback) {
    return false;
  }

  return !isMarkdownUsable(markdown, html) || shouldCompareWithFallback(markdown);
}

/**
 * Chooses between the local result and the remote one.
 *
 * Remote wins when local is unusable, when only remote has meaningful
 * structure, or when remote outscores local by more than `REMOTE_FALLBACK_SCORE_DELTA`.
 */
function shouldPreferRemoteMarkdown(
  current: HtmlToMarkdownResult,
  remote: HtmlToMarkdownResult,
  html: string,
): boolean {
  if (!isMarkdownUsable(current.markdown, html)) {
    return true;
  }

  if (
    !hasMeaningfulMarkdownStructure(current.markdown) &&
    hasMeaningfulMarkdownStructure(remote.markdown)
  ) {
    return true;
  }

  return (
    scoreMarkdownQuality(remote.markdown) >
    scoreMarkdownQuality(current.markdown) + REMOTE_FALLBACK_SCORE_DELTA
  );
}

/** Produces the `fallbackReason` text recorded when the remote result replaces the local one. */
function buildRemoteFallbackReason(current: HtmlToMarkdownResult, html: string): string {
  if (!isMarkdownUsable(current.markdown, html)) {
    return current.fallbackReason
      ? `Used defuddle.md markdown fallback after local extraction failed: ${current.fallbackReason}`
      : "Used defuddle.md markdown fallback after local extraction returned empty or incomplete markdown";
  }

  return "defuddle.md produced higher-quality markdown than local extraction";
}

/**
 * Runs the local Defuddle extractor over `html`.
 *
 * Never throws: failures and unusable output are reported as `{ ok: false, reason }`.
 * `rawHtml` and `cleanedHtml` in the result are both set to the `html` argument.
 */
async function tryDefuddleConversion(
  html: string,
  url: string,
  baseMetadata: HtmlConversionMetadata,
): Promise<ConversionAttempt> {
  try {
    const virtualConsole = new VirtualConsole();
    // The listener takes no action in either branch; the explicit check only
    // documents that CSS parse errors are the expected noise.
    virtualConsole.on("jsdomError", (error: Error & { type?: string }) => {
      if (error.type === "css parsing" || /Could not parse CSS stylesheet/i.test(error.message)) {
        return;
      }
    });

    const dom = new JSDOM(html, { url, virtualConsole });
    const result = await Defuddle(dom, url, { markdown: true });
    const markdown = trimKnownBoilerplate(result.content || "");

    if (!isMarkdownUsable(markdown, html)) {
      return { ok: false, reason: "Defuddle returned empty or incomplete markdown" };
    }

    // Values reported by Defuddle take precedence over the page-level metadata.
    const metadata: HtmlConversionMetadata = {
      ...baseMetadata,
      title: pickString(result.title, baseMetadata.title),
      summary: pickString(result.description, baseMetadata.summary),
      author: pickString(result.author, baseMetadata.author),
      publishedAt: pickString(result.published, baseMetadata.publishedAt),
      coverImage: pickString(result.image, baseMetadata.coverImage),
      language: pickString(result.language, baseMetadata.language),
    };

    return {
      ok: true,
      result: {
        metadata,
        markdown,
        rawHtml: html,
        cleanedHtml: html,
        media: collectMediaFromMarkdown(markdown).concat(
          metadata.coverImage
            ? [{ url: metadata.coverImage, kind: "image", role: "cover" as const }]
            : [],
        ),
        conversionMethod: "defuddle",
      },
    };
  } catch (error) {
    return {
      ok: false,
      reason: error instanceof Error ? error.message : String(error),
    };
  }
}

/**
 * Fetches markdown for `url` from defuddle.md and wraps it as a conversion result.
 *
 * Never throws: network/HTTP failures are reported as `{ ok: false, reason }`.
 * The remote markdown is rejected only when it is both unusable relative to
 * `html` and scores below 80.
 */
async function tryDefuddleApiConversion(
  html: string,
  url: string,
  baseMetadata: HtmlConversionMetadata,
): Promise<ConversionAttempt> {
  try {
    const result = await fetchDefuddleApiMarkdown(url);
    const markdown = result.markdown;

    if (!isMarkdownUsable(markdown, html) && scoreMarkdownQuality(markdown) < 80) {
      return { ok: false, reason: "defuddle.md returned empty or incomplete markdown" };
    }

    const metadata: HtmlConversionMetadata = {
      ...baseMetadata,
      title: pickString(result.title, baseMetadata.title),
    };

    return {
      ok: true,
      result: {
        metadata,
        markdown,
        rawHtml: html,
        cleanedHtml: html,
        media: collectMediaFromMarkdown(markdown).concat(
          metadata.coverImage
            ? [{ url: metadata.coverImage, kind: "image", role: "cover" as const }]
            : [],
        ),
        conversionMethod: "defuddle-api",
      },
    };
  } catch (error) {
    return {
      ok: false,
      reason: error instanceof Error ? error.message : String(error),
    };
  }
}

/** Converts an HTML fragment to markdown with Turndown; returns "" for blank input or on error. */
function convertHtmlFragmentToMarkdown(html: string): string {
  if (!html.trim()) {
    return "";
  }

  try {
    return turndown.turndown(sanitizeHtmlFragment(html));
  } catch {
    // Best-effort: the caller falls back to plain text when this returns "".
    return "";
  }
}

/** Last-resort conversion: the page's visible text with known boilerplate trimmed. */
function fallbackPlainText(html: string): string {
  return trimKnownBoilerplate(extractTextFromHtml(html));
}

/**
 * Legacy extraction path: Readability for content selection, Turndown for markdown.
 *
 * Content is taken from the Readability article when it is non-empty,
 * otherwise from `<main>`, otherwise from `<body>`. If the markdown conversion
 * yields nothing, the page's plain text is used instead.
 */
function convertWithReadability(
  rawHtml: string,
  cleanedHtml: string,
  url: string,
  baseMetadata: HtmlConversionMetadata,
): HtmlToMarkdownResult {
  const dom = new JSDOM(cleanedHtml, { url });
  const document = dom.window.document;
  // Readability rewrites the document it parses, so hand it a clone; the
  // <main>/<body> fallbacks below must read the untouched DOM.
  const article = new Readability(document.cloneNode(true) as Document).parse();

  const articleHtml = pickString(article?.content);
  // pickString skips empty strings, so an empty article or empty <main> falls through.
  const contentHtml =
    articleHtml ??
    pickString(document.querySelector("main")?.innerHTML, document.body?.innerHTML) ??
    "";

  let markdown = contentHtml ? convertHtmlFragmentToMarkdown(contentHtml) : "";
  if (!markdown) {
    markdown = fallbackPlainText(cleanedHtml);
  }

  const metadata: HtmlConversionMetadata = {
    ...baseMetadata,
    title: pickString(article?.title, baseMetadata.title),
    summary: pickString(article?.excerpt, baseMetadata.summary),
    author: pickString(article?.byline, baseMetadata.author),
  };

  const media = collectMediaFromMarkdown(markdown);
  if (metadata.coverImage) {
    media.unshift({
      url: metadata.coverImage,
      kind: "image",
      role: "cover",
    });
  }

  return {
    metadata,
    markdown: trimKnownBoilerplate(markdown),
    rawHtml,
    cleanedHtml,
    media,
    conversionMethod: articleHtml ? "legacy:readability" : "legacy:body",
  };
}

/**
 * Converts a rendered HTML page to markdown.
 *
 * Pipeline:
 * 1. Extract page metadata and clean the HTML (cleaning errors fall back to the raw HTML).
 * 2. Try Defuddle. If it fails, use the Readability/Turndown path and record the reason.
 *    If it succeeds but looks weak, also run Readability/Turndown and keep that result only
 *    when it outscores Defuddle by more than `LOCAL_FALLBACK_SCORE_DELTA`.
 * 3. When `options.enableRemoteMarkdownFallback` is set and the selected markdown is
 *    unusable or weak, query defuddle.md and switch to it if it is preferable.
 *
 * @param html Raw page HTML.
 * @param url URL of the page; used for metadata, DOM base URL and the remote fallback.
 * @param options Optional behaviour switches.
 * @returns The selected conversion, always carrying the original `html` as `rawHtml`.
 */
export async function convertHtmlToMarkdown(
  html: string,
  url: string,
  options: ConvertHtmlToMarkdownOptions = {},
): Promise<HtmlToMarkdownResult> {
  const capturedAt = new Date().toISOString();
  const baseMetadata = extractPageMetadata(html, url, capturedAt);

  let cleanedHtml = html;
  try {
    cleanedHtml = cleanHtml(html, url, {
      removeBase64Images: !options.preserveBase64Images,
    });
  } catch {
    // Cleaning is best-effort; continue with the unmodified HTML.
    cleanedHtml = html;
  }

  let selectedResult: HtmlToMarkdownResult;
  const defuddleResult = await tryDefuddleConversion(cleanedHtml, url, baseMetadata);
  if (!defuddleResult.ok) {
    selectedResult = {
      ...convertWithReadability(html, cleanedHtml, url, baseMetadata),
      fallbackReason: defuddleResult.reason,
    };
  } else {
    // tryDefuddleConversion only saw the cleaned HTML; restore the true raw HTML.
    selectedResult = {
      ...defuddleResult.result,
      rawHtml: html,
      cleanedHtml,
    };

    if (shouldCompareWithFallback(defuddleResult.result.markdown)) {
      const fallbackResult = convertWithReadability(html, cleanedHtml, url, baseMetadata);
      const fallbackScore = scoreMarkdownQuality(fallbackResult.markdown);
      const defuddleScore = scoreMarkdownQuality(defuddleResult.result.markdown);
      if (fallbackScore > defuddleScore + LOCAL_FALLBACK_SCORE_DELTA) {
        selectedResult = {
          ...fallbackResult,
          fallbackReason: "Readability/Turndown produced higher-quality markdown than Defuddle",
        };
      }
    }
  }

  if (!shouldTryRemoteMarkdownFallback(selectedResult.markdown, cleanedHtml, options)) {
    return selectedResult;
  }

  const remoteDefuddleResult = await tryDefuddleApiConversion(cleanedHtml, url, baseMetadata);
  if (
    !remoteDefuddleResult.ok ||
    !shouldPreferRemoteMarkdown(selectedResult, remoteDefuddleResult.result, cleanedHtml)
  ) {
    return selectedResult;
  }

  return {
    ...remoteDefuddleResult.result,
    rawHtml: html,
    cleanedHtml,
    fallbackReason: buildRemoteFallbackReason(selectedResult, cleanedHtml),
  };
}