Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Fetch any URL via Chrome CDP and convert the rendered page to clean markdown with YouTube transcript support.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/lib/extract/html-extractor.ts
import { Readability } from "@mozilla/readability";
import { JSDOM } from "jsdom";
import type { ExtractedDocument } from "./document";

/**
 * Looks up `content` attributes by CSS selector, in priority order.
 *
 * For each selector only the first matching element is inspected; if its
 * `content` attribute is missing or blank after trimming, the next selector
 * is tried.
 *
 * @param document - DOM document to query.
 * @param selectors - CSS selectors, most preferred first.
 * @returns The first non-empty trimmed `content` value, or `undefined`.
 */
function getMetaContent(document: Document, selectors: string[]): string | undefined {
  for (const selector of selectors) {
    const value = document.querySelector(selector)?.getAttribute("content")?.trim();
    if (value) {
      return value;
    }
  }
  return undefined;
}

/**
 * Parses an HTML string and builds an `ExtractedDocument` from it.
 *
 * Page-level metadata is read from `<link>`/`<meta>` tags; the main content,
 * title, excerpt and byline come from Readability when it yields a result,
 * with fallbacks to metadata and raw `<main>`/`<body>` markup otherwise.
 *
 * @param input - `url` is used as the jsdom document URL and echoed back as
 *   `url`; `html` is the markup to parse; `adapter` labels the result and
 *   defaults to `"generic"`.
 * @returns The extracted document. `content` is an empty array when no HTML
 *   content could be found.
 */
export function extractDocumentFromHtml(input: {
  url: string;
  html: string;
  adapter?: string;
}): ExtractedDocument {
  const dom = new JSDOM(input.html, { url: input.url });
  const document = dom.window.document;

  // `||` rather than `??`: a present-but-blank canonical href trims to "",
  // which must still fall back to og:url instead of being returned as "".
  const canonicalUrl =
    document.querySelector('link[rel="canonical"]')?.getAttribute("href")?.trim() ||
    getMetaContent(document, ['meta[property="og:url"]']);

  const siteName = getMetaContent(document, [
    'meta[property="og:site_name"]',
    'meta[name="application-name"]',
  ]);

  const metadataAuthor = getMetaContent(document, [
    'meta[name="author"]',
    'meta[property="article:author"]',
    'meta[name="twitter:creator"]',
  ]);

  const publishedAt = getMetaContent(document, [
    'meta[property="article:published_time"]',
    'meta[name="pubdate"]',
    'meta[name="date"]',
    'meta[itemprop="datePublished"]',
  ]);

  // Readability.parse() modifies the DOM it is given. Parse a deep clone so
  // the fallbacks below (og:title, document.title, <main>/<body> markup,
  // <html lang>) still see the page as it was originally parsed.
  const readabilityDocument = document.cloneNode(true) as Document;
  const article = new Readability(readabilityDocument).parse();

  const title =
    article?.title?.trim() ||
    getMetaContent(document, ['meta[property="og:title"]']) ||
    document.title.trim() ||
    undefined;

  const summary =
    article?.excerpt?.trim() ||
    getMetaContent(document, [
      'meta[name="description"]',
      'meta[property="og:description"]',
      'meta[name="twitter:description"]',
    ]);

  // Prefer Readability's cleaned article; otherwise fall back to the raw
  // <main> markup, then the whole <body>, then no content at all.
  const contentHtml =
    article?.content?.trim() ||
    document.querySelector("main")?.innerHTML?.trim() ||
    document.body?.innerHTML?.trim() ||
    "";

  const author = article?.byline?.trim() || metadataAuthor;

  return {
    url: input.url,
    canonicalUrl,
    title,
    author,
    siteName,
    publishedAt,
    summary,
    adapter: input.adapter ?? "generic",
    metadata: {
      language: document.documentElement.lang || undefined,
    },
    content: contentHtml ? [{ type: "html", html: contentHtml }] : [],
  };
}