Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Fetch any URL via Chrome CDP and convert the rendered page to clean markdown with YouTube transcript support.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/lib/media/markdown-media.ts
import remarkGfm from "remark-gfm";
import remarkParse from "remark-parse";
import { unified } from "unified";
import type { ContentBlock, ExtractedDocument } from "../extract/document";
import {
  isDataUri,
  normalizeContentType,
  normalizeMediaUrl,
  resolveExtensionFromContentType,
  resolveExtensionFromUrl,
  resolveKindFromExtension,
} from "./media-utils";
import type { MediaAsset, MediaReplacement } from "./types";

/**
 * Inline markdown link or image whose destination is an http(s) or data URL.
 * Capture groups: 1 = label including its brackets (and a leading `!` for
 * images), 2 = optional `<`, 3 = the URL, 4 = optional `>`.
 */
const MARKDOWN_LINK_RE =
  /(!?\[[^\]\n]*\])\((<)?((?:https?:\/\/[^)\s>]+)|(?:data:[^)>\s]+))(>)?\)/g;

/**
 * A `coverImage: "<url>"` line. Capture groups: 1 = key plus opening quote,
 * 2 = the URL, 3 = closing quote.
 */
const FRONTMATTER_COVER_RE = /^(coverImage:\s*")((?:https?:\/\/[^"]+)|(?:data:[^"]+))(")/m;

/** Any bare http(s) or data URL appearing in text. */
const RAW_URL_RE = /(?:https?:\/\/[^\s<>"')\]]+|data:[^\s<>"')\]]+)/g;

/** Leading `---` fenced block at the very start of a document (LF or CRLF). */
const FRONTMATTER_BLOCK_RE = /^---\r?\n([\s\S]*?)\r?\n---/;

/** Length of the `data:` scheme prefix that precedes the media type. */
const DATA_URI_PREFIX_LENGTH = 5;

/** The subset of markdown AST node fields this module reads. */
interface MarkdownAstNode {
  type: string;
  url?: string | null;
  alt?: string | null;
  title?: string | null;
  value?: string | null;
  children?: MarkdownAstNode[];
  position?: {
    start?: { offset?: number | null };
    end?: { offset?: number | null };
  };
}

/** A half-open `[start, end)` slice of the source text and the text replacing it. */
interface MarkdownReplacementRange {
  start: number;
  end: number;
  value: string;
}

/**
 * Returns the media-type portion of a data URI: the text between `data:` and
 * the first `;` (parameters) or `,` (payload), whichever comes first.
 *
 * Callers are expected to have checked `isDataUri(rawUrl)` already.
 */
function extractDataUriContentType(rawUrl: string): string {
  const rest = rawUrl.slice(DATA_URI_PREFIX_LENGTH);
  const end = rest.search(/[;,]/);
  return end === -1 ? rest : rest.slice(0, end);
}

/**
 * Infers whether a markdown link points at an image or a video.
 *
 * Precedence: an image label (`![...]`) wins, then a label mentioning
 * "video" / "gif" / "animated gif", then whatever the URL itself suggests.
 *
 * @param label  The bracketed label as captured by `MARKDOWN_LINK_RE`.
 * @param rawUrl The link destination.
 * @returns The inferred kind, or `undefined` when nothing identifies it.
 */
function inferMediaKindFromLabel(label: string, rawUrl: string): "image" | "video" | undefined {
  if (label.startsWith("![")) {
    return "image";
  }

  const normalizedLabel = label.replace(/[!\[\]]/g, "").trim().toLowerCase();
  if (/\b(video|animated[_ -]?gif|gif)\b/.test(normalizedLabel)) {
    return "video";
  }

  return inferMediaKindFromRawUrl(rawUrl);
}

/**
 * Infers the media kind from the URL alone: from the declared content type
 * for data URIs, otherwise from the URL's file extension.
 */
function inferMediaKindFromRawUrl(rawUrl: string): "image" | "video" | undefined {
  if (isDataUri(rawUrl)) {
    const contentType = normalizeContentType(extractDataUriContentType(rawUrl));
    if (contentType.startsWith("image/")) {
      return "image";
    }
    if (contentType.startsWith("video/")) {
      return "video";
    }
    return undefined;
  }

  return resolveKindFromExtension(resolveExtensionFromUrl(rawUrl));
}

/**
 * Appends `media` to `assets` with its URL normalized, unless the normalized
 * URL is empty or has already been recorded in `seen`.
 */
function pushMedia(assets: MediaAsset[], seen: Set<string>, media: MediaAsset): void {
  const normalizedUrl = normalizeMediaUrl(media.url);
  if (!normalizedUrl || seen.has(normalizedUrl)) {
    return;
  }
  seen.add(normalizedUrl);
  assets.push({
    ...media,
    url: normalizedUrl,
  });
}

/** Returns the node's source offsets, or `null` when they are missing or inconsistent. */
function getNodeOffsets(node: MarkdownAstNode): { start: number; end: number } | null {
  const start = node.position?.start?.offset;
  const end = node.position?.end?.offset;
  if (typeof start !== "number" || typeof end !== "number" || start < 0 || end < start) {
    return null;
  }
  return { start, end };
}

/** Backslash-escapes `\`, `[` and `]` so the value is safe inside a link label. */
function escapeMarkdownLabel(value: string): string {
  return value.replace(/\\/g, "\\\\").replace(/\[/g, "\\[").replace(/\]/g, "\\]");
}

/** Backslash-escapes `\` and `"` so the value is safe inside a double-quoted title. */
function escapeMarkdownTitle(value: string): string {
  return value.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
}

/** Wraps the URL in `<...>` when it contains whitespace, parentheses or angle brackets. */
function formatMarkdownDestination(url: string): string {
  return /[\s()<>]/.test(url) ? `<${url}>` : url;
}

/** Serializes an image node back to `![alt](url "title")` with a normalized URL. */
function serializeImageNode(node: MarkdownAstNode): string {
  const rawUrl = node.url ?? "";
  const normalizedUrl = normalizeMediaUrl(rawUrl);
  const alt = escapeMarkdownLabel(node.alt ?? "");
  const title = node.title ? ` "${escapeMarkdownTitle(node.title)}"` : "";
  return `![${alt}](${formatMarkdownDestination(normalizedUrl)}${title})`;
}

/**
 * Serializes an image wrapped in a link. When the link has no URL, or points
 * at the same (normalized) URL as the image, the wrapper is dropped and only
 * the image is emitted.
 */
function serializeLinkedImageNode(linkNode: MarkdownAstNode, imageNode: MarkdownAstNode): string {
  const imageMarkdown = serializeImageNode(imageNode);
  const imageUrl = normalizeMediaUrl(imageNode.url ?? "");
  const linkUrl = normalizeMediaUrl(linkNode.url ?? "");

  if (!linkUrl || linkUrl === imageUrl) {
    return imageMarkdown;
  }

  const title = linkNode.title ? ` "${escapeMarkdownTitle(linkNode.title)}"` : "";
  return `[${imageMarkdown}](${formatMarkdownDestination(linkUrl)}${title})`;
}

/** True when `node` is a paragraph whose only child is text that trims to `expectedValue`. */
function isParagraphWithSingleText(node: MarkdownAstNode | undefined, expectedValue: string): boolean {
  if (node?.type !== "paragraph" || node.children?.length !== 1) {
    return false;
  }

  const child = node.children[0];
  return child?.type === "text" && child.value?.trim() === expectedValue;
}

/** Returns the image node when `node` is a paragraph containing exactly one image, else `null`. */
function getSingleImageFromParagraph(node: MarkdownAstNode | undefined): MarkdownAstNode | null {
  if (node?.type !== "paragraph" || node.children?.length !== 1) {
    return null;
  }

  const onlyChild = node.children[0];
  return onlyChild?.type === "image" ? onlyChild : null;
}

/**
 * Recognizes the closing half of a linked image that was split across
 * paragraphs: a paragraph whose children are text `](`, a link, and text `)`.
 *
 * @returns The link's URL, or `null` when the paragraph does not have that shape.
 */
function extractBrokenLinkedImageDestination(node: MarkdownAstNode | undefined): string | null {
  if (node?.type !== "paragraph") {
    return null;
  }

  const children = node.children ?? [];
  if (children.length !== 3) {
    return null;
  }

  const [prefix, linkNode, suffix] = children;
  if (prefix?.type !== "text" || prefix.value?.trim() !== "](") {
    return null;
  }
  if (linkNode?.type !== "link" || !linkNode.url) {
    return null;
  }
  if (suffix?.type !== "text" || suffix.value?.trim() !== ")") {
    return null;
  }

  return linkNode.url;
}

/**
 * Walks the tree and records a replacement for every link node whose single
 * child is an image. Matching links are not descended into.
 */
function collectLinkedImageReplacements(
  node: MarkdownAstNode,
  replacements: MarkdownReplacementRange[],
): void {
  const children = node.children ?? [];
  const onlyChild = children[0];

  if (node.type === "link" && children.length === 1 && onlyChild?.type === "image") {
    const offsets = getNodeOffsets(node);
    if (offsets) {
      replacements.push({
        start: offsets.start,
        end: offsets.end,
        value: serializeLinkedImageNode(node, onlyChild),
      });
    }
    return;
  }

  for (const child of children) {
    collectLinkedImageReplacements(child, replacements);
  }
}

/**
 * Walks the tree looking for three consecutive sibling paragraphs of the form
 * `[` / single image / `](url)` and records one replacement spanning all
 * three that re-joins them into a single linked image.
 */
function collectBrokenLinkedImageReplacements(
  node: MarkdownAstNode,
  replacements: MarkdownReplacementRange[],
): void {
  const children = node.children ?? [];
  for (let index = 0; index <= children.length - 3; index += 1) {
    const openParagraph = children[index];
    const imageParagraph = children[index + 1];
    const closeParagraph = children[index + 2];

    if (!isParagraphWithSingleText(openParagraph, "[")) {
      continue;
    }

    const imageNode = getSingleImageFromParagraph(imageParagraph);
    if (!imageNode) {
      continue;
    }

    const linkUrl = extractBrokenLinkedImageDestination(closeParagraph);
    if (!linkUrl) {
      continue;
    }

    const start = openParagraph?.position?.start?.offset;
    const end = closeParagraph?.position?.end?.offset;
    if (typeof start !== "number" || typeof end !== "number" || end < start) {
      continue;
    }

    replacements.push({
      start,
      end,
      value: serializeLinkedImageNode({ type: "link", url: linkUrl }, imageNode),
    });

    // Skip the two paragraphs just consumed; the loop increment skips the third step.
    index += 2;
  }

  for (const child of children) {
    collectBrokenLinkedImageReplacements(child, replacements);
  }
}

/**
 * Applies the replacements to `source`, working from the end of the string
 * backwards so that earlier offsets stay valid. A range that overlaps one
 * already applied is skipped, since splicing it would corrupt the output.
 */
function applyReplacements(source: string, replacements: MarkdownReplacementRange[]): string {
  if (replacements.length === 0) {
    return source;
  }

  const sorted = [...replacements].sort((left, right) => right.start - left.start);
  let result = source;
  let previousStart = Number.POSITIVE_INFINITY;
  for (const replacement of sorted) {
    if (replacement.end > previousStart) {
      continue;
    }
    result = `${result.slice(0, replacement.start)}${replacement.value}${result.slice(replacement.end)}`;
    previousStart = replacement.start;
  }
  return result;
}

/**
 * Parses the markdown and rewrites linked images (including ones split across
 * paragraphs) into their canonical serialized form.
 */
function normalizeLinkedImageMarkdown(markdown: string): string {
  let tree: MarkdownAstNode;
  try {
    tree = unified().use(remarkParse).use(remarkGfm).parse(markdown) as MarkdownAstNode;
  } catch {
    // Best effort: if the parser rejects the input, leave the markdown untouched.
    return markdown;
  }

  const replacements: MarkdownReplacementRange[] = [];
  collectLinkedImageReplacements(tree, replacements);
  collectBrokenLinkedImageReplacements(tree, replacements);
  return applyReplacements(markdown, replacements);
}

/**
 * Normalizes every media URL in a markdown document: inline links and images,
 * the front-matter `coverImage`, any remaining bare URLs, and finally the
 * structure of linked images.
 *
 * A URL whose normalized form comes back empty is left as written rather than
 * being erased from the document.
 *
 * @param markdown The markdown source.
 * @returns The markdown with media URLs normalized.
 */
export function normalizeMarkdownMediaLinks(markdown: string): string {
  const normalizeOrKeep = (rawUrl: string): string => normalizeMediaUrl(rawUrl) || rawUrl;

  let result = markdown.replace(
    MARKDOWN_LINK_RE,
    (
      full: string,
      label: string,
      openAngle: string | undefined,
      rawUrl: string,
      closeAngle: string | undefined,
    ) => {
      const normalizedUrl = normalizeOrKeep(rawUrl);
      if (normalizedUrl === rawUrl) {
        return full;
      }
      return `${label}(${openAngle ?? ""}${normalizedUrl}${closeAngle ?? ""})`;
    },
  );

  result = result.replace(
    FRONTMATTER_COVER_RE,
    (full: string, prefix: string, rawUrl: string, suffix: string) => {
      const normalizedUrl = normalizeOrKeep(rawUrl);
      if (normalizedUrl === rawUrl) {
        return full;
      }
      return `${prefix}${normalizedUrl}${suffix}`;
    },
  );

  result = result.replace(RAW_URL_RE, (rawUrl: string) => normalizeOrKeep(rawUrl));
  return normalizeLinkedImageMarkdown(result);
}

/**
 * Collects media assets referenced in free text, first from inline markdown
 * links/images and then from bare URLs. URLs whose kind cannot be inferred
 * are skipped unless `defaultKind` is given.
 *
 * @param text    The text to scan.
 * @param options `role` assigned to each asset (default `"inline"`);
 *                `defaultKind` used when inference fails; `seen` and `into`
 *                let callers share de-duplication state and the output array.
 * @returns The `into` array when provided, otherwise a new array.
 */
export function collectMediaFromText(
  text: string,
  options: {
    role?: MediaAsset["role"];
    defaultKind?: MediaAsset["kind"];
    seen?: Set<string>;
    into?: MediaAsset[];
  } = {},
): MediaAsset[] {
  const assets = options.into ?? [];
  const seen = options.seen ?? new Set<string>();
  const role = options.role ?? "inline";

  const record = (rawUrl: string, inferredKind: MediaAsset["kind"] | undefined): void => {
    const kind = inferredKind ?? options.defaultKind;
    if (!kind) {
      return;
    }
    pushMedia(assets, seen, { url: rawUrl, kind, role });
  };

  // matchAll iterates over a private copy of the regex, so the shared
  // module-level patterns never carry lastIndex state between calls.
  for (const linkMatch of text.matchAll(MARKDOWN_LINK_RE)) {
    const label = linkMatch[1] ?? "";
    const rawUrl = linkMatch[3] ?? "";
    record(rawUrl, inferMediaKindFromLabel(label, rawUrl));
  }

  for (const rawMatch of text.matchAll(RAW_URL_RE)) {
    const rawUrl = rawMatch[0] ?? "";
    record(rawUrl, inferMediaKindFromRawUrl(rawUrl));
  }

  return assets;
}

/**
 * Collects media from a single content block. Image blocks contribute their
 * own URL; html, markdown, paragraph and quote blocks are scanned as text
 * with role `"inline"`; list items are scanned with role `"attachment"`;
 * heading and code blocks are ignored.
 */
function collectMediaFromBlock(
  block: ContentBlock,
  assets: MediaAsset[],
  seen: Set<string>,
): void {
  switch (block.type) {
    case "image":
      pushMedia(assets, seen, {
        url: block.url,
        kind: "image",
        role: "inline",
        alt: block.alt,
      });
      return;
    case "html":
    case "markdown":
      collectMediaFromText(block.type === "html" ? block.html : block.markdown, {
        role: "inline",
        seen,
        into: assets,
      });
      return;
    case "paragraph":
    case "quote":
      collectMediaFromText(block.text, {
        role: "inline",
        seen,
        into: assets,
      });
      return;
    case "list":
      for (const item of block.items) {
        collectMediaFromText(item, {
          role: "attachment",
          seen,
          into: assets,
        });
      }
      return;
    case "heading":
    case "code":
      return;
  }
}

/**
 * Collects the de-duplicated media assets of an extracted document: the
 * metadata cover image first (when it is a string), then media from each
 * content block in order.
 */
export function collectMediaFromDocument(document: ExtractedDocument): MediaAsset[] {
  const assets: MediaAsset[] = [];
  const seen = new Set<string>();
  const coverImage =
    typeof document.metadata?.coverImage === "string" ? document.metadata.coverImage : undefined;

  if (coverImage) {
    pushMedia(assets, seen, {
      url: coverImage,
      kind: "image",
      role: "cover",
    });
  }

  for (const block of document.content) {
    collectMediaFromBlock(block, assets, seen);
  }

  return assets;
}

/**
 * Collects the de-duplicated media assets of a markdown document: the
 * front-matter `coverImage` first (role `"cover"`), then everything found by
 * scanning the whole text.
 */
export function collectMediaFromMarkdown(markdown: string): MediaAsset[] {
  const assets: MediaAsset[] = [];
  const seen = new Set<string>();

  const frontmatter = markdown.match(FRONTMATTER_BLOCK_RE)?.[1];
  const coverUrl = frontmatter?.match(FRONTMATTER_COVER_RE)?.[2];
  if (coverUrl) {
    pushMedia(assets, seen, {
      url: coverUrl,
      kind: "image",
      role: "cover",
    });
  }

  collectMediaFromText(markdown, { seen, into: assets });
  return assets;
}

/**
 * Rewrites media URLs in a markdown document to their local paths.
 *
 * Inline links/images and the front-matter `coverImage` are rewritten first
 * (matching either the raw or the normalized URL), then any remaining literal
 * occurrences are replaced. Entries with an empty URL are ignored, and
 * literal replacement runs longest-URL-first so that a URL which is a prefix
 * of another cannot clobber the longer one.
 *
 * @param markdown     The markdown source.
 * @param replacements URL-to-local-path pairs.
 * @returns The rewritten markdown.
 */
export function rewriteMarkdownMediaLinks(
  markdown: string,
  replacements: MediaReplacement[],
): string {
  if (replacements.length === 0) {
    return markdown;
  }

  const replacementMap = new Map<string, string>();
  const literalTargets: Array<{ needle: string; localPath: string }> = [];
  for (const { url, localPath } of replacements) {
    if (!url) {
      // An empty needle would make split("") explode the text character by character.
      continue;
    }
    replacementMap.set(url, localPath);
    literalTargets.push({ needle: url, localPath });

    const normalizedUrl = normalizeMediaUrl(url);
    if (normalizedUrl) {
      replacementMap.set(normalizedUrl, localPath);
      if (normalizedUrl !== url) {
        literalTargets.push({ needle: normalizedUrl, localPath });
      }
    }
  }

  const lookup = (rawUrl: string): string | undefined =>
    replacementMap.get(rawUrl) ?? replacementMap.get(normalizeMediaUrl(rawUrl));

  let result = markdown.replace(
    MARKDOWN_LINK_RE,
    (full: string, label: string, _openAngle: string | undefined, rawUrl: string) => {
      const replacement = lookup(rawUrl);
      if (!replacement) {
        return full;
      }
      return `${label}(${replacement})`;
    },
  );

  result = result.replace(
    FRONTMATTER_COVER_RE,
    (full: string, prefix: string, rawUrl: string, suffix: string) => {
      const replacement = lookup(rawUrl);
      if (!replacement) {
        return full;
      }
      return `${prefix}${replacement}${suffix}`;
    },
  );

  literalTargets.sort((left, right) => right.needle.length - left.needle.length);
  for (const { needle, localPath } of literalTargets) {
    result = result.split(needle).join(localPath);
  }

  return result;
}

/**
 * Resolves a file extension from a data URI's declared content type.
 *
 * @returns The extension, or `undefined` when `rawUrl` is not a data URI or
 *          its content type has no known extension.
 */
export function resolveDataUriExtension(rawUrl: string): string | undefined {
  if (!isDataUri(rawUrl)) {
    return undefined;
  }
  const contentType = normalizeContentType(extractDataUriContentType(rawUrl));
  return resolveExtensionFromContentType(contentType);
}