Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Fetch any URL via Chrome CDP and convert the rendered page to clean markdown with YouTube transcript support.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/lib/adapters/youtube/transcript.ts
import type { ExtractedDocument } from "../../extract/document";
import { detectInteractionGate } from "../../browser/interaction-gates";
import {
  buildYouTubeThumbnailCandidates,
  parseYouTubeDescriptionChapters,
  renderYouTubeTranscriptMarkdown,
  type YouTubeChapter,
  type YouTubeTranscriptSegment,
} from "./utils";

/** Maximum number of characters kept in a generated summary. */
const SUMMARY_MAX_LENGTH = 240;

/** Number of leading transcript segments joined when the description yields no summary. */
const SUMMARY_SEGMENT_COUNT = 8;

/**
 * Result of the in-page caption lookup: the selected caption track plus the
 * video metadata read from the page's player response.
 */
interface CaptionInfo {
  /** `baseUrl` of the selected caption track. */
  captionUrl: string;
  /** `languageCode` of the selected caption track. */
  language: string;
  /** `kind` of the selected track (e.g. `"asr"`), or `"manual"` when the track has none. */
  kind: string;
  /** Human-readable label for every track, formatted `Name [code]` or `Name [code, auto]`. */
  available: string[];
  title?: string;
  author?: string;
  authorUrl?: string;
  channelId?: string;
  description?: string;
  publishedAt?: string;
  viewCount?: number;
  durationSeconds?: number;
  keywords: string[];
  category?: string;
  isLiveContent?: boolean;
  /** Thumbnail URLs sorted by pixel area, largest first. */
  coverImages: string[];
}

/**
 * Upgrades an `http:` URL to `https:` and returns its normalized string form.
 *
 * @param url - URL to normalize; may be absent.
 * @returns `undefined` for an empty/absent input, the normalized URL when it
 *   parses, or the input unchanged when it is not a parseable absolute URL.
 */
function normalizeUrl(url: string | undefined): string | undefined {
  if (!url) {
    return undefined;
  }

  try {
    const parsed = new URL(url);
    if (parsed.protocol === "http:") {
      parsed.protocol = "https:";
    }
    return parsed.toString();
  } catch {
    // Not an absolute URL: hand the value back untouched rather than dropping it.
    return url;
  }
}

/**
 * Builds a short summary for the document.
 *
 * Prefers the first non-empty description line that is not a bare link; when
 * there is none, falls back to the start of the transcript.
 *
 * @param description - Video description, if any.
 * @param segments - Transcript segments in playback order.
 * @returns A summary of at most {@link SUMMARY_MAX_LENGTH} characters, or
 *   `undefined` when neither source yields any text.
 */
function buildSummary(description: string | undefined, segments: YouTubeTranscriptSegment[]): string | undefined {
  const descriptionSummary = description
    ?.replace(/\r\n/g, "\n")
    .split("\n")
    .map((line) => line.trim())
    .find((line) => line && !/^https?:\/\//i.test(line));

  if (descriptionSummary) {
    return descriptionSummary.slice(0, SUMMARY_MAX_LENGTH);
  }

  const transcriptSummary = segments
    .slice(0, SUMMARY_SEGMENT_COUNT)
    .map((segment) => segment.text)
    .join(" ")
    .slice(0, SUMMARY_MAX_LENGTH)
    .trim();

  return transcriptSummary || undefined;
}

/**
 * Checks whether a thumbnail URL can be fetched.
 *
 * Issues a `HEAD` request first; if the server answers 405, retries with a
 * one-byte ranged `GET`.
 *
 * @param url - Thumbnail URL to probe.
 * @returns `true` when either request succeeds; `false` on any other status or
 *   on a network error.
 */
async function canFetchThumbnail(url: string): Promise<boolean> {
  try {
    const response = await fetch(url, { method: "HEAD", redirect: "follow" });
    if (response.ok) {
      return true;
    }

    if (response.status === 405) {
      const fallbackResponse = await fetch(url, {
        method: "GET",
        headers: { Range: "bytes=0-0" },
        redirect: "follow",
      });
      return fallbackResponse.ok;
    }
  } catch {
    return false;
  }

  return false;
}

/**
 * Picks the first thumbnail candidate that can actually be fetched.
 *
 * Candidates are probed one at a time in the order produced by
 * `buildYouTubeThumbnailCandidates`, so the earliest reachable one wins.
 *
 * @param videoId - YouTube video id.
 * @param coverImages - Thumbnail URLs discovered on the page.
 * @returns The first reachable candidate, otherwise the first candidate
 *   unverified, or `undefined` when there are no candidates.
 */
async function resolveBestCoverImage(videoId: string, coverImages: string[]): Promise<string | undefined> {
  const candidates = buildYouTubeThumbnailCandidates(videoId, coverImages);

  for (const candidate of candidates) {
    if (await canFetchThumbnail(candidate)) {
      return candidate;
    }
  }

  return candidates[0];
}

/**
 * Builds the script evaluated in the page to collect video metadata and pick a
 * caption track.
 *
 * The script reads `window.ytInitialPlayerResponse` for metadata, then POSTs to
 * the page-relative `/youtubei/v1/player` endpoint and selects the first
 * caption track whose `kind` is not `'asr'`, falling back to the first track.
 * It resolves to a `CaptionInfo`-shaped object or to `{ error: string }`.
 *
 * @param videoId - YouTube video id, embedded as a JSON string literal.
 * @returns JavaScript source of an immediately-invoked async function.
 */
function buildCaptionInfoScript(videoId: string): string {
  return `
(async () => {
  function readText(value) {
    if (!value) return undefined;
    if (typeof value === 'string') {
      const text = value.trim();
      return text || undefined;
    }
    if (typeof value.simpleText === 'string') {
      const text = value.simpleText.trim();
      return text || undefined;
    }
    if (Array.isArray(value.runs)) {
      const text = value.runs
        .map((run) => typeof run?.text === 'string' ? run.text : '')
        .join('')
        .trim();
      return text || undefined;
    }
    return undefined;
  }

  function parsePositiveInteger(value) {
    if (typeof value === 'number' && Number.isFinite(value) && value >= 0) {
      return Math.floor(value);
    }
    if (typeof value !== 'string') {
      return undefined;
    }
    const normalized = value.replace(/[^\\d]/g, '');
    if (!normalized) {
      return undefined;
    }
    const parsed = Number.parseInt(normalized, 10);
    return Number.isFinite(parsed) ? parsed : undefined;
  }

  const apiKey = window.ytcfg?.data_?.INNERTUBE_API_KEY;
  const playerResponse = window.ytInitialPlayerResponse;
  const videoDetails = playerResponse?.videoDetails || {};
  const microformat = playerResponse?.microformat?.playerMicroformatRenderer || {};
  const title =
    videoDetails.title ||
    readText(microformat.title) ||
    document.title.replace(/ - YouTube$/, '').trim();
  const author =
    videoDetails.author ||
    microformat.ownerChannelName ||
    document.querySelector('link[itemprop="name"]')?.getAttribute('content') ||
    undefined;
  const authorUrl =
    microformat.ownerProfileUrl ||
    (typeof videoDetails.channelId === 'string' && videoDetails.channelId
      ? 'https://www.youtube.com/channel/' + videoDetails.channelId
      : undefined);
  const description =
    readText(microformat.description) ||
    (typeof videoDetails.shortDescription === 'string' ? videoDetails.shortDescription.trim() : undefined);
  const keywords = Array.isArray(videoDetails.keywords)
    ? videoDetails.keywords.filter((keyword) => typeof keyword === 'string' && keyword.trim())
    : [];
  const thumbnails = [
    ...(Array.isArray(videoDetails.thumbnail?.thumbnails) ? videoDetails.thumbnail.thumbnails : []),
    ...(Array.isArray(microformat.thumbnail?.thumbnails) ? microformat.thumbnail.thumbnails : []),
  ]
    .filter((thumbnail) => typeof thumbnail?.url === 'string' && thumbnail.url)
    .sort((left, right) => ((right?.width || 0) * (right?.height || 0)) - ((left?.width || 0) * (left?.height || 0)))
    .map((thumbnail) => thumbnail.url);

  if (!apiKey) {
    return { error: 'INNERTUBE_API_KEY not found on page' };
  }

  const response = await fetch('/youtubei/v1/player?key=' + encodeURIComponent(apiKey) + '&prettyPrint=false', {
    method: 'POST',
    credentials: 'include',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      context: { client: { clientName: 'ANDROID', clientVersion: '20.10.38' } },
      videoId: ${JSON.stringify(videoId)}
    })
  });

  if (!response.ok) {
    return { error: 'InnerTube player API returned HTTP ' + response.status };
  }

  const data = await response.json();
  const renderer = data.captions?.playerCaptionsTracklistRenderer;
  if (!renderer?.captionTracks?.length) {
    return { error: 'No captions available for this video' };
  }

  const tracks = renderer.captionTracks;
  const track = tracks.find((item) => item.kind !== 'asr') || tracks[0];
  if (typeof track?.baseUrl !== 'string' || !track.baseUrl) {
    return { error: 'Selected caption track has no baseUrl' };
  }

  return {
    captionUrl: track.baseUrl,
    language: track.languageCode,
    kind: track.kind || 'manual',
    available: tracks.map((item) => {
      const languageLabel = readText(item.name) || item.languageCode;
      return item.kind === 'asr'
        ? languageLabel + ' [' + item.languageCode + ', auto]'
        : languageLabel + ' [' + item.languageCode + ']';
    }),
    title,
    author,
    authorUrl,
    channelId: typeof videoDetails.channelId === 'string' ? videoDetails.channelId : undefined,
    description,
    publishedAt:
      (typeof microformat.publishDate === 'string' && microformat.publishDate) ||
      (typeof microformat.uploadDate === 'string' && microformat.uploadDate) ||
      document.querySelector('meta[itemprop="datePublished"]')?.getAttribute('content') ||
      undefined,
    viewCount: parsePositiveInteger(videoDetails.viewCount) ?? parsePositiveInteger(microformat.viewCount),
    durationSeconds: parsePositiveInteger(videoDetails.lengthSeconds),
    keywords,
    category: typeof microformat.category === 'string' ? microformat.category : undefined,
    isLiveContent: Boolean(videoDetails.isLiveContent || microformat.isLiveContent),
    coverImages: thumbnails,
  };
})()
`;
}

/**
 * Builds the script evaluated in the page to download and parse the caption XML.
 *
 * Two element shapes are handled: `<p t=".." d="..">`, whose values are divided
 * by 1000, and `<text start=".." dur="..">`, whose values are used as-is. Nested
 * markup is stripped, entities are decoded and newlines become spaces. The
 * script resolves to an array of `{ start, end, text }` or to
 * `{ error: string }`.
 *
 * @param captionUrl - Caption track URL, embedded as a JSON string literal.
 * @returns JavaScript source of an immediately-invoked async function.
 */
function buildTranscriptScript(captionUrl: string): string {
  return `
(async () => {
  const response = await fetch(${JSON.stringify(captionUrl)});
  if (!response.ok) {
    return { error: 'Caption fetch returned HTTP ' + response.status };
  }
  const xml = await response.text();
  if (!xml) {
    return { error: 'Caption XML is empty' };
  }

  function getAttr(tag, name) {
    const needle = name + '="';
    let index = tag.indexOf(needle);
    // Only accept a match that starts the attribute list or follows whitespace,
    // so a short name cannot match the tail of a longer attribute name.
    while (index > 0 && tag.charAt(index - 1).trim() !== '') {
      index = tag.indexOf(needle, index + 1);
    }
    if (index === -1) return '';
    const valueStart = index + needle.length;
    const valueEnd = tag.indexOf('"', valueStart);
    if (valueEnd === -1) return '';
    return tag.substring(valueStart, valueEnd);
  }

  // Entity names are assembled at runtime so the literals stay intact even if
  // this source is passed through an HTML-decoding step.
  // NOTE(review): the ampersand entity is decoded first, so doubly-escaped text
  // is fully unescaped in one pass - presumably intentional; confirm.
  const entityTable = [
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['#39', "'"],
  ];

  function decodeEntities(value) {
    let decoded = value;
    for (const [name, character] of entityTable) {
      decoded = decoded.replaceAll('&' + name + ';', character);
    }
    return decoded;
  }

  const marker = xml.includes('<p t="') ? '<p ' : '<text ';
  const endMarker = marker === '<p ' ? '</p>' : '</text>';
  const results = [];
  let position = 0;

  while (true) {
    const tagStart = xml.indexOf(marker, position);
    if (tagStart === -1) break;
    let contentStart = xml.indexOf('>', tagStart);
    if (contentStart === -1) break;
    contentStart += 1;
    const tagEnd = xml.indexOf(endMarker, contentStart);
    if (tagEnd === -1) break;

    const attrString = xml.substring(tagStart + marker.length, contentStart - 1);
    const content = xml.substring(contentStart, tagEnd);
    const start = marker === '<p '
      ? (parseFloat(getAttr(attrString, 't')) || 0) / 1000
      : (parseFloat(getAttr(attrString, 'start')) || 0);
    const duration = marker === '<p '
      ? (parseFloat(getAttr(attrString, 'd')) || 0) / 1000
      : (parseFloat(getAttr(attrString, 'dur')) || 0);
    const text = decodeEntities(content.replace(/<[^>]+>/g, '')).split('\\n').join(' ').trim();
    if (text) {
      results.push({ start, end: start + duration, text });
    }

    position = tagEnd + endMarker.length;
  }

  if (results.length === 0) {
    return { error: 'Parsed 0 transcript segments' };
  }
  return results;
})()
`;
}

/**
 * Script evaluated in the page to read chapter markers from
 * `window.ytInitialData`. Resolves to an array of `{ title, time }`, where
 * `time` is `timeRangeStartMillis` floored to whole seconds.
 */
const CHAPTERS_SCRIPT = `
(() => {
  const data = window.ytInitialData;
  const markers = data?.playerOverlays?.playerOverlayRenderer
    ?.decoratedPlayerBarRenderer?.decoratedPlayerBarRenderer
    ?.playerBar?.multiMarkersPlayerBarRenderer?.markersMap || [];
  const results = [];

  for (const marker of markers) {
    const chapters = marker?.value?.chapters;
    if (!Array.isArray(chapters)) continue;
    for (const chapter of chapters) {
      const renderer = chapter?.chapterRenderer;
      const title = renderer?.title?.simpleText;
      const timeRangeStartMillis = renderer?.timeRangeStartMillis;
      if (title && typeof timeRangeStartMillis === 'number') {
        results.push({ title, time: Math.floor(timeRangeStartMillis / 1000) });
      }
    }
  }

  return results;
})()
`;

/**
 * Loads a YouTube watch page and turns its captions into a markdown document.
 *
 * Steps: navigate to the watch URL, bail out if an interaction gate is
 * detected, wait (best effort) for the network to go idle, look up caption
 * tracks and metadata in the page, download and parse the chosen track, gather
 * chapters (player markers first, description chapters otherwise), then render
 * markdown and assemble the document.
 *
 * @param context - Adapter processing context; this function uses its
 *   `browser`, `network`, `log` and `timeoutMs` members.
 * @param videoId - YouTube video id.
 * @returns The extracted document, or `null` when an interaction gate is
 *   present, no caption track can be resolved, no transcript segments are
 *   parsed, or the rendered markdown is empty.
 * @remarks Rejections from `context.browser.goto`, `getURL` and the first two
 *   `evaluate` calls are not caught here and propagate to the caller.
 */
export async function extractYouTubeTranscriptDocument(
  context: Parameters<import("../types").Adapter["process"]>[0],
  videoId: string,
): Promise<ExtractedDocument | null> {
  const videoUrl = `https://www.youtube.com/watch?v=${videoId}`;
  await context.browser.goto(videoUrl, context.timeoutMs);

  const interaction = await detectInteractionGate(context.browser);
  if (interaction) {
    context.log.debug(`Interaction gate detected on YouTube: ${interaction.provider}`);
    return null;
  }

  try {
    await context.network.waitForIdle({
      idleMs: 1_000,
      timeoutMs: Math.min(context.timeoutMs, 8_000),
    });
  } catch {
    // Best effort: the page globals may already be populated, so keep going.
    context.log.debug("Network idle timed out on YouTube load.");
  }

  const captionInfo = await context.browser.evaluate<CaptionInfo | { error: string }>(
    buildCaptionInfoScript(videoId),
  );

  // Guard before the `in` check below, which throws on null or primitives.
  if (!captionInfo || typeof captionInfo !== "object") {
    context.log.debug("YouTube transcript unavailable: caption lookup returned no result");
    return null;
  }

  if ("error" in captionInfo) {
    context.log.debug(`YouTube transcript unavailable: ${captionInfo.error}`);
    return null;
  }

  const segments = await context.browser.evaluate<YouTubeTranscriptSegment[] | { error: string }>(
    buildTranscriptScript(captionInfo.captionUrl),
  );

  if (!Array.isArray(segments) || segments.length === 0) {
    if (!Array.isArray(segments) && segments?.error) {
      context.log.debug(`YouTube caption parsing failed: ${segments.error}`);
    }
    context.log.debug("Parsed no YouTube transcript segments.");
    return null;
  }

  // Chapters are optional, so a failing or malformed evaluation degrades to none.
  const evaluatedChapters = await context.browser
    .evaluate<YouTubeChapter[]>(CHAPTERS_SCRIPT)
    .catch((): YouTubeChapter[] => []);
  const extractedChapters = Array.isArray(evaluatedChapters) ? evaluatedChapters : [];

  const descriptionChapters = parseYouTubeDescriptionChapters(captionInfo.description);
  const chapters = extractedChapters.length > 0 ? extractedChapters : descriptionChapters;
  const markdown = renderYouTubeTranscriptMarkdown({
    description: captionInfo.description,
    segments,
    chapters,
  });

  if (!markdown) {
    return null;
  }

  const pageUrl = await context.browser.getURL();
  const coverImage = await resolveBestCoverImage(videoId, captionInfo.coverImages);
  const summary = buildSummary(captionInfo.description, segments);

  return {
    url: pageUrl,
    canonicalUrl: pageUrl,
    title: captionInfo.title || "YouTube Transcript",
    author: captionInfo.author,
    publishedAt: captionInfo.publishedAt,
    siteName: "YouTube",
    summary,
    adapter: "youtube",
    metadata: {
      kind: "youtube/transcript",
      videoId,
      authorUrl: normalizeUrl(captionInfo.authorUrl),
      channelId: captionInfo.channelId,
      coverImage,
      description: captionInfo.description,
      durationSeconds: captionInfo.durationSeconds,
      language: captionInfo.language,
      captionKind: captionInfo.kind,
      availableLanguages: captionInfo.available,
      viewCount: captionInfo.viewCount,
      keywords: captionInfo.keywords,
      category: captionInfo.category,
      isLiveContent: captionInfo.isLiveContent,
      chapterCount: chapters.length,
    },
    content: [{ type: "markdown", markdown }],
  };
}