Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Fetch any URL via Chrome CDP and convert the rendered page to clean markdown with YouTube transcript support.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/lib/commands/convert.ts
import { mkdir, writeFile } from "node:fs/promises";
import { dirname, join } from "node:path";
import { createInterface } from "node:readline";
import { connectChrome, type ChromeConnection } from "../browser/chrome-launcher";
import { CdpClient } from "../browser/cdp-client";
import { detectInteractionGate } from "../browser/interaction-gates";
import { NetworkJournal } from "../browser/network-journal";
import { BrowserSession } from "../browser/session";
import { genericAdapter, resolveAdapter } from "../adapters";
import { isXSessionReady } from "../adapters/x/session";
import type { ExtractedDocument } from "../extract/document";
import { renderMarkdown } from "../extract/markdown-renderer";
import { downloadMediaAssets } from "../media/default-downloader";
import { rewriteMarkdownMediaLinks } from "../media/markdown-media";
import { createLogger } from "../utils/logger";
import { normalizeUrl } from "../utils/url";
import type {
  Adapter,
  AdapterContext,
  AdapterLoginInfo,
  LoginState,
  MediaAsset,
  WaitForInteractionRequest,
} from "../adapters/types";

/** How the command reacts when the page needs a human: fail/report, wait on demand, or always wait. */
export type WaitMode = "none" | "interaction" | "force";

/** Shape of the content written to stdout and to `--output`. */
export type OutputFormat = "markdown" | "json";

/** Options accepted by {@link runConvertCommand}. */
export interface ConvertCommandOptions {
  /** Target URL. Required at runtime; `runConvertCommand` throws when it is missing. */
  url?: string;
  /** File to save the formatted result to. Required when `downloadMedia` is set. */
  output?: string;
  format: OutputFormat;
  /** Passed to `resolveAdapter` as the second argument. */
  adapter?: string;
  /** When set, debug artifacts are written here and loggers are created with debug enabled. */
  debugDir?: string;
  cdpUrl?: string;
  browserPath?: string;
  chromeProfileDir?: string;
  /** Ignored (treated as `false`) whenever the runtime is opened interactively. */
  headless: boolean;
  downloadMedia: boolean;
  mediaDir?: string;
  waitMode: WaitMode;
  /** Default deadline for interaction waits; an interaction request may override it. */
  interactionTimeoutMs: number;
  /** Default polling period for interaction waits; an interaction request may override it. */
  interactionPollIntervalMs: number;
  timeoutMs: number;
}

/** Everything opened by `openRuntime` that `closeRuntime` must release. */
interface RuntimeResources {
  chrome: ChromeConnection;
  cdp: CdpClient;
  browser: BrowserSession;
  network: NetworkJournal;
  /** Whether this runtime was opened in interactive (non-headless) mode. */
  interactive: boolean;
}

/** Point-in-time page/login state compared across polls in force-wait mode. */
interface ForceWaitSnapshot {
  url: string;
  hasGate: boolean;
  /** `"unavailable"` when the adapter has no `checkLogin`. */
  loginState: LoginState | "unavailable";
  sessionReady: boolean;
}

/** Payload emitted for a successful extraction. */
interface SuccessfulConvertOutput {
  adapter: string;
  status: "ok";
  login?: AdapterLoginInfo;
  media: MediaAsset[];
  downloads: Awaited<ReturnType<typeof downloadMediaAssets>> | null;
  document: ExtractedDocument;
  markdown: string;
}

/** Payload emitted (JSON only) when extraction is blocked on manual interaction. */
interface InteractionRequiredOutput {
  adapter: string;
  status: "needs_interaction";
  login?: AdapterLoginInfo;
  interaction: WaitForInteractionRequest;
}

/** Result type of `Adapter.process`, derived so helpers stay in sync with the adapter contract. */
type AdapterResult = Awaited<ReturnType<Adapter["process"]>>;

/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/** Reads the `sessionReady` flag of a force-wait snapshot. */
function isForceWaitSessionReady(snapshot: ForceWaitSnapshot): boolean {
  return snapshot.sessionReady;
}

/**
 * Returns `true` only for a browser this process launched, when the
 * interaction was a login for provider `"x"`.
 */
export function shouldKeepBrowserOpenAfterInteraction(options: {
  launched: boolean;
  interaction: Pick<WaitForInteractionRequest, "kind" | "provider">;
}): boolean {
  return options.launched && options.interaction.kind === "login" && options.interaction.provider === "x";
}

/**
 * Decides whether force-wait mode has observed enough progress to resume
 * extraction without the user pressing Enter.
 *
 * @param initial - Snapshot taken when the wait started.
 * @param current - Snapshot taken on the current poll.
 * @returns `true` when any of the progress rules below matches.
 */
export function shouldAutoContinueForceWait(
  initial: ForceWaitSnapshot,
  current: ForceWaitSnapshot,
): boolean {
  // An interaction gate that was present at the start has disappeared.
  if (initial.hasGate && !current.hasGate) {
    return true;
  }

  // Login state left "logged_out" and the session reports ready.
  if (initial.loginState === "logged_out" && current.loginState !== "logged_out" && isForceWaitSessionReady(current)) {
    return true;
  }

  // Login state became "logged_in" from anything else and the session reports ready.
  if (initial.loginState !== "logged_in" && current.loginState === "logged_in" && isForceWaitSessionReady(current)) {
    return true;
  }

  // The page navigated somewhere else that is neither gated nor logged out.
  if (
    current.url !== initial.url &&
    !current.hasGate &&
    current.loginState !== "logged_out" &&
    isForceWaitSessionReady(current)
  ) {
    return true;
  }

  return false;
}

/**
 * Writes `content` to `path` as UTF-8, creating the parent directory first.
 *
 * The parent is resolved with `path.dirname`, so platform-native separators
 * are honoured (a manual search for "/" misses `out\file.md` on Windows).
 */
async function writeOutput(path: string, content: string): Promise<void> {
  const directory = dirname(path);
  // A directory that is its own parent is "." or a filesystem root; nothing to create.
  if (directory !== dirname(directory)) {
    await mkdir(directory, { recursive: true });
  }
  await writeFile(path, content, "utf8");
}

/**
 * Writes `document.json`, `markdown.md`, `page.html` and `network.json`
 * into `debugDir`, creating the directory if needed.
 */
async function writeDebugArtifacts(
  debugDir: string,
  document: ExtractedDocument,
  markdown: string,
  browser: BrowserSession,
  network: NetworkJournal,
): Promise<void> {
  await mkdir(debugDir, { recursive: true });

  // A failed HTML capture degrades to an empty page.html instead of aborting the dump.
  const html = await browser.getHTML().catch(() => "");
  const networkDump = await network.toJSON({ includeBodies: true });

  await Promise.all([
    writeFile(join(debugDir, "document.json"), JSON.stringify(document, null, 2), "utf8"),
    writeFile(join(debugDir, "markdown.md"), markdown, "utf8"),
    writeFile(join(debugDir, "page.html"), html, "utf8"),
    writeFile(join(debugDir, "network.json"), JSON.stringify(networkDump, null, 2), "utf8"),
  ]);
}

/**
 * Connects to Chrome, opens a CDP client and browser session, and starts a
 * network journal on the session's target.
 *
 * @param options - Supplies the CDP URL, browser path, profile dir and headless flag.
 * @param interactive - When `true`, headless is forced off and the window is brought to front.
 * @param debugEnabled - Passed to `createLogger`.
 */
async function openRuntime(
  options: ConvertCommandOptions,
  interactive: boolean,
  debugEnabled: boolean,
): Promise<RuntimeResources> {
  const logger = createLogger(debugEnabled);
  if (interactive) {
    logger.info("Opening Chrome in interactive mode.");
  }
  const chrome = await connectChrome({
    cdpUrl: options.cdpUrl,
    browserPath: options.browserPath,
    profileDir: options.chromeProfileDir,
    headless: interactive ? false : options.headless,
    logger,
  });

  const cdp = await CdpClient.connect(chrome.browserWsUrl);
  const browser = await BrowserSession.open(cdp, { interactive });
  if (interactive) {
    // Best effort: failing to focus the window must not abort the run.
    await browser.bringToFront().catch(() => {});
  }
  const network = new NetworkJournal(browser.targetSession, logger);
  await network.start();

  return {
    chrome,
    cdp,
    browser,
    network,
    interactive,
  };
}

/**
 * Releases every resource of a runtime. Each step is best effort so that one
 * failure never prevents the remaining resources from being closed.
 */
async function closeRuntime(runtime: RuntimeResources | null | undefined): Promise<void> {
  if (!runtime) {
    return;
  }
  try {
    runtime.network.stop();
  } catch {
    // Stopping the journal must not block closing the browser, CDP client and Chrome.
  }
  await runtime.browser.close().catch(() => {});
  await runtime.cdp.close().catch(() => {});
  await runtime.chrome.close().catch(() => {});
}

/**
 * Reports whether the session is usable after an interaction. Only provider
 * `"x"` is checked (via `isXSessionReady`); every other provider is treated
 * as ready, and a failing check counts as not ready.
 */
async function isInteractionSessionReady(
  context: AdapterContext,
  interaction: WaitForInteractionRequest,
): Promise<boolean> {
  if (interaction.provider !== "x") {
    return true;
  }
  return await isXSessionReady(context).catch(() => false);
}

/**
 * Returns an interactive runtime: the given one if it already is
 * interactive, otherwise closes it and opens a fresh interactive runtime.
 */
async function reopenInteractiveRuntime(
  runtime: RuntimeResources,
  options: ConvertCommandOptions,
  debugEnabled: boolean,
): Promise<RuntimeResources> {
  if (runtime.interactive) {
    return runtime;
  }

  await closeRuntime(runtime);
  return openRuntime(options, true, debugEnabled);
}

/**
 * Captures the current URL, gate presence, login state and session readiness.
 * Every probe is best effort; failures fall back to a neutral value.
 */
async function captureForceWaitSnapshot(
  adapter: Adapter,
  context: AdapterContext,
): Promise<ForceWaitSnapshot> {
  const [gate, url, login] = await Promise.all([
    detectInteractionGate(context.browser).catch(() => null),
    // Fall back to the requested URL when the live URL cannot be read.
    context.browser.getURL().catch(() => context.input.url.toString()),
    adapter.checkLogin?.(context).catch(() => ({
      provider: adapter.name,
      state: "unknown" as const,
    })),
  ]);

  return {
    url,
    hasGate: Boolean(gate),
    loginState: login?.state ?? "unavailable",
    sessionReady: adapter.name === "x" ? await isXSessionReady(context).catch(() => false) : true,
  };
}

/**
 * Blocks until force-wait mode may resume: either the user presses Enter on
 * stdin or `shouldAutoContinueForceWait` detects progress.
 *
 * @throws Error when `options.interactionTimeoutMs` elapses first.
 */
async function waitForForceResume(
  adapter: Adapter,
  context: AdapterContext,
  options: ConvertCommandOptions,
): Promise<void> {
  if (context.interactive) {
    await context.browser.bringToFront().catch(() => {});
  }

  const prompt =
    "Chrome is ready. Complete any manual login or verification. Extraction will continue automatically after it detects progress, or press Enter to continue immediately.";
  context.log.info(prompt);

  const rl = createInterface({
    input: process.stdin,
    output: process.stderr,
  });

  let manualContinue = false;
  let closed = false;
  const closeReadline = (): void => {
    if (!closed) {
      closed = true;
      rl.close();
    }
  };

  rl.once("line", () => {
    manualContinue = true;
    closeReadline();
  });

  try {
    // Taken inside the try so the readline interface is released even if this throws.
    const initial = await captureForceWaitSnapshot(adapter, context);
    const startedAt = Date.now();

    while (Date.now() - startedAt < options.interactionTimeoutMs) {
      if (manualContinue) {
        return;
      }

      const current = await captureForceWaitSnapshot(adapter, context);
      if (shouldAutoContinueForceWait(initial, current)) {
        return;
      }

      await sleep(options.interactionPollIntervalMs);
    }
  } finally {
    closeReadline();
  }

  throw new Error("Timed out waiting for force-mode interaction to complete");
}

/**
 * Polls until the requested interaction is complete.
 *
 * For `kind: "login"` the wait ends when the adapter reports `logged_in`, or
 * when no gate is detected and the state is anything but `logged_out` (both
 * also require a ready session). For other kinds it ends as soon as no gate
 * is detected.
 *
 * @returns The login info observed when the wait ended.
 * @throws Error when the timeout (request override or option default) elapses.
 */
async function waitForInteraction(
  adapter: Adapter,
  context: AdapterContext,
  interaction: WaitForInteractionRequest,
  options: ConvertCommandOptions,
): Promise<AdapterLoginInfo> {
  const timeoutMs = interaction.timeoutMs ?? options.interactionTimeoutMs;
  const pollIntervalMs = interaction.pollIntervalMs ?? options.interactionPollIntervalMs;
  if (context.interactive) {
    await context.browser.bringToFront().catch(() => {});
  }
  context.log.info(interaction.prompt);

  const startedAt = Date.now();
  let lastLogin: AdapterLoginInfo | null = null;

  while (Date.now() - startedAt < timeoutMs) {
    if (interaction.kind === "login" && adapter.checkLogin) {
      lastLogin = await adapter.checkLogin(context);
      if (lastLogin.state === "logged_in" && await isInteractionSessionReady(context, interaction)) {
        return lastLogin;
      }
    }

    const gate = await detectInteractionGate(context.browser);
    if (!gate) {
      if (interaction.kind !== "login") {
        return lastLogin ?? {
          provider: interaction.provider,
          state: "unknown",
          reason: `${interaction.provider} challenge cleared`,
        };
      }

      // Login requested but the adapter cannot verify it: a missing gate is the only signal.
      if (!adapter.checkLogin) {
        return {
          provider: interaction.provider,
          state: "unknown",
        };
      }

      lastLogin = await adapter.checkLogin(context);
      if (lastLogin.state !== "logged_out" && await isInteractionSessionReady(context, interaction)) {
        return lastLogin;
      }
    }
    await sleep(pollIntervalMs);
  }

  const reason = lastLogin?.reason ? ` (${lastLogin.reason})` : "";
  throw new Error(`Timed out waiting for ${interaction.provider} interaction${reason}`);
}

/**
 * Serialises a command result.
 *
 * @returns Pretty-printed JSON for `"json"`, otherwise the payload's markdown.
 * @throws Error when markdown is requested for a payload whose status is not `"ok"`.
 */
export function formatOutputContent(
  format: OutputFormat,
  payload: SuccessfulConvertOutput | InteractionRequiredOutput,
): string {
  if (format === "json") {
    return JSON.stringify(payload, null, 2);
  }

  if (payload.status !== "ok") {
    throw new Error("Markdown output is only available for successful extraction results");
  }

  return payload.markdown;
}

/** Writes `content` to stdout, appending a newline when it does not already end with one. */
function printOutput(content: string): void {
  process.stdout.write(content);
  if (!content.endsWith("\n")) {
    process.stdout.write("\n");
  }
}

/**
 * Runs the adapter and, when it yields no document, checks the page for an
 * interaction gate and reports it as a `needs_interaction` result instead.
 */
async function processWithGateDetection(adapter: Adapter, context: AdapterContext): Promise<AdapterResult> {
  const result = await adapter.process(context);
  if (result.status !== "no_document") {
    return result;
  }

  const interaction = await detectInteractionGate(context.browser);
  if (!interaction) {
    return result;
  }

  return {
    status: "needs_interaction",
    interaction,
    login: result.login,
  };
}

/**
 * Entry point of the convert command: opens a browser runtime, resolves an
 * adapter for the URL, waits for manual interaction according to
 * `options.waitMode`, extracts a document, renders markdown, optionally
 * downloads media, then saves and/or prints the result.
 *
 * The runtime is always closed on exit, after a best-effort cookie export
 * when the adapter supports it.
 *
 * @throws Error when the URL is missing, when `downloadMedia` is set without
 *   `output`, when interaction is required but `waitMode` is `"none"` and the
 *   format is not JSON, or when no document could be extracted.
 */
export async function runConvertCommand(options: ConvertCommandOptions): Promise<void> {
  if (!options.url) {
    throw new Error("URL is required");
  }
  if (options.downloadMedia && !options.output) {
    throw new Error("--download-media requires --output so media paths can be rewritten relative to the saved output file");
  }

  const url = normalizeUrl(options.url);
  const debugEnabled = Boolean(options.debugDir);
  let runtime = await openRuntime(options, options.waitMode !== "none", debugEnabled);
  const logger = createLogger(debugEnabled);
  let adapter: Adapter | null = null;
  let context: AdapterContext | null = null;

  // Single source of truth for the adapter context; rebuilt whenever the runtime is swapped.
  const buildContext = (active: RuntimeResources): AdapterContext => ({
    input: { url },
    browser: active.browser,
    network: active.network,
    cdp: active.cdp,
    log: logger,
    outputFormat: options.format,
    timeoutMs: options.timeoutMs,
    interactive: active.interactive,
    downloadMedia: options.downloadMedia,
  });

  try {
    adapter = resolveAdapter({ url }, options.adapter);
    context = buildContext(runtime);

    if (adapter.restoreCookies) {
      const restored = await adapter.restoreCookies(context, runtime.chrome.profileDir).catch(() => false);
      if (restored) logger.info(`Restored ${adapter.name} session cookies from sidecar.`);
    }

    // "interaction" mode: check login up front and wait for the user before extracting.
    if (options.waitMode === "interaction" && adapter.checkLogin) {
      await context.browser.goto(url.toString(), options.timeoutMs).catch(() => {});
      const preLogin = await adapter.checkLogin(context);
      if (preLogin.state !== "logged_in") {
        await waitForInteraction(adapter, context, {
          type: "wait_for_interaction",
          kind: "login",
          provider: preLogin.provider ?? adapter.name,
          prompt: `Please sign in to ${adapter.name === "x" ? "X" : adapter.name} in the opened Chrome window. Extraction will continue automatically once login is detected.`,
          reason: preLogin.reason ?? `Not logged in to ${adapter.name}`,
          requiresVisibleBrowser: true,
        }, options);
      }
    }

    // "force" mode: always pause for the user, regardless of what the page looks like.
    if (options.waitMode === "force") {
      await context.browser.goto(url.toString(), options.timeoutMs).catch(() => {});
      await waitForForceResume(adapter, context, options);
    }

    let result = await processWithGateDetection(adapter, context);

    while (result.status === "needs_interaction") {
      if (options.waitMode === "none") {
        if (options.format === "json") {
          printOutput(
            formatOutputContent(options.format, {
              adapter: adapter.name,
              status: result.status,
              login: result.login,
              interaction: result.interaction,
            }),
          );
          return;
        }

        throw new Error(`${adapter.name} requires manual interaction. Re-run with --wait-for interaction to continue after completing it.`);
      }

      if (result.interaction.requiresVisibleBrowser !== false) {
        runtime = await reopenInteractiveRuntime(runtime, options, debugEnabled);
      }

      // The runtime may have been replaced above, so the context must point at the live one.
      context = buildContext(runtime);

      await context.browser.goto(url.toString(), options.timeoutMs).catch(() => {});
      await waitForInteraction(adapter, context, result.interaction, options);
      result = await processWithGateDetection(adapter, context);
    }

    let document: ExtractedDocument | null = result.status === "ok" ? result.document : null;
    let media: MediaAsset[] = result.status === "ok" ? (result.media ?? []) : [];
    const login = result.login;
    let mediaAdapter = adapter;

    if (!document && adapter.name !== genericAdapter.name && result.status === "no_document") {
      logger.info(`Adapter ${adapter.name} returned no structured document; falling back to generic extraction`);
      const fallback = await genericAdapter.process(context);
      if (fallback.status === "ok") {
        document = fallback.document;
        media = fallback.media ?? [];
        // Media must be downloaded by the adapter that actually produced it.
        mediaAdapter = genericAdapter;
      }
    }

    if (!document) {
      throw new Error("Failed to extract a document from the target URL");
    }

    document.requestedUrl ??= url.toString();

    let markdown = renderMarkdown(document);
    let downloadResult:
      | Awaited<ReturnType<typeof downloadMediaAssets>>
      | null = null;

    if (options.downloadMedia && options.output) {
      const downloadRequest = {
        media,
        outputPath: options.output,
        mediaDir: options.mediaDir,
        log: logger,
      };
      downloadResult = mediaAdapter.downloadMedia
        ? await mediaAdapter.downloadMedia(downloadRequest)
        : await downloadMediaAssets(downloadRequest);

      markdown = rewriteMarkdownMediaLinks(markdown, downloadResult.replacements);
      if (downloadResult.downloadedImages > 0 || downloadResult.downloadedVideos > 0) {
        logger.info(
          `Downloaded ${downloadResult.downloadedImages} images and ${downloadResult.downloadedVideos} videos`,
        );
      }
    }

    // Built once and reused for both the saved file and stdout so the two can never diverge.
    const payload: SuccessfulConvertOutput = {
      adapter: document.adapter ?? adapter.name,
      status: "ok",
      login,
      media,
      downloads: downloadResult,
      document,
      markdown,
    };
    const content = formatOutputContent(options.format, payload);

    if (options.output) {
      await writeOutput(options.output, content);
      logger.info(`Saved ${options.format} to ${options.output}`);
    }

    if (options.debugDir) {
      await writeDebugArtifacts(options.debugDir, document, markdown, runtime.browser, runtime.network);
      logger.info(`Wrote debug artifacts to ${options.debugDir}`);
    }

    printOutput(content);
  } finally {
    if (adapter?.exportCookies && context) {
      await adapter.exportCookies(context, runtime.chrome.profileDir).catch(() => {});
    }
    await closeRuntime(runtime);
  }
}