Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Translate articles and documents between languages in three modes: quick, normal, and refined publication-quality.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/chunk.ts
import { mkdirSync, readFileSync, writeFileSync } from "fs"
import { dirname, join } from "path"
import MarkdownIt from "markdown-it"

/** Coarse classification of a top-level markdown block, used to decide whether it may be split. */
type BlockKind =
  | "heading"
  | "thematicBreak"
  | "html"
  | "code"
  | "flow"

/** A contiguous piece of markdown source together with its word count. */
interface Block {
  kind: BlockKind
  md: string
  words: number
}

/** A group of blocks that is written to one `chunk-NN.md` file. */
interface Chunk {
  blocks: Block[]
  words: number
}

export interface ChunkCliOptions {
  file: string
  maxWords: number
  outputDir: string
}

export interface ChunkResult {
  source: string
  chunks: number
  output_dir: string
  frontmatter: boolean
  words_per_chunk: number[]
}

/** Default word budget per chunk; shared by the usage text, the CLI parser, and the API. */
const DEFAULT_MAX_WORDS = 5000

/** Markdown syntax characters that are blanked out before counting words. */
const MARKDOWN_PUNCTUATION = /[#*`\[\]()>|_~-]/g

/**
 * Characters that count as one word each: Han ideographs (the ranges the
 * counter has always used) plus Japanese kana, which is likewise written
 * without spaces between words.
 */
const PER_CHARACTER_SCRIPT = /[\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff\u3040-\u30ff]/g

/**
 * A run of letters, digits, and combining marks in any script. For pure ASCII
 * input this matches exactly what `[a-zA-Z0-9]+` matches.
 */
const SPACED_WORD = /[\p{L}\p{N}\p{M}]+/gu

const parser = new MarkdownIt({ html: true })

/** Returns the one-line usage string for the CLI invoked as `command`. */
export function formatChunkUsage(command: string): string {
  return `Usage: ${command} <file> [--max-words ${DEFAULT_MAX_WORDS}] [--output-dir <dir>]`
}

/**
 * Runs the chunking CLI.
 *
 * @param args - Command-line arguments, without the runtime and script path.
 * @param command - Name shown in the usage text.
 * @returns A process exit code: 0 on success or `--help`, 1 on invalid
 *   arguments or when chunking fails (for example, an unreadable input file).
 */
export function runChunkCli(args: string[], command = "chunk.ts"): number {
  const parsed = parseChunkCliArgs(args)

  if ("help" in parsed) {
    console.log(formatChunkUsage(command))
    return 0
  }

  if ("error" in parsed) {
    console.error(parsed.error)
    console.error(formatChunkUsage(command))
    return 1
  }

  try {
    const result = chunkMarkdownFile(parsed.file, {
      maxWords: parsed.maxWords,
      outputDir: parsed.outputDir,
    })
    console.log(JSON.stringify(result))
    return 0
  } catch (error: unknown) {
    // This function's contract is an exit code, so file-system failures are
    // reported on stderr instead of escaping as an uncaught exception.
    console.error(error instanceof Error ? error.message : String(error))
    return 1
  }
}

/**
 * Splits a markdown file into word-budgeted chunk files.
 *
 * Writes `chunk-NN.md` files (and `frontmatter.md` when the file starts with a
 * frontmatter section) into a `chunks` directory located under
 * `options.outputDir`, or next to `file` when no output directory is given.
 *
 * @param file - Path of the markdown file to read (as UTF-8).
 * @param options - `maxWords` is the per-chunk word budget (default 5000);
 *   `outputDir` is the parent directory for the `chunks` directory.
 * @returns A summary of what was written.
 * @throws Whatever the underlying `fs` calls throw (missing file, permissions, ...).
 */
export function chunkMarkdownFile(
  file: string,
  options: { maxWords?: number; outputDir?: string } = {}
): ChunkResult {
  const maxWords = options.maxWords ?? DEFAULT_MAX_WORDS
  const outputDir = options.outputDir ?? ""

  const rawContent = normalizeNewlines(readFileSync(file, "utf-8"))
  const { frontmatter, body } = extractFrontmatter(rawContent)
  const chunks = buildChunks(parseMarkdown(body), maxWords)

  const dir = outputDir ? join(outputDir, "chunks") : join(dirname(file), "chunks")
  mkdirSync(dir, { recursive: true })

  if (frontmatter) {
    writeFileSync(join(dir, "frontmatter.md"), frontmatter)
  }

  chunks.forEach((chunk, index) => {
    const num = String(index + 1).padStart(2, "0")
    writeFileSync(join(dir, `chunk-${num}.md`), chunk.blocks.map(block => block.md).join("\n\n"))
  })

  return {
    source: file,
    chunks: chunks.length,
    output_dir: dir,
    frontmatter: Boolean(frontmatter),
    words_per_chunk: chunks.map(chunk => chunk.words),
  }
}

/**
 * Parses CLI arguments into options, a help request, or an error message.
 * Exactly one positional argument (the input file) is accepted.
 */
function parseChunkCliArgs(args: string[]):
  | ChunkCliOptions
  | { help: true }
  | { error: string } {
  let file = ""
  let maxWords = DEFAULT_MAX_WORDS
  let outputDir = ""

  for (let index = 0; index < args.length; index += 1) {
    const arg = args[index]

    if (arg === "-h" || arg === "--help") {
      return { help: true }
    }

    if (arg === "--max-words") {
      const value = args[index + 1]
      if (!value) return { error: "Missing value for --max-words" }
      // A fallback of 0 turns any unparsable value into the error below.
      maxWords = parsePositiveInt(value, 0)
      if (maxWords <= 0) return { error: `Invalid --max-words value: ${value}` }
      index += 1
      continue
    }

    if (arg === "--output-dir") {
      const value = args[index + 1]
      if (!value) return { error: "Missing value for --output-dir" }
      outputDir = value
      index += 1
      continue
    }

    if (arg.startsWith("-")) {
      return { error: `Unknown option: ${arg}` }
    }

    if (!file) {
      file = arg
      continue
    }

    return { error: `Unexpected positional argument: ${arg}` }
  }

  if (!file) {
    return { error: "Missing input file" }
  }

  return { file, maxWords, outputDir }
}

/**
 * Parses a strictly positive decimal integer.
 *
 * Only an optional leading `+` followed by digits is accepted (surrounding
 * whitespace is ignored). `Number.parseInt` alone would accept inputs such as
 * "5k" or "3.7" by silently discarding everything after the leading digits.
 *
 * @returns The parsed value, or `fallback` when `value` is missing, malformed,
 *   zero, or not a safe integer.
 */
function parsePositiveInt(value: string | undefined, fallback: number): number {
  if (!value) return fallback
  const trimmed = value.trim()
  if (!/^\+?\d+$/.test(trimmed)) return fallback
  const parsed = Number.parseInt(trimmed, 10)
  return Number.isSafeInteger(parsed) && parsed > 0 ? parsed : fallback
}

/** Strips a leading byte-order mark and converts CRLF / lone CR line endings to LF. */
function normalizeNewlines(text: string): string {
  return text.replace(/^\uFEFF/, "").replace(/\r\n?/g, "\n")
}

/** Removes leading and trailing newline characters (other whitespace is kept). */
function trimBoundaryBlankLines(text: string): string {
  return text.replace(/^\n+/, "").replace(/\n+$/, "")
}

/**
 * Separates a leading frontmatter section from the document body.
 *
 * Frontmatter is recognised only when the first line is exactly `---` and a
 * later line is exactly `---` or `...`; otherwise the whole content is the body.
 * The returned frontmatter includes both delimiter lines.
 */
function extractFrontmatter(content: string): { frontmatter: string; body: string } {
  const lines = content.split("\n")
  if (lines[0] !== "---") {
    return { frontmatter: "", body: content }
  }

  for (let index = 1; index < lines.length; index += 1) {
    if (lines[index] === "---" || lines[index] === "...") {
      return {
        frontmatter: lines.slice(0, index + 1).join("\n"),
        body: lines.slice(index + 1).join("\n").replace(/^\n+/, ""),
      }
    }
  }

  // Opening delimiter without a closing one: treat everything as body.
  return { frontmatter: "", body: content }
}

/**
 * Splits markdown source into top-level blocks using markdown-it's line maps.
 *
 * Each block's `md` is the original source text of the lines the token covers,
 * so the output is a re-slicing of the input rather than a re-rendering.
 * Non-blank lines that no top-level token covers (e.g. link reference
 * definitions, for which markdown-it emits no token) are kept as `flow`
 * blocks so that no source content is lost.
 */
function parseMarkdown(content: string): Block[] {
  if (!content.trim()) return []

  const lines = content.split("\n")
  const tokens = parser.parse(content, {})
  const blocks: Block[] = []

  // Index of the first source line not yet covered by an emitted block.
  let cursor = 0

  const pushUncovered = (from: number, to: number): void => {
    const md = trimBoundaryBlankLines(lines.slice(from, to).join("\n"))
    // Whitespace-only gaps are just separators between blocks.
    if (md.trim()) blocks.push(makeBlock("flow", md))
  }

  for (const token of tokens) {
    // Only top-level opening or self-contained tokens carry a usable line map.
    if (!token.map || token.level !== 0) continue
    if (token.nesting !== 1 && token.nesting !== 0) continue

    const [startLine, endLine] = token.map
    if (startLine > cursor) pushUncovered(cursor, startLine)
    cursor = Math.max(cursor, endLine)

    const md = trimBoundaryBlankLines(lines.slice(startLine, endLine).join("\n"))
    if (!md) continue

    blocks.push(makeBlock(tokenTypeToBlockKind(token.type), md))
  }

  // Trailing uncovered lines; when the parser produced no mapped tokens at
  // all, this emits the whole body as a single flow block.
  if (cursor < lines.length) pushUncovered(cursor, lines.length)

  return blocks
}

/** Maps a markdown-it token type to a block kind; unrecognised types are `flow`. */
function tokenTypeToBlockKind(tokenType: string): BlockKind {
  if (tokenType === "heading_open") return "heading"
  if (tokenType === "hr") return "thematicBreak"
  if (tokenType === "html_block") return "html"
  if (tokenType === "fence" || tokenType === "code_block") return "code"
  return "flow"
}

/** Builds a block from source text, trimming boundary newlines and counting its words. */
function makeBlock(kind: BlockKind, md: string): Block {
  return {
    kind,
    md: trimBoundaryBlankLines(md),
    words: countWords(md),
  }
}

/**
 * Packs blocks into chunks of at most `maxWordsPerChunk` words where possible.
 *
 * Blocks are first grouped into heading-led sections. A section that fits the
 * budget is kept together as one unit; a section that does not is broken back
 * into its blocks, with oversized `flow` blocks split by line. Units are then
 * packed greedily in document order. A single unit that cannot be split
 * further may still exceed the budget on its own.
 */
function buildChunks(blocks: Block[], maxWordsPerChunk: number): Chunk[] {
  const sections = splitIntoSections(blocks)
  const normalizedBlocks: Block[] = []

  for (const section of sections) {
    const sectionWords = section.reduce((sum, block) => sum + block.words, 0)
    if (sectionWords <= maxWordsPerChunk) {
      // Merge the section into one block so packing never separates it.
      normalizedBlocks.push(makeBlock("flow", section.map(block => block.md).join("\n\n")))
      continue
    }

    for (const block of section) {
      normalizedBlocks.push(...splitOversizedBlock(block, maxWordsPerChunk))
    }
  }

  const chunks: Chunk[] = []
  let currentBlocks: Block[] = []
  let currentWords = 0

  for (const block of normalizedBlocks) {
    // Start a new chunk when adding this block would exceed the budget,
    // unless the current chunk is still empty.
    if (currentWords + block.words > maxWordsPerChunk && currentBlocks.length > 0) {
      chunks.push({ blocks: currentBlocks, words: currentWords })
      currentBlocks = [block]
      currentWords = block.words
      continue
    }

    currentBlocks.push(block)
    currentWords += block.words
  }

  if (currentBlocks.length > 0) {
    chunks.push({ blocks: currentBlocks, words: currentWords })
  }

  return chunks
}

/**
 * Groups blocks into sections, starting a new section at every heading.
 * Blocks before the first heading form their own section.
 */
function splitIntoSections(blocks: Block[]): Block[][] {
  const sections: Block[][] = []
  let current: Block[] = []

  for (const block of blocks) {
    if (block.kind === "heading" && current.length > 0) {
      sections.push(current)
      current = [block]
      continue
    }

    current.push(block)
  }

  if (current.length > 0) {
    sections.push(current)
  }

  return sections
}

/**
 * Splits a `flow` block that exceeds the word budget into smaller blocks at
 * line boundaries. Headings, thematic breaks, HTML, and code blocks are never
 * split, and neither is a block consisting of a single line; those are
 * returned unchanged even when over budget.
 */
function splitOversizedBlock(block: Block, maxWordsPerChunk: number): Block[] {
  if (block.words <= maxWordsPerChunk) return [block]

  if (
    block.kind === "heading"
    || block.kind === "thematicBreak"
    || block.kind === "html"
    || block.kind === "code"
  ) {
    return [block]
  }

  const lines = block.md.split("\n")
  if (lines.length <= 1) {
    return [block]
  }

  const splitBlocks: Block[] = []
  let buffer: string[] = []
  let bufferWords = 0

  for (const line of lines) {
    const lineWords = countWords(line)
    // Flush before the line that would push the buffer over budget.
    if (bufferWords + lineWords > maxWordsPerChunk && buffer.length > 0) {
      splitBlocks.push(makeBlock(block.kind, buffer.join("\n")))
      buffer = [line]
      bufferWords = lineWords
      continue
    }

    buffer.push(line)
    bufferWords += lineWords
  }

  if (buffer.length > 0) {
    splitBlocks.push(makeBlock(block.kind, buffer.join("\n")))
  }

  return splitBlocks
}

/**
 * Estimates the number of words in a piece of markdown.
 *
 * Markdown punctuation is blanked out first. Han ideographs and kana count as
 * one word per character; everything else counts as one word per run of
 * letters, digits, and combining marks in any script, so that Cyrillic, Greek,
 * Hangul, or accented Latin text is counted rather than ignored or split at
 * each non-ASCII letter.
 */
function countWords(text: string): number {
  const cleaned = text.replace(MARKDOWN_PUNCTUATION, " ")
  const perCharacter = cleaned.match(PER_CHARACTER_SCRIPT)
  // Blank out per-character scripts so they separate, rather than join,
  // the neighbouring space-delimited words.
  const spaced = cleaned.replace(PER_CHARACTER_SCRIPT, " ").match(SPACED_WORD)
  return (perCharacter?.length ?? 0) + (spaced?.length ?? 0)
}

// Run the CLI only when the runtime reports this module as the entry point.
if (import.meta.main) {
  process.exit(runChunkCli(process.argv.slice(2), process.argv[1] ?? "chunk.ts"))
}