Capture a talking-head clip with camera and microphone, transcribe it with ElevenLabs word-level timestamps, detect immediately doubled words or stutters, and render a destuttered preview with ffmpeg by trimming out the repeated words.
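The transcript side of that pipeline is a JSON payload with a top-level words list, where each spoken word carries a type of "word" plus text, start, and end times in seconds. This shape is inferred from how the script below reads the payload; the values here are illustrative only.

# Illustrative word-level transcript payload (invented timings).
# Note the doubled "so": punctuation is stripped before comparison,
# so "So," and "so" count as the same token.
transcript = {
    "words": [
        {"type": "word", "text": "So,", "start": 0.12, "end": 0.38},
        {"type": "spacing", "text": " ", "start": 0.38, "end": 0.41},
        {"type": "word", "text": "so", "start": 0.41, "end": 0.66},
        {"type": "word", "text": "today", "start": 0.74, "end": 1.10},
    ]
}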
scripts/build_destutter_preview.py
#!/usr/bin/env python3
import argparse
import json
import subprocess
from pathlib import Path


def normalize_token(text: str) -> str:
    # Strip surrounding punctuation and lowercase so "So," matches "so".
    return text.strip().strip(".,!?;:'\"").lower()


def build_duplicate_intervals(words: list[dict], threshold: float) -> list[dict]:
    # Consider only spoken words; transcripts also carry spacing entries.
    spoken_words = [item for item in words if item.get("type") == "word"]
    intervals = []

    for index in range(len(spoken_words) - 1):
        first = spoken_words[index]
        second = spoken_words[index + 1]
        if normalize_token(first["text"]) != normalize_token(second["text"]):
            continue
        # Only treat a repeat as a stutter when it follows almost immediately.
        if second["start"] - first["end"] > threshold:
            continue

        # Cut from the end of the first word to the start of the word after
        # the duplicate, so the repeated word and the pause around it go away.
        next_start = spoken_words[index + 2]["start"] if index + 2 < len(spoken_words) else second["end"]
        intervals.append(
            {
                "token": normalize_token(first["text"]),
                "remove_from": first["end"],
                "remove_to": next_start,
                "first_start": first["start"],
                "second_start": second["start"],
            }
        )

    # Merge overlapping removal windows (e.g. triple repeats) into one.
    merged = []
    for interval in intervals:
        if not merged or interval["remove_from"] > merged[-1]["remove_to"]:
            merged.append(interval.copy())
        else:
            merged[-1]["remove_to"] = max(merged[-1]["remove_to"], interval["remove_to"])
    return merged


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--media", required=True)
    parser.add_argument("--transcript", required=True)
    parser.add_argument("--output", required=True)
    parser.add_argument("--duplicate-gap-threshold", type=float, default=0.75)
    args = parser.parse_args()

    media_path = Path(args.media).expanduser().resolve()
    transcript_path = Path(args.transcript).expanduser().resolve()
    output_path = Path(args.output).expanduser().resolve()
    output_path.parent.mkdir(parents=True, exist_ok=True)

    payload = json.loads(transcript_path.read_text())
    intervals = build_duplicate_intervals(payload.get("words", []), args.duplicate_gap_threshold)

    # Write the detected duplicates next to the output for inspection.
    duplicates_path = output_path.with_suffix(".duplicates.json")
    duplicates_path.write_text(json.dumps(intervals, ensure_ascii=False, indent=2))

    # End the preview half a second after the last spoken word.
    last_word_end = 0.0
    for item in payload.get("words", []):
        if item.get("type") == "word":
            last_word_end = max(last_word_end, item["end"])
    total_end = round(last_word_end + 0.5, 3)

    # The keep segments are the complement of the removal windows.
    keep_segments = []
    cursor = 0.0
    for interval in intervals:
        if interval["remove_from"] > cursor:
            keep_segments.append((round(cursor, 3), round(interval["remove_from"], 3)))
        cursor = interval["remove_to"]
    if cursor < total_end:
        keep_segments.append((round(cursor, 3), total_end))

    if not keep_segments:
        raise SystemExit("No keep segments produced")

    # Trim each keep segment from the source, reset timestamps, then concat.
    filter_parts = []
    concat_inputs = []
    for index, (start, end) in enumerate(keep_segments):
        filter_parts.append(f"[0:v]trim=start={start}:end={end},setpts=PTS-STARTPTS[v{index}]")
        filter_parts.append(f"[0:a]atrim=start={start}:end={end},asetpts=PTS-STARTPTS[a{index}]")
        concat_inputs.append(f"[v{index}][a{index}]")
    filter_parts.append(f"{''.join(concat_inputs)}concat=n={len(keep_segments)}:v=1:a=1[v][a]")

    subprocess.run(
        [
            "ffmpeg",
            "-hide_banner",
            "-loglevel",
            "error",
            "-y",  # overwrite an existing output instead of prompting
            "-i",
            str(media_path),
            "-filter_complex",
            ";".join(filter_parts),
            "-map",
            "[v]",
            "-map",
            "[a]",
            "-c:v",
            "libx264",
            "-preset",
            "veryfast",
            "-crf",
            "20",
            "-c:a",
            "aac",
            "-b:a",
            "160k",
            str(output_path),
        ],
        check=True,
    )

    print(
        json.dumps(
            {
                "ok": True,
                "output": str(output_path),
                "duplicates": str(duplicates_path),
                "removed_intervals": intervals,
            },
            ensure_ascii=False,
        )
    )


if __name__ == "__main__":
    main()
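To sanity-check the duplicate detection without rendering anything, the function can be exercised on a toy word list. A minimal sketch, assuming scripts/ is on the import path so the module name build_destutter_preview resolves; the timings are invented.

from build_destutter_preview import build_duplicate_intervals

# "I I think" with an immediately doubled "I" (invented timings).
words = [
    {"type": "word", "text": "I", "start": 0.00, "end": 0.20},
    {"type": "word", "text": "I", "start": 0.25, "end": 0.45},
    {"type": "word", "text": "think", "start": 0.50, "end": 0.90},
]

intervals = build_duplicate_intervals(words, threshold=0.75)
print(intervals)
# One removal window is expected: from the end of the first "I" (0.2)
# to the start of "think" (0.5), cutting the repeated word and its pause.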