Source from bundle
Capture a talking-head clip with camera and microphone, transcribe it with ElevenLabs word-level timestamps, detect immediate doubled words or stutters, render
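The script in this listing handles the transcription step and the sentence/pause artifacts; the doubled-word and stutter detection mentioned above is not part of this file. As a rough illustration only, a detector over the same ElevenLabs word-level payload (a "words" list of items with "type", "text", "start", "end") might look like the sketch below; the function name, the normalization rule, and the input filename are assumptions for the example, not part of the skill.

import json
import string
from pathlib import Path


def find_doubled_words(words: list[dict]) -> list[dict]:
    # Flag immediate repeats ("I I think", "the the") in an ElevenLabs word list.
    doubles = []
    previous = None
    for item in words:
        if item.get("type") != "word":
            continue
        norm = item.get("text", "").strip(string.punctuation).lower()
        if previous is not None and norm and norm == previous["norm"]:
            doubles.append(
                {
                    "word": item["text"],
                    "first_start": previous["item"]["start"],
                    "second_end": item["end"]
                }
            )
        previous = {"norm": norm, "item": item}
    return doubles


# Hypothetical input path following the script's <stem>.elevenlabs.transcript.json naming.
payload = json.loads(Path("clip.elevenlabs.transcript.json").read_text())
print(json.dumps(find_doubled_words(payload.get("words", [])), ensure_ascii=False, indent=2))

Punctuation is stripped and case is folded before comparing, so "The, the" still counts as a double; deciding whether a repeat like "had had" is deliberate is left to review and sits outside this sketch.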
Files
scripts/transcribe_with_elevenlabs.py
#!/usr/bin/env python3
import argparse
import json
import os
import subprocess
import tempfile
from pathlib import Path
from urllib import request
from urllib.error import HTTPError


def load_api_key(explicit: str | None, env_file: str | None) -> str:
    # Resolve the ElevenLabs key from the CLI flag, the environment, or an env file.
    if explicit:
        return explicit

    for name in ("ELEVENLABS_API_KEY", "XI_API_KEY"):
        value = os.getenv(name)
        if value:
            return value

    if env_file:
        path = Path(env_file)
        if path.exists():
            for line in path.read_text().splitlines():
                if line.startswith("ELEVENLABS_API_KEY=") or line.startswith("XI_API_KEY="):
                    return line.split("=", 1)[1].strip()

    raise SystemExit("Missing ELEVENLABS_API_KEY / XI_API_KEY")


def ensure_wav(input_path: Path) -> tuple[Path, tempfile.TemporaryDirectory | None]:
    # Convert non-WAV input to 16 kHz mono WAV with ffmpeg in a temporary directory.
    if input_path.suffix.lower() == ".wav":
        return input_path, None

    tmpdir = tempfile.TemporaryDirectory(prefix="record-transcribe-revoice-")
    wav_path = Path(tmpdir.name) / f"{input_path.stem}.wav"
    subprocess.run(
        [
            "ffmpeg",
            "-hide_banner",
            "-loglevel",
            "error",
            "-i",
            str(input_path),
            "-vn",
            "-ac",
            "1",
            "-ar",
            "16000",
            str(wav_path),
            "-y"
        ],
        check=True
    )
    return wav_path, tmpdir


def build_artifacts(payload: dict, transcript_path: Path) -> dict:
    # Derive clean text, sentence spans, and long pauses from the word-level payload.
    words = payload.get("words", [])
    clean_text = payload.get("text", "").strip()

    sentences = []
    current = []
    for item in words:
        if item.get("type") != "word":
            continue
        current.append(item)
        if item.get("text", "").endswith((".", "?", "!")):
            sentences.append(current)
            current = []
    if current:
        sentences.append(current)

    sentence_rows = [
        {
            "index": index + 1,
            "start": sentence[0]["start"],
            "end": sentence[-1]["end"],
            "text": " ".join(word["text"] for word in sentence)
        }
        for index, sentence in enumerate(sentences)
    ]

    # Record gaps of 0.35 s or more between consecutive words as pauses.
    pauses = []
    previous_word = None
    for item in words:
        if item.get("type") != "word":
            continue
        if previous_word is not None:
            gap = round(item["start"] - previous_word["end"], 3)
            if gap >= 0.35:
                pauses.append(
                    {
                        "after": previous_word["text"],
                        "before": item["text"],
                        "gap_sec": gap,
                        "from": previous_word["end"],
                        "to": item["start"]
                    }
                )
        previous_word = item

    clean_txt_path = transcript_path.with_suffix(".clean.txt")
    clean_txt_path.write_text(clean_text + "\n")
    sentences_path = transcript_path.with_suffix(".sentences.json")
    sentences_path.write_text(json.dumps(sentence_rows, ensure_ascii=False, indent=2))
    pauses_path = transcript_path.with_suffix(".pauses.json")
    pauses_path.write_text(json.dumps(pauses, ensure_ascii=False, indent=2))

    return {
        "clean_text": str(clean_txt_path),
        "sentences": str(sentences_path),
        "pauses": str(pauses_path)
    }


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True)
    parser.add_argument("--out-dir", default=".")
    parser.add_argument("--api-key")
    parser.add_argument("--env-file")
    parser.add_argument("--model-id", default="scribe_v2")
    parser.add_argument("--language-code", default="en")
    parser.add_argument("--timestamps-granularity", default="word")
    args = parser.parse_args()

    api_key = load_api_key(args.api_key, args.env_file)
    input_path = Path(args.input).expanduser().resolve()
    out_dir = Path(args.out_dir).expanduser().resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    wav_path, tmpdir = ensure_wav(input_path)

    # Build the multipart/form-data body by hand so no third-party HTTP client is needed.
    boundary = "----RecordTranscribeRevoiceBoundary"
    fields = {
        "model_id": args.model_id,
        "timestamps_granularity": args.timestamps_granularity,
        "language_code": args.language_code,
        "tag_audio_events": "true",
        "diarize": "false"
    }

    body = bytearray()
    for name, value in fields.items():
        body.extend(f"--{boundary}\r\n".encode())
        body.extend(f'Content-Disposition: form-data; name="{name}"\r\n\r\n{value}\r\n'.encode())
    body.extend(f"--{boundary}\r\n".encode())
    body.extend(b'Content-Disposition: form-data; name="file"; filename="input.wav"\r\n')
    body.extend(b"Content-Type: audio/wav\r\n\r\n")
    body.extend(wav_path.read_bytes())
    body.extend(b"\r\n")
    body.extend(f"--{boundary}--\r\n".encode())

    req = request.Request(
        "https://api.elevenlabs.io/v1/speech-to-text",
        data=bytes(body),
        method="POST",
        headers={
            "xi-api-key": api_key,
            "Content-Type": f"multipart/form-data; boundary={boundary}",
            "Accept": "application/json"
        }
    )

    try:
        with request.urlopen(req, timeout=1800) as response:
            payload = json.loads(response.read().decode("utf-8"))
    except HTTPError as exc:
        detail = exc.read().decode("utf-8", errors="replace")
        raise SystemExit(f"ElevenLabs transcription failed: HTTP {exc.code}\n{detail}")
    finally:
        if tmpdir is not None:
            tmpdir.cleanup()

    transcript_path = out_dir / f"{input_path.stem}.elevenlabs.transcript.json"
    transcript_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2))
    artifacts = build_artifacts(payload, transcript_path)

    print(
        json.dumps(
            {
                "ok": True,
                "transcript": str(transcript_path),
                "text_preview": payload.get("text", "")[:280],
                **artifacts
            },
            ensure_ascii=False
        )
    )


if __name__ == "__main__":
    main()
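A usage sketch, assuming ffmpeg is on PATH, the key is exported as ELEVENLABS_API_KEY, and the recording lives at recordings/clip.mp4 (a hypothetical path):

python scripts/transcribe_with_elevenlabs.py \
  --input recordings/clip.mp4 \
  --out-dir artifacts

With these flags the script converts the clip to 16 kHz mono WAV, posts it to the ElevenLabs speech-to-text endpoint, writes clip.elevenlabs.transcript.json plus the .clean.txt, .sentences.json, and .pauses.json artifacts into artifacts/, and prints a one-line JSON summary to stdout.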