Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from bundle
Capture a talking-head clip with camera and microphone, transcribe it with ElevenLabs word-level timestamps, detect immediate doubled words or stutters, render
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/speech_to_speech_elevenlabs.py
1#!/usr/bin/env python32import argparse3import json4import os5from pathlib import Path6from urllib import request7from urllib.error import HTTPError8910def load_api_key(explicit: str | None, env_file: str | None) -> str:11if explicit:12return explicit1314for name in ("ELEVENLABS_API_KEY", "XI_API_KEY"):15value = os.getenv(name)16if value:17return value1819if env_file:20path = Path(env_file)21if path.exists():22for line in path.read_text().splitlines():23if line.startswith("ELEVENLABS_API_KEY=") or line.startswith("XI_API_KEY="):24return line.split("=", 1)[1].strip()2526raise SystemExit("Missing ELEVENLABS_API_KEY / XI_API_KEY")272829def resolve_voice_id(api_key: str, voice_name: str) -> str:30req = request.Request("https://api.elevenlabs.io/v1/voices", headers={"xi-api-key": api_key, "Accept": "application/json"})31with request.urlopen(req, timeout=120) as response:32payload = json.loads(response.read().decode("utf-8"))3334exact = None35fallback = None36for voice in payload.get("voices", []):37name = voice.get("name") or ""38if name.lower() == voice_name.lower():39exact = voice.get("voice_id")40break41if voice_name.lower() in name.lower():42fallback = voice.get("voice_id")43if exact:44return exact45if fallback:46return fallback47raise SystemExit(f"Could not resolve voice name: {voice_name}")484950def main() -> None:51parser = argparse.ArgumentParser()52parser.add_argument("--input-audio", required=True)53parser.add_argument("--output", required=True)54parser.add_argument("--voice-id")55parser.add_argument("--voice-name")56parser.add_argument("--api-key")57parser.add_argument("--env-file")58parser.add_argument("--model-id", default="eleven_multilingual_sts_v2")59parser.add_argument("--file-format", default="pcm_s16le_16")60parser.add_argument("--stability", type=float, default=0.5)61parser.add_argument("--similarity-boost", type=float, default=0.75)62parser.add_argument("--style", type=float, default=0.0)63parser.add_argument("--speed", type=float, default=1.0)64parser.add_argument("--seed", default="42")65args = parser.parse_args()6667api_key = load_api_key(args.api_key, args.env_file)68voice_id = args.voice_id or resolve_voice_id(api_key, args.voice_name or "")69input_audio = Path(args.input_audio).expanduser().resolve()70output_path = Path(args.output).expanduser().resolve()71output_path.parent.mkdir(parents=True, exist_ok=True)7273voice_settings = json.dumps(74{75"stability": args.stability,76"similarity_boost": args.similarity_boost,77"style": args.style,78"use_speaker_boost": True,79"speed": args.speed80}81)8283fields = {84"model_id": args.model_id,85"voice_settings": voice_settings,86"file_format": args.file_format,87"seed": str(args.seed),88"remove_background_noise": "false"89}9091boundary = "----RecordTranscribeRevoiceSTSBoundary"92body = bytearray()93for name, value in fields.items():94body.extend(f"--{boundary}\r\n".encode())95body.extend(f'Content-Disposition: form-data; name="{name}"\r\n\r\n{value}\r\n'.encode())96body.extend(f"--{boundary}\r\n".encode())97body.extend(b'Content-Disposition: form-data; name="audio"; filename="input.wav"\r\n')98body.extend(b"Content-Type: audio/wav\r\n\r\n")99body.extend(input_audio.read_bytes())100body.extend(b"\r\n")101body.extend(f"--{boundary}--\r\n".encode())102103req = request.Request(104f"https://api.elevenlabs.io/v1/speech-to-speech/{voice_id}",105data=bytes(body),106method="POST",107headers={108"xi-api-key": api_key,109"Content-Type": f"multipart/form-data; boundary={boundary}",110"Accept": "audio/mpeg"111}112)113114try:115with request.urlopen(req, timeout=1800) as response:116audio = response.read()117except HTTPError as exc:118detail = exc.read().decode("utf-8", errors="replace")119raise SystemExit(f"ElevenLabs speech-to-speech failed: HTTP {exc.code}\n{detail}")120121output_path.write_bytes(audio)122print(json.dumps({"ok": True, "output": str(output_path), "voice_id": voice_id, "bytes": len(audio)}))123124125if __name__ == "__main__":126main()127