Source from bundle
Capture a talking-head clip with camera and microphone, transcribe it with ElevenLabs word-level timestamps, detect immediate doubled words or stutters, render
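The script in this listing handles the transcription step and the sentence/pause artifacts; the doubled-word and stutter detection mentioned above is not part of this file. As a rough illustration only, a detector over the same ElevenLabs word-level payload (a "words" list of items with "type", "text", "start", "end") might look like the sketch below; the function name, the normalization rule, and the input filename are assumptions for the example, not part of the skill.

import json
import string
from pathlib import Path


def find_doubled_words(words: list[dict]) -> list[dict]:
    # Flag immediate repeats ("I I think", "the the") in an ElevenLabs word list.
    doubles = []
    previous = None
    for item in words:
        if item.get("type") != "word":
            continue
        norm = item.get("text", "").strip(string.punctuation).lower()
        if previous is not None and norm and norm == previous["norm"]:
            doubles.append(
                {
                    "word": item["text"],
                    "first_start": previous["item"]["start"],
                    "second_end": item["end"]
                }
            )
        previous = {"norm": norm, "item": item}
    return doubles


# Hypothetical input path following the script's <stem>.elevenlabs.transcript.json naming.
payload = json.loads(Path("clip.elevenlabs.transcript.json").read_text())
print(json.dumps(find_doubled_words(payload.get("words", [])), ensure_ascii=False, indent=2))

Punctuation is stripped and case is folded before comparing, so "The, the" still counts as a double; deciding whether a repeat like "had had" is deliberate is left to review and sits outside this sketch.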
Files
scripts/transcribe_with_elevenlabs.py
#!/usr/bin/env python3
import argparse
import json
import os
import subprocess
import tempfile
from pathlib import Path
from urllib import request
from urllib.error import HTTPError


def load_api_key(explicit: str | None, env_file: str | None) -> str:
    # Resolve the ElevenLabs key from the CLI flag, the environment, or an env file.
    if explicit:
        return explicit

    for name in ("ELEVENLABS_API_KEY", "XI_API_KEY"):
        value = os.getenv(name)
        if value:
            return value

    if env_file:
        path = Path(env_file)
        if path.exists():
            for line in path.read_text().splitlines():
                if line.startswith("ELEVENLABS_API_KEY=") or line.startswith("XI_API_KEY="):
                    return line.split("=", 1)[1].strip()

    raise SystemExit("Missing ELEVENLABS_API_KEY / XI_API_KEY")


def ensure_wav(input_path: Path) -> tuple[Path, tempfile.TemporaryDirectory | None]:
    # Convert non-WAV input to 16 kHz mono WAV with ffmpeg in a temporary directory.
    if input_path.suffix.lower() == ".wav":
        return input_path, None

    tmpdir = tempfile.TemporaryDirectory(prefix="record-transcribe-revoice-")
    wav_path = Path(tmpdir.name) / f"{input_path.stem}.wav"
    subprocess.run(
        [
            "ffmpeg",
            "-hide_banner",
            "-loglevel",
            "error",
            "-i",
            str(input_path),
            "-vn",
            "-ac",
            "1",
            "-ar",
            "16000",
            str(wav_path),
            "-y"
        ],
        check=True
    )
    return wav_path, tmpdir


def build_artifacts(payload: dict, transcript_path: Path) -> dict:
    # Derive clean text, sentence spans, and long pauses from the word-level payload.
    words = payload.get("words", [])
    clean_text = payload.get("text", "").strip()

    sentences = []
    current = []
    for item in words:
        if item.get("type") != "word":
            continue
        current.append(item)
        if item.get("text", "").endswith((".", "?", "!")):
            sentences.append(current)
            current = []
    if current:
        sentences.append(current)

    sentence_rows = [
        {
            "index": index + 1,
            "start": sentence[0]["start"],
            "end": sentence[-1]["end"],
            "text": " ".join(word["text"] for word in sentence)
        }
        for index, sentence in enumerate(sentences)
    ]

    # Record gaps of 0.35 s or more between consecutive words as pauses.
    pauses = []
    previous_word = None
    for item in words:
        if item.get("type") != "word":
            continue
        if previous_word is not None:
            gap = round(item["start"] - previous_word["end"], 3)
            if gap >= 0.35:
                pauses.append(
                    {
                        "after": previous_word["text"],
                        "before": item["text"],
                        "gap_sec": gap,
                        "from": previous_word["end"],
                        "to": item["start"]
                    }
                )
        previous_word = item

    clean_txt_path = transcript_path.with_suffix(".clean.txt")
    clean_txt_path.write_text(clean_text + "\n")
    sentences_path = transcript_path.with_suffix(".sentences.json")
    sentences_path.write_text(json.dumps(sentence_rows, ensure_ascii=False, indent=2))
    pauses_path = transcript_path.with_suffix(".pauses.json")
    pauses_path.write_text(json.dumps(pauses, ensure_ascii=False, indent=2))

    return {
        "clean_text": str(clean_txt_path),
        "sentences": str(sentences_path),
        "pauses": str(pauses_path)
    }


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True)
    parser.add_argument("--out-dir", default=".")
    parser.add_argument("--api-key")
    parser.add_argument("--env-file")
    parser.add_argument("--model-id", default="scribe_v2")
    parser.add_argument("--language-code", default="en")
    parser.add_argument("--timestamps-granularity", default="word")
    args = parser.parse_args()

    api_key = load_api_key(args.api_key, args.env_file)
    input_path = Path(args.input).expanduser().resolve()
    out_dir = Path(args.out_dir).expanduser().resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    wav_path, tmpdir = ensure_wav(input_path)

    # Build the multipart/form-data body by hand so no third-party HTTP client is needed.
    boundary = "----RecordTranscribeRevoiceBoundary"
    fields = {
        "model_id": args.model_id,
        "timestamps_granularity": args.timestamps_granularity,
        "language_code": args.language_code,
        "tag_audio_events": "true",
        "diarize": "false"
    }

    body = bytearray()
    for name, value in fields.items():
        body.extend(f"--{boundary}\r\n".encode())
        body.extend(f'Content-Disposition: form-data; name="{name}"\r\n\r\n{value}\r\n'.encode())
    body.extend(f"--{boundary}\r\n".encode())
    body.extend(b'Content-Disposition: form-data; name="file"; filename="input.wav"\r\n')
    body.extend(b"Content-Type: audio/wav\r\n\r\n")
    body.extend(wav_path.read_bytes())
    body.extend(b"\r\n")
    body.extend(f"--{boundary}--\r\n".encode())

    req = request.Request(
        "https://api.elevenlabs.io/v1/speech-to-text",
        data=bytes(body),
        method="POST",
        headers={
            "xi-api-key": api_key,
            "Content-Type": f"multipart/form-data; boundary={boundary}",
            "Accept": "application/json"
        }
    )

    try:
        with request.urlopen(req, timeout=1800) as response:
            payload = json.loads(response.read().decode("utf-8"))
    except HTTPError as exc:
        detail = exc.read().decode("utf-8", errors="replace")
        raise SystemExit(f"ElevenLabs transcription failed: HTTP {exc.code}\n{detail}")
    finally:
        if tmpdir is not None:
            tmpdir.cleanup()

    transcript_path = out_dir / f"{input_path.stem}.elevenlabs.transcript.json"
    transcript_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2))
    artifacts = build_artifacts(payload, transcript_path)

    print(
        json.dumps(
            {
                "ok": True,
                "transcript": str(transcript_path),
                "text_preview": payload.get("text", "")[:280],
                **artifacts
            },
            ensure_ascii=False
        )
    )


if __name__ == "__main__":
    main()
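A usage sketch, assuming ffmpeg is on PATH, the key is exported as ELEVENLABS_API_KEY, and the recording lives at recordings/clip.mp4 (a hypothetical path):

python scripts/transcribe_with_elevenlabs.py \
  --input recordings/clip.mp4 \
  --out-dir artifacts

With these flags the script converts the clip to 16 kHz mono WAV, posts it to the ElevenLabs speech-to-text endpoint, writes clip.elevenlabs.transcript.json plus the .clean.txt, .sentences.json, and .pauses.json artifacts into artifacts/, and prints a one-line JSON summary to stdout.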