Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from bundle
Build or revise a reusable FFmpeg timeline for short-form video editing. Use when the user wants an agent-editable API for trimming clips, cropping, fitting to a target frame, adjusting playback speed, and adding text overlays or karaoke captions.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/video_timeline_editor/domain/services.py
"""Domain services for the video timeline editor.

Stateless helpers that merge style presets, validate crop geometry,
compute clip timings (optionally snapped to transcript word boundaries),
and render karaoke captions as ASS subtitle documents.  No I/O happens
here; FFmpeg command assembly lives elsewhere.
"""

from __future__ import annotations

from .model import CaptionTrack, Clip, ClipTiming, CropBox, MediaInfo, WordToken


# Presets for FFmpeg drawtext overlays; keys map directly onto drawtext options.
DEFAULT_OVERLAY_STYLES = {
    "default": {
        "fontsize": 42,
        "fontcolor": "white",
        "borderw": 2,
        "bordercolor": "black",
        "x": "(w-text_w)/2",  # horizontally centered
        "y": "h-180",  # near the bottom edge of the frame
    },
    "header": {
        "fontsize": 42,
        "fontcolor": "white",
        "box": 1,
        # NOTE(review): the original value was mangled by an email-obfuscation
        # filter ("[email protected]"), which is not a valid FFmpeg color.
        # Restored to a semi-transparent black box -- confirm the intended alpha.
        "boxcolor": "black@0.4",
        "boxborderw": 16,
        "x": "(w-text_w)/2",
        "y": "120",
    },
}

# Presets for ASS caption styles; field names mirror the [V4+ Styles] format
# fields consumed by build_karaoke_ass.
DEFAULT_CAPTION_STYLES = {
    "karaoke": {
        "fontname": "DejaVu Sans",
        "fontsize": 52,
        "primary_color": "&H00FFFFFF",  # AABBGGRR: opaque white fill
        "secondary_color": "&H000095FF",  # pre-highlight color for \kf sweep
        "outline_color": "&H00000000",
        "back_color": "&H96000000",  # semi-transparent black
        "bold": -1,  # ASS convention: -1 = true, 0 = false
        "italic": 0,
        "underline": 0,
        "strikeout": 0,
        "scale_x": 100,
        "scale_y": 100,
        "spacing": 0,
        "angle": 0,
        "border_style": 1,
        "outline": 3,
        "shadow": 1,
        "alignment": 2,  # numpad layout: bottom-center
        "margin_l": 10,
        "margin_r": 10,
        "margin_v": 350,
    }
}


def merge_styles(base_styles: dict[str, dict], extra_styles: dict[str, dict] | None) -> dict[str, dict]:
    """Merge ``extra_styles`` over ``base_styles`` without mutating either.

    Unknown style names in ``extra_styles`` are added verbatim; names
    present in both are merged key-by-key with the extra values winning.

    Returns a fresh dict of fresh per-style dicts.
    """
    merged = {name: dict(values) for name, values in base_styles.items()}
    for name, values in (extra_styles or {}).items():
        current = dict(merged.get(name, {}))
        current.update(values)
        merged[name] = current
    return merged


def validate_crop(crop: CropBox) -> None:
    """Raise ``ValueError`` unless ``crop`` is a normalized box inside the frame.

    All four fields (x, y, width, height) must lie in [0, 1], and the box
    must not extend past the right or bottom edge of the source frame.
    """
    for key in ("x", "y", "width", "height"):
        value = float(getattr(crop, key))
        if value < 0 or value > 1:
            raise ValueError(f"crop_box {key} must be between 0 and 1, got {value}")
    if crop.x + crop.width > 1.0 or crop.y + crop.height > 1.0:
        raise ValueError(f"crop_box exceeds source frame: {crop}")


def compute_clip_timing(clip: Clip, media: MediaInfo, transcript_words: tuple[WordToken, ...] = ()) -> ClipTiming:
    """Resolve a clip's source window and speed transforms into a ``ClipTiming``.

    The nominal in/out points come from ``src_in``/``src_out`` (defaulting
    to the whole media), tightened by ``trim_start``/``trim_end``.  When the
    clip's captions request word-level trimming and transcript words are
    available, the actual bounds are snapped inward to word boundaries.

    Raises:
        ValueError: if the window is empty, or ``speed`` / the transform's
            tempo values are not strictly positive.
    """
    nominal_in = 0.0 if clip.src_in is None else float(clip.src_in)
    nominal_out = media.duration if clip.src_out is None else float(clip.src_out)
    nominal_in += float(clip.trim_start or 0.0)
    nominal_out -= float(clip.trim_end or 0.0)
    if nominal_out <= nominal_in:
        raise ValueError(f"{clip.id}: clip end must be after clip start")
    if clip.speed <= 0:
        raise ValueError(f"{clip.id}: speed must be positive")
    if clip.transform.audio_tempo <= 0 or clip.transform.video_pts <= 0:
        raise ValueError(f"{clip.id}: transform tempo values must be positive")

    actual_in = nominal_in
    actual_out = nominal_out
    word_aligned = False
    if clip.captions and clip.captions.trim == "word" and transcript_words:
        snapped = snap_bounds_to_words(nominal_in, nominal_out, transcript_words)
        if snapped is not None:
            actual_in, actual_out = snapped
            word_aligned = True

    duration_in_source = actual_out - actual_in
    # PTS multiplier over speed: values > 1 slow the video down, < 1 speed it up.
    video_time_scale = clip.transform.video_pts / clip.speed
    # atempo is a rate, so the corresponding time scale is its reciprocal.
    audio_time_scale = 1.0 / (clip.transform.audio_tempo * clip.speed)
    output_duration = duration_in_source * video_time_scale
    return ClipTiming(
        nominal_in=nominal_in,
        nominal_out=nominal_out,
        actual_in=actual_in,
        actual_out=actual_out,
        video_time_scale=video_time_scale,
        audio_time_scale=audio_time_scale,
        output_duration=output_duration,
        word_aligned=word_aligned,
    )


def snap_bounds_to_words(start: float, end: float, words: tuple[WordToken, ...]) -> tuple[float, float] | None:
    """Snap ``start``/``end`` inward to the nearest enclosed word boundaries.

    Returns the snapped ``(start, end)`` pair, or ``None`` when no word
    starts at/after ``start``, none ends at/by ``end``, or snapping would
    produce an empty window.

    NOTE(review): assumes ``words`` is ordered by time -- confirm upstream.
    """
    start_candidates = [word.start for word in words if word.start >= start]
    end_candidates = [word.end for word in words if word.end <= end]
    if not start_candidates or not end_candidates:
        return None
    snapped_start = start_candidates[0]
    snapped_end = end_candidates[-1]
    if snapped_end <= snapped_start:
        return None
    return snapped_start, snapped_end


def retime_caption_words(words: tuple[WordToken, ...], timing: ClipTiming) -> tuple[WordToken, ...]:
    """Clip words to the timing window and rebase them onto the output timeline.

    Words wholly outside ``[actual_in, actual_out)`` are dropped; partially
    overlapping words are clamped to the window.  Times are shifted so the
    window start becomes 0 and scaled by ``video_time_scale``, and each
    word is guaranteed a minimum 10 ms duration so cues never collapse.
    """
    clipped: list[WordToken] = []
    for word in words:
        if word.end <= timing.actual_in or word.start >= timing.actual_out:
            continue
        start = max(word.start, timing.actual_in)
        end = min(word.end, timing.actual_out)
        shifted_start = max(0.0, (start - timing.actual_in) * timing.video_time_scale)
        shifted_end = max(shifted_start + 0.01, (end - timing.actual_in) * timing.video_time_scale)
        clipped.append(WordToken(text=word.text, start=shifted_start, end=shifted_end))
    return tuple(clipped)


def ass_timestamp(seconds: float) -> str:
    """Format ``seconds`` as an ASS event timestamp, ``H:MM:SS.cc``.

    Derives every field from the total rounded centisecond count so that
    rounding carries correctly through seconds, minutes, and hours.  (The
    previous version bumped only the seconds field on centisecond overflow,
    so e.g. 59.995 produced the invalid timestamp ``0:00:60.00``.)
    """
    total_cs = int(round(seconds * 100))
    hours, remainder = divmod(total_cs, 360000)
    minutes, remainder = divmod(remainder, 6000)
    whole_seconds, centiseconds = divmod(remainder, 100)
    return f"{hours}:{minutes:02d}:{whole_seconds:02d}.{centiseconds:02d}"


def escape_ass_text(text: str) -> str:
    """Escape characters that ASS treats specially (backslash first, then braces)."""
    return text.replace("\\", r"\\").replace("{", r"\{").replace("}", r"\}")


def segment_words(words: tuple[WordToken, ...], max_words_per_cue: int) -> list[list[WordToken]]:
    """Split ``words`` into cue-sized chunks.

    A chunk closes when it reaches ``max_words_per_cue`` words or when a
    word ends with sentence-final punctuation (``.``, ``!``, ``?``); a
    trailing partial chunk is kept.
    """
    chunks: list[list[WordToken]] = []
    chunk: list[WordToken] = []
    for word in words:
        chunk.append(word)
        if len(chunk) >= max_words_per_cue or word.text.endswith((".", "!", "?")):
            chunks.append(chunk)
            chunk = []
    if chunk:
        chunks.append(chunk)
    return chunks


def build_karaoke_ass(words: tuple[WordToken, ...], style: dict, width: int, height: int) -> str:
    """Render word-timed karaoke captions as a complete ASS document.

    Each cue becomes one Dialogue line whose words carry ``\\kf`` sweep
    durations in centiseconds; inter-word gaps are folded into the next
    word's sweep so the highlight never stalls.  Cues are held 100 ms past
    their last word and never overlap the previous cue.

    Returns only the header (no events) when ``words`` is empty.
    """
    style_line = (
        "Style: Karaoke,"
        f"{style['fontname']},{style['fontsize']},{style['primary_color']},{style['secondary_color']},"
        f"{style['outline_color']},{style['back_color']},{style['bold']},{style['italic']},"
        f"{style['underline']},{style['strikeout']},{style['scale_x']},{style['scale_y']},"
        f"{style['spacing']},{style['angle']},{style['border_style']},{style['outline']},"
        f"{style['shadow']},{style['alignment']},{style['margin_l']},{style['margin_r']},{style['margin_v']}"
    )
    header = (
        "[Script Info]\n"
        "ScriptType: v4.00+\n"
        f"PlayResX: {width}\n"
        f"PlayResY: {height}\n"
        "WrapStyle: 0\n\n"
        "[V4+ Styles]\n"
        f"{style_line}\n\n"
        "[Events]\n"
        "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n"
    )
    if not words:
        return header

    events: list[str] = []
    previous_end = 0.0
    for chunk in segment_words(words, max(1, int(style.get("max_words_per_cue", 4)))):
        start = max(chunk[0].start, previous_end)
        end = chunk[-1].end
        if end <= start:
            continue
        cursor = start
        parts: list[str] = []
        for word in chunk:
            # Fold the silent gap before the word into its sweep duration.
            gap_cs = max(0, int(round((word.start - cursor) * 100)))
            dur_cs = max(1, int(round((word.end - word.start) * 100)))
            parts.append(f"{{\\kf{gap_cs + dur_cs}}}{escape_ass_text(word.text)} ")
            cursor = word.end
        line = "".join(parts).strip()
        events.append(
            f"Dialogue: 0,{ass_timestamp(start)},{ass_timestamp(end + 0.10)},Karaoke,,0,0,0,,{line}"
        )
        previous_end = end + 0.10
    return header + "\n".join(events) + ("\n" if events else "")


def build_caption_ass(
    words: tuple[WordToken, ...],
    track: CaptionTrack,
    style: dict,
    width: int,
    height: int,
) -> str:
    """Render a caption track to ASS, dispatching on ``track.mode``.

    Only the ``"karaoke"`` mode is currently supported; the track's
    ``max_words_per_cue`` is injected into a copy of ``style`` so presets
    stay unmodified.

    Raises:
        ValueError: for any unsupported caption mode.
    """
    style_payload = dict(style)
    style_payload["max_words_per_cue"] = track.max_words_per_cue
    if track.mode != "karaoke":
        raise ValueError(f"Unsupported caption mode: {track.mode}")
    return build_karaoke_ass(words, style_payload, width, height)