Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from bundle
Build or revise a reusable FFmpeg timeline for short-form video editing. Use when the user wants an agent-editable API for trimming clips, cropping, fitting to a target frame, adjusting playback speed, and adding text overlays or karaoke captions.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
scripts/video_timeline_editor/domain/services.py
"""Domain services for the video timeline editor.

Stateless helpers that merge style presets, validate crop geometry,
compute clip timings (optionally snapped to transcript word boundaries),
and render karaoke captions as ASS subtitle documents.  No I/O happens
here; FFmpeg command assembly lives elsewhere.
"""

from __future__ import annotations

from .model import CaptionTrack, Clip, ClipTiming, CropBox, MediaInfo, WordToken


# Presets for FFmpeg drawtext overlays; keys map directly onto drawtext options.
DEFAULT_OVERLAY_STYLES = {
    "default": {
        "fontsize": 42,
        "fontcolor": "white",
        "borderw": 2,
        "bordercolor": "black",
        "x": "(w-text_w)/2",  # horizontally centered
        "y": "h-180",  # near the bottom edge of the frame
    },
    "header": {
        "fontsize": 42,
        "fontcolor": "white",
        "box": 1,
        # NOTE(review): the original value was mangled by an email-obfuscation
        # filter ("[email protected]"), which is not a valid FFmpeg color.
        # Restored to a semi-transparent black box -- confirm the intended alpha.
        "boxcolor": "black@0.4",
        "boxborderw": 16,
        "x": "(w-text_w)/2",
        "y": "120",
    },
}

# Presets for ASS caption styles; field names mirror the [V4+ Styles] format
# fields consumed by build_karaoke_ass.
DEFAULT_CAPTION_STYLES = {
    "karaoke": {
        "fontname": "DejaVu Sans",
        "fontsize": 52,
        "primary_color": "&H00FFFFFF",  # AABBGGRR: opaque white fill
        "secondary_color": "&H000095FF",  # pre-highlight color for \kf sweep
        "outline_color": "&H00000000",
        "back_color": "&H96000000",  # semi-transparent black
        "bold": -1,  # ASS convention: -1 = true, 0 = false
        "italic": 0,
        "underline": 0,
        "strikeout": 0,
        "scale_x": 100,
        "scale_y": 100,
        "spacing": 0,
        "angle": 0,
        "border_style": 1,
        "outline": 3,
        "shadow": 1,
        "alignment": 2,  # numpad layout: bottom-center
        "margin_l": 10,
        "margin_r": 10,
        "margin_v": 350,
    }
}


def merge_styles(base_styles: dict[str, dict], extra_styles: dict[str, dict] | None) -> dict[str, dict]:
    """Merge ``extra_styles`` over ``base_styles`` without mutating either.

    Unknown style names in ``extra_styles`` are added verbatim; names
    present in both are merged key-by-key with the extra values winning.

    Returns a fresh dict of fresh per-style dicts.
    """
    merged = {name: dict(values) for name, values in base_styles.items()}
    for name, values in (extra_styles or {}).items():
        current = dict(merged.get(name, {}))
        current.update(values)
        merged[name] = current
    return merged


def validate_crop(crop: CropBox) -> None:
    """Raise ``ValueError`` unless ``crop`` is a normalized box inside the frame.

    All four fields (x, y, width, height) must lie in [0, 1], and the box
    must not extend past the right or bottom edge of the source frame.
    """
    for key in ("x", "y", "width", "height"):
        value = float(getattr(crop, key))
        if value < 0 or value > 1:
            raise ValueError(f"crop_box {key} must be between 0 and 1, got {value}")
    if crop.x + crop.width > 1.0 or crop.y + crop.height > 1.0:
        raise ValueError(f"crop_box exceeds source frame: {crop}")


def compute_clip_timing(clip: Clip, media: MediaInfo, transcript_words: tuple[WordToken, ...] = ()) -> ClipTiming:
    """Resolve a clip's source window and speed transforms into a ``ClipTiming``.

    The nominal in/out points come from ``src_in``/``src_out`` (defaulting
    to the whole media), tightened by ``trim_start``/``trim_end``.  When the
    clip's captions request word-level trimming and transcript words are
    available, the actual bounds are snapped inward to word boundaries.

    Raises:
        ValueError: if the window is empty, or ``speed`` / the transform's
            tempo values are not strictly positive.
    """
    nominal_in = 0.0 if clip.src_in is None else float(clip.src_in)
    nominal_out = media.duration if clip.src_out is None else float(clip.src_out)
    nominal_in += float(clip.trim_start or 0.0)
    nominal_out -= float(clip.trim_end or 0.0)
    if nominal_out <= nominal_in:
        raise ValueError(f"{clip.id}: clip end must be after clip start")
    if clip.speed <= 0:
        raise ValueError(f"{clip.id}: speed must be positive")
    if clip.transform.audio_tempo <= 0 or clip.transform.video_pts <= 0:
        raise ValueError(f"{clip.id}: transform tempo values must be positive")

    actual_in = nominal_in
    actual_out = nominal_out
    word_aligned = False
    if clip.captions and clip.captions.trim == "word" and transcript_words:
        snapped = snap_bounds_to_words(nominal_in, nominal_out, transcript_words)
        if snapped is not None:
            actual_in, actual_out = snapped
            word_aligned = True

    duration_in_source = actual_out - actual_in
    # PTS multiplier over speed: values > 1 slow the video down, < 1 speed it up.
    video_time_scale = clip.transform.video_pts / clip.speed
    # atempo is a rate, so the corresponding time scale is its reciprocal.
    audio_time_scale = 1.0 / (clip.transform.audio_tempo * clip.speed)
    output_duration = duration_in_source * video_time_scale
    return ClipTiming(
        nominal_in=nominal_in,
        nominal_out=nominal_out,
        actual_in=actual_in,
        actual_out=actual_out,
        video_time_scale=video_time_scale,
        audio_time_scale=audio_time_scale,
        output_duration=output_duration,
        word_aligned=word_aligned,
    )


def snap_bounds_to_words(start: float, end: float, words: tuple[WordToken, ...]) -> tuple[float, float] | None:
    """Snap ``start``/``end`` inward to the nearest enclosed word boundaries.

    Returns the snapped ``(start, end)`` pair, or ``None`` when no word
    starts at/after ``start``, none ends at/by ``end``, or snapping would
    produce an empty window.

    NOTE(review): assumes ``words`` is ordered by time -- confirm upstream.
    """
    start_candidates = [word.start for word in words if word.start >= start]
    end_candidates = [word.end for word in words if word.end <= end]
    if not start_candidates or not end_candidates:
        return None
    snapped_start = start_candidates[0]
    snapped_end = end_candidates[-1]
    if snapped_end <= snapped_start:
        return None
    return snapped_start, snapped_end


def retime_caption_words(words: tuple[WordToken, ...], timing: ClipTiming) -> tuple[WordToken, ...]:
    """Clip words to the timing window and rebase them onto the output timeline.

    Words wholly outside ``[actual_in, actual_out)`` are dropped; partially
    overlapping words are clamped to the window.  Times are shifted so the
    window start becomes 0 and scaled by ``video_time_scale``, and each
    word is guaranteed a minimum 10 ms duration so cues never collapse.
    """
    clipped: list[WordToken] = []
    for word in words:
        if word.end <= timing.actual_in or word.start >= timing.actual_out:
            continue
        start = max(word.start, timing.actual_in)
        end = min(word.end, timing.actual_out)
        shifted_start = max(0.0, (start - timing.actual_in) * timing.video_time_scale)
        shifted_end = max(shifted_start + 0.01, (end - timing.actual_in) * timing.video_time_scale)
        clipped.append(WordToken(text=word.text, start=shifted_start, end=shifted_end))
    return tuple(clipped)


def ass_timestamp(seconds: float) -> str:
    """Format ``seconds`` as an ASS event timestamp, ``H:MM:SS.cc``.

    Derives every field from the total rounded centisecond count so that
    rounding carries correctly through seconds, minutes, and hours.  (The
    previous version bumped only the seconds field on centisecond overflow,
    so e.g. 59.995 produced the invalid timestamp ``0:00:60.00``.)
    """
    total_cs = int(round(seconds * 100))
    hours, remainder = divmod(total_cs, 360000)
    minutes, remainder = divmod(remainder, 6000)
    whole_seconds, centiseconds = divmod(remainder, 100)
    return f"{hours}:{minutes:02d}:{whole_seconds:02d}.{centiseconds:02d}"


def escape_ass_text(text: str) -> str:
    """Escape characters that ASS treats specially (backslash first, then braces)."""
    return text.replace("\\", r"\\").replace("{", r"\{").replace("}", r"\}")


def segment_words(words: tuple[WordToken, ...], max_words_per_cue: int) -> list[list[WordToken]]:
    """Split ``words`` into cue-sized chunks.

    A chunk closes when it reaches ``max_words_per_cue`` words or when a
    word ends with sentence-final punctuation (``.``, ``!``, ``?``); a
    trailing partial chunk is kept.
    """
    chunks: list[list[WordToken]] = []
    chunk: list[WordToken] = []
    for word in words:
        chunk.append(word)
        if len(chunk) >= max_words_per_cue or word.text.endswith((".", "!", "?")):
            chunks.append(chunk)
            chunk = []
    if chunk:
        chunks.append(chunk)
    return chunks


def build_karaoke_ass(words: tuple[WordToken, ...], style: dict, width: int, height: int) -> str:
    """Render word-timed karaoke captions as a complete ASS document.

    Each cue becomes one Dialogue line whose words carry ``\\kf`` sweep
    durations in centiseconds; inter-word gaps are folded into the next
    word's sweep so the highlight never stalls.  Cues are held 100 ms past
    their last word and never overlap the previous cue.

    Returns only the header (no events) when ``words`` is empty.
    """
    style_line = (
        "Style: Karaoke,"
        f"{style['fontname']},{style['fontsize']},{style['primary_color']},{style['secondary_color']},"
        f"{style['outline_color']},{style['back_color']},{style['bold']},{style['italic']},"
        f"{style['underline']},{style['strikeout']},{style['scale_x']},{style['scale_y']},"
        f"{style['spacing']},{style['angle']},{style['border_style']},{style['outline']},"
        f"{style['shadow']},{style['alignment']},{style['margin_l']},{style['margin_r']},{style['margin_v']}"
    )
    header = (
        "[Script Info]\n"
        "ScriptType: v4.00+\n"
        f"PlayResX: {width}\n"
        f"PlayResY: {height}\n"
        "WrapStyle: 0\n\n"
        "[V4+ Styles]\n"
        f"{style_line}\n\n"
        "[Events]\n"
        "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n"
    )
    if not words:
        return header

    events: list[str] = []
    previous_end = 0.0
    for chunk in segment_words(words, max(1, int(style.get("max_words_per_cue", 4)))):
        start = max(chunk[0].start, previous_end)
        end = chunk[-1].end
        if end <= start:
            continue
        cursor = start
        parts: list[str] = []
        for word in chunk:
            # Fold the silent gap before the word into its sweep duration.
            gap_cs = max(0, int(round((word.start - cursor) * 100)))
            dur_cs = max(1, int(round((word.end - word.start) * 100)))
            parts.append(f"{{\\kf{gap_cs + dur_cs}}}{escape_ass_text(word.text)} ")
            cursor = word.end
        line = "".join(parts).strip()
        events.append(
            f"Dialogue: 0,{ass_timestamp(start)},{ass_timestamp(end + 0.10)},Karaoke,,0,0,0,,{line}"
        )
        previous_end = end + 0.10
    return header + "\n".join(events) + ("\n" if events else "")


def build_caption_ass(
    words: tuple[WordToken, ...],
    track: CaptionTrack,
    style: dict,
    width: int,
    height: int,
) -> str:
    """Render a caption track to ASS, dispatching on ``track.mode``.

    Only the ``"karaoke"`` mode is currently supported; the track's
    ``max_words_per_cue`` is injected into a copy of ``style`` so presets
    stay unmodified.

    Raises:
        ValueError: for any unsupported caption mode.
    """
    style_payload = dict(style)
    style_payload["max_words_per_cue"] = track.max_words_per_cue
    if track.mode != "karaoke":
        raise ValueError(f"Unsupported caption mode: {track.mode}")
    return build_karaoke_ass(words, style_payload, width, height)