Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
examples/book-sft-pipeline/scripts/pipeline_example.py
"""
Book SFT Pipeline - Conceptual Implementation

This demonstrates the core patterns for building book-to-SFT pipelines.
Adapt to your specific LLM provider and training platform.
"""

from dataclasses import dataclass
from typing import List
import json

# =============================================================================
# Data Structures
# =============================================================================


@dataclass
class Chunk:
    """A contiguous slice of the source text sized for one training example."""

    text: str        # the chunk's prose, paragraphs joined by blank lines
    word_count: int  # whitespace-token count of `text`
    id: int          # 0-based position of this chunk in the chunk list


@dataclass
class TrainingExample:
    """One chat-format SFT example: system + user prompt, assistant completion."""

    system: str     # system prompt (style-transfer persona)
    user: str       # user instruction describing the scene to write
    assistant: str  # target completion: the original book chunk

    def to_messages(self) -> dict:
        """Return the example in the standard chat-messages JSON shape.

        Returns:
            dict with a single "messages" key holding the system/user/assistant
            turns, ready for `json.dumps` into a JSONL training file.
        """
        return {
            "messages": [
                {"role": "system", "content": self.system},
                {"role": "user", "content": self.user},
                {"role": "assistant", "content": self.assistant}
            ]
        }


# =============================================================================
# Segmentation - The Core Algorithm
# =============================================================================


def segment_text(text: str, min_words: int = 150, max_words: int = 400) -> List[Chunk]:
    """
    Segment text into training-sized chunks with overlap.

    Paragraphs (blank-line separated) are accumulated into a buffer; when the
    buffer would exceed `max_words` AND already holds at least `min_words`,
    it is flushed as a chunk. The last paragraph of a flushed chunk is carried
    into the next buffer so adjacent chunks overlap by one paragraph.

    Key insight: Smaller chunks (150-400) produce more examples and better
    style transfer than larger chunks (250-650).

    Args:
        text: Full source text with paragraphs separated by blank lines.
        min_words: Minimum words a buffer must hold before it may be flushed.
        max_words: Soft cap that triggers a flush when it would be exceeded.

    Returns:
        List of Chunk objects with sequential `id`s. A trailing buffer is kept
        only if it holds at least `min_words // 2` words.
    """
    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
    chunks = []
    buffer = []
    buffer_words = 0

    for para in paragraphs:
        para_words = len(para.split())

        if buffer_words + para_words > max_words and buffer_words >= min_words:
            chunks.append(Chunk(
                text='\n\n'.join(buffer),
                word_count=buffer_words,
                id=len(chunks)
            ))
            # Keep last paragraph for overlap (the `else` arm is defensive:
            # buffer cannot be empty here since buffer_words >= min_words > 0)
            buffer = [buffer[-1], para] if buffer else [para]
            buffer_words = len(buffer[-2].split()) + para_words if len(buffer) > 1 else para_words
        else:
            buffer.append(para)
            buffer_words += para_words

    # Flush the remainder only if it is at least half of min_words;
    # shorter tails are dropped as too small to be useful examples.
    if buffer and buffer_words >= min_words // 2:
        chunks.append(Chunk(text='\n\n'.join(buffer), word_count=buffer_words, id=len(chunks)))

    return chunks


# =============================================================================
# Diverse Prompt Generation - Prevents Memorization
# =============================================================================

SYSTEM_PROMPTS = [
    "You are an expert creative writer capable of emulating specific literary styles.",
    "You are a literary writer with deep knowledge of classic prose styles.",
    "You are a creative writer skilled at emulating distinctive authorial voices.",
]

PROMPT_TEMPLATES = [
    "Write a passage in the style of {author}: {desc}",
    "Channel {author}'s voice to write about: {desc}",
    "In {author}'s distinctive prose style, describe: {desc}",
    "Write this scene as {author} would have: {desc}",
    "Using {author}'s repetitive, rhythmic technique, write: {desc}",
]


def build_examples(chunk: Chunk, instruction: str, author: str, variants: int = 2) -> List[TrainingExample]:
    """
    Generate multiple training variants per chunk.

    Key insight: Diverse prompts prevent the model from memorizing
    specific phrasings and force it to learn underlying style patterns.

    Args:
        chunk: Source chunk whose text becomes the assistant completion.
        instruction: Scene description used as the {desc} slot.
        author: Author name used as the {author} slot.
        variants: Number of system/template combinations to emit.

    Returns:
        `variants` TrainingExamples; system prompts rotate by variant index,
        user templates rotate by (chunk.id + variant index) so different
        chunks start at different templates.
    """
    examples = []
    for i in range(variants):
        system = SYSTEM_PROMPTS[i % len(SYSTEM_PROMPTS)]
        template = PROMPT_TEMPLATES[(chunk.id + i) % len(PROMPT_TEMPLATES)]
        user = template.format(author=author, desc=instruction)
        examples.append(TrainingExample(system=system, user=user, assistant=chunk.text))
    return examples


# =============================================================================
# Instruction Generation Prompt
# =============================================================================

INSTRUCTION_PROMPT = """Describe what is happening in this excerpt in 2-3 sentences.
Focus on: characters present, actions, emotions, and setting.
Do NOT quote the text directly.

Excerpt:
{text}
"""


def generate_instruction(chunk: Chunk, llm_call) -> str:
    """
    Generate a scene description for the chunk.
    Replace llm_call with your actual LLM API.

    Args:
        chunk: Chunk to describe; only the first 2000 characters are sent.
        llm_call: Callable taking a prompt string, returning a response string.

    Returns:
        The LLM response, stripped, with common boilerplate lead-ins
        ("This excerpt", "The excerpt", "In this passage") removed.
    """
    prompt = INSTRUCTION_PROMPT.format(text=chunk.text[:2000])
    response = llm_call(prompt)
    # Clean common prefixes
    cleaned = response.strip()
    for prefix in ["This excerpt", "The excerpt", "In this passage"]:
        if cleaned.startswith(prefix):
            cleaned = cleaned[len(prefix):].lstrip(", :")
    return cleaned


# =============================================================================
# Tinker Datum Construction
# =============================================================================


def build_tinker_datum(example: dict, tokenizer, renderer):
    """
    Convert training example to Tinker Datum format.

    Key insight: Weights of 0 for prompt, 1 for completion.
    This teaches the model to generate completions, not repeat prompts.

    Args:
        example: Dict with a "messages" key (chat-format turns).
        tokenizer: Unused here; kept for interface compatibility.
        renderer: Object whose `build_supervised_example(messages)` returns
            (model_input, weights); model_input exposes `.to_ints()`.

    Returns:
        Dict with "model_input" (tokens minus the last) and "loss_fn_inputs"
        holding "target_tokens" (tokens shifted left by one) and the
        correspondingly shifted "weights".
    """
    messages = example["messages"]
    model_input, weights = renderer.build_supervised_example(messages)

    input_tokens = model_input.to_ints()
    target_tokens = input_tokens[1:]  # Shift for next-token prediction
    weights = weights[1:]  # Align weights with the shifted targets

    return {
        "model_input": input_tokens[:-1],
        "loss_fn_inputs": {
            "target_tokens": target_tokens,
            "weights": weights
        }
    }


# =============================================================================
# Validation Patterns
# =============================================================================


def validate_style_transfer(output: str, training_data_path: str) -> dict:
    """
    Validate that the model learned style, not just memorized content.

    Slides 50-character windows (25-char stride) across `output` and counts
    how many appear verbatim in the training data file.

    Args:
        output: Model-generated text to check.
        training_data_path: Path to the raw training text file.

    Returns:
        Dict with "originality_score" (1.0 = no window matched),
        "exact_matches" (count of matching windows), and "is_original"
        (fewer than 3 matches).
    """
    # Check for exact phrase matches in training data.
    # Explicit encoding so non-ASCII book text reads identically on all OSes.
    with open(training_data_path, encoding="utf-8") as f:
        training_text = f.read()

    # Split output into overlapping 50-char phrases and check for matches
    phrases = [output[i:i+50] for i in range(0, len(output)-50, 25)]
    exact_matches = sum(1 for p in phrases if p in training_text)

    return {
        "originality_score": 1.0 - (exact_matches / max(len(phrases), 1)),
        "exact_matches": exact_matches,
        "is_original": exact_matches < 3
    }


MODERN_TEST_SCENARIOS = [
    "Write about a barista making lattes",
    "Describe two lovers communicating through text messages",
    "Write about someone anxious about climate change",
]
# If model applies style to modern scenarios, it learned STYLE not CONTENT