Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
examples/book-sft-pipeline/scripts/pipeline_example.py
"""
Book SFT Pipeline - Conceptual Implementation

This demonstrates the core patterns for building book-to-SFT pipelines.
Adapt to your specific LLM provider and training platform.
"""

from dataclasses import dataclass
from typing import List
import json

# =============================================================================
# Data Structures
# =============================================================================


@dataclass
class Chunk:
    """A contiguous slice of the source text sized for one training example."""

    text: str        # the chunk's prose, paragraphs joined by blank lines
    word_count: int  # whitespace-token count of `text`
    id: int          # 0-based position of this chunk in the chunk list


@dataclass
class TrainingExample:
    """One chat-format SFT example: system + user prompt, assistant completion."""

    system: str     # system prompt (style-transfer persona)
    user: str       # user instruction describing the scene to write
    assistant: str  # target completion: the original book chunk

    def to_messages(self) -> dict:
        """Return the example in the standard chat-messages JSON shape.

        Returns:
            dict with a single "messages" key holding the system/user/assistant
            turns, ready for `json.dumps` into a JSONL training file.
        """
        return {
            "messages": [
                {"role": "system", "content": self.system},
                {"role": "user", "content": self.user},
                {"role": "assistant", "content": self.assistant}
            ]
        }


# =============================================================================
# Segmentation - The Core Algorithm
# =============================================================================


def segment_text(text: str, min_words: int = 150, max_words: int = 400) -> List[Chunk]:
    """
    Segment text into training-sized chunks with overlap.

    Paragraphs (blank-line separated) are accumulated into a buffer; when the
    buffer would exceed `max_words` AND already holds at least `min_words`,
    it is flushed as a chunk. The last paragraph of a flushed chunk is carried
    into the next buffer so adjacent chunks overlap by one paragraph.

    Key insight: Smaller chunks (150-400) produce more examples and better
    style transfer than larger chunks (250-650).

    Args:
        text: Full source text with paragraphs separated by blank lines.
        min_words: Minimum words a buffer must hold before it may be flushed.
        max_words: Soft cap that triggers a flush when it would be exceeded.

    Returns:
        List of Chunk objects with sequential `id`s. A trailing buffer is kept
        only if it holds at least `min_words // 2` words.
    """
    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
    chunks = []
    buffer = []
    buffer_words = 0

    for para in paragraphs:
        para_words = len(para.split())

        if buffer_words + para_words > max_words and buffer_words >= min_words:
            chunks.append(Chunk(
                text='\n\n'.join(buffer),
                word_count=buffer_words,
                id=len(chunks)
            ))
            # Keep last paragraph for overlap (the `else` arm is defensive:
            # buffer cannot be empty here since buffer_words >= min_words > 0)
            buffer = [buffer[-1], para] if buffer else [para]
            buffer_words = len(buffer[-2].split()) + para_words if len(buffer) > 1 else para_words
        else:
            buffer.append(para)
            buffer_words += para_words

    # Flush the remainder only if it is at least half of min_words;
    # shorter tails are dropped as too small to be useful examples.
    if buffer and buffer_words >= min_words // 2:
        chunks.append(Chunk(text='\n\n'.join(buffer), word_count=buffer_words, id=len(chunks)))

    return chunks


# =============================================================================
# Diverse Prompt Generation - Prevents Memorization
# =============================================================================

SYSTEM_PROMPTS = [
    "You are an expert creative writer capable of emulating specific literary styles.",
    "You are a literary writer with deep knowledge of classic prose styles.",
    "You are a creative writer skilled at emulating distinctive authorial voices.",
]

PROMPT_TEMPLATES = [
    "Write a passage in the style of {author}: {desc}",
    "Channel {author}'s voice to write about: {desc}",
    "In {author}'s distinctive prose style, describe: {desc}",
    "Write this scene as {author} would have: {desc}",
    "Using {author}'s repetitive, rhythmic technique, write: {desc}",
]


def build_examples(chunk: Chunk, instruction: str, author: str, variants: int = 2) -> List[TrainingExample]:
    """
    Generate multiple training variants per chunk.

    Key insight: Diverse prompts prevent the model from memorizing
    specific phrasings and force it to learn underlying style patterns.

    Args:
        chunk: Source chunk whose text becomes the assistant completion.
        instruction: Scene description used as the {desc} slot.
        author: Author name used as the {author} slot.
        variants: Number of system/template combinations to emit.

    Returns:
        `variants` TrainingExamples; system prompts rotate by variant index,
        user templates rotate by (chunk.id + variant index) so different
        chunks start at different templates.
    """
    examples = []
    for i in range(variants):
        system = SYSTEM_PROMPTS[i % len(SYSTEM_PROMPTS)]
        template = PROMPT_TEMPLATES[(chunk.id + i) % len(PROMPT_TEMPLATES)]
        user = template.format(author=author, desc=instruction)
        examples.append(TrainingExample(system=system, user=user, assistant=chunk.text))
    return examples


# =============================================================================
# Instruction Generation Prompt
# =============================================================================

INSTRUCTION_PROMPT = """Describe what is happening in this excerpt in 2-3 sentences.
Focus on: characters present, actions, emotions, and setting.
Do NOT quote the text directly.

Excerpt:
{text}
"""


def generate_instruction(chunk: Chunk, llm_call) -> str:
    """
    Generate a scene description for the chunk.
    Replace llm_call with your actual LLM API.

    Args:
        chunk: Chunk to describe; only the first 2000 characters are sent.
        llm_call: Callable taking a prompt string, returning a response string.

    Returns:
        The LLM response, stripped, with common boilerplate lead-ins
        ("This excerpt", "The excerpt", "In this passage") removed.
    """
    prompt = INSTRUCTION_PROMPT.format(text=chunk.text[:2000])
    response = llm_call(prompt)
    # Clean common prefixes
    cleaned = response.strip()
    for prefix in ["This excerpt", "The excerpt", "In this passage"]:
        if cleaned.startswith(prefix):
            cleaned = cleaned[len(prefix):].lstrip(", :")
    return cleaned


# =============================================================================
# Tinker Datum Construction
# =============================================================================


def build_tinker_datum(example: dict, tokenizer, renderer):
    """
    Convert training example to Tinker Datum format.

    Key insight: Weights of 0 for prompt, 1 for completion.
    This teaches the model to generate completions, not repeat prompts.

    Args:
        example: Dict with a "messages" key (chat-format turns).
        tokenizer: Unused here; kept for interface compatibility.
        renderer: Object whose `build_supervised_example(messages)` returns
            (model_input, weights); model_input exposes `.to_ints()`.

    Returns:
        Dict with "model_input" (tokens minus the last) and "loss_fn_inputs"
        holding "target_tokens" (tokens shifted left by one) and the
        correspondingly shifted "weights".
    """
    messages = example["messages"]
    model_input, weights = renderer.build_supervised_example(messages)

    input_tokens = model_input.to_ints()
    target_tokens = input_tokens[1:]  # Shift for next-token prediction
    weights = weights[1:]  # Align weights with the shifted targets

    return {
        "model_input": input_tokens[:-1],
        "loss_fn_inputs": {
            "target_tokens": target_tokens,
            "weights": weights
        }
    }


# =============================================================================
# Validation Patterns
# =============================================================================


def validate_style_transfer(output: str, training_data_path: str) -> dict:
    """
    Validate that the model learned style, not just memorized content.

    Slides 50-character windows (25-char stride) across `output` and counts
    how many appear verbatim in the training data file.

    Args:
        output: Model-generated text to check.
        training_data_path: Path to the raw training text file.

    Returns:
        Dict with "originality_score" (1.0 = no window matched),
        "exact_matches" (count of matching windows), and "is_original"
        (fewer than 3 matches).
    """
    # Check for exact phrase matches in training data.
    # Explicit encoding so non-ASCII book text reads identically on all OSes.
    with open(training_data_path, encoding="utf-8") as f:
        training_text = f.read()

    # Split output into overlapping 50-char phrases and check for matches
    phrases = [output[i:i+50] for i in range(0, len(output)-50, 25)]
    exact_matches = sum(1 for p in phrases if p in training_text)

    return {
        "originality_score": 1.0 - (exact_matches / max(len(phrases), 1)),
        "exact_matches": exact_matches,
        "is_original": exact_matches < 3
    }


MODERN_TEST_SCENARIOS = [
    "Write about a barista making lattes",
    "Describe two lovers communicating through text messages",
    "Write about someone anxious about climate change",
]
# If model applies style to modern scenarios, it learned STYLE not CONTENT