Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems.
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
examples/book-sft-pipeline/references/segmentation-strategies.md
# Segmentation Strategies

Advanced patterns for splitting books into training chunks while preserving narrative coherence.

## The Segmentation Problem

Books present unique challenges for training data creation:

1. **Variable paragraph length**: Some authors write single paragraphs spanning 1000+ words
2. **Dialogue-heavy sections**: Short exchanges that individually are too small
3. **Scene boundaries**: Natural break points that don't align with word counts
4. **Stylistic variations**: Authors shift voice between narrative, dialogue, and exposition

Poor segmentation teaches the model to produce:
- Incomplete thoughts
- Abrupt endings
- Incoherent transitions
- Fragmented style

## Two-Tier Strategy

### Tier 1: Paragraph-Based Accumulation

The default approach for well-structured text:

```python
class Tier1Segmenter:
    def __init__(self, min_words: int = 250, max_words: int = 650):
        self.min_words = min_words
        self.max_words = max_words

    def segment(self, text: str) -> list[Chunk]:
        paragraphs = self._split_paragraphs(text)
        chunks = []
        current = ChunkBuilder()

        for para in paragraphs:
            word_count = len(para.split())

            # Check if single paragraph exceeds max
            if word_count > self.max_words:
                # Finalize current chunk if exists
                if current.word_count > 0:
                    chunks.append(current.build())
                    current = ChunkBuilder()

                # Mark for Tier 2 processing
                chunks.append(Chunk(
                    text=para,
                    requires_tier2=True,
                    word_count=word_count
                ))
                continue

            # Would this paragraph overflow current chunk?
            if current.word_count + word_count > self.max_words:
                if current.word_count >= self.min_words:
                    chunks.append(current.build())
                    current = ChunkBuilder()

            current.add(para)

        # Don't forget the last chunk
        if current.word_count > 0:
            chunks.append(current.build())

        return chunks

    def _split_paragraphs(self, text: str) -> list[str]:
        # Split on double newlines, preserve single newlines within
        paragraphs = text.split('\n\n')
        return [p.strip() for p in paragraphs if p.strip()]
```

### Tier 2: LLM-Assisted Segmentation

For oversized paragraphs that cannot be split at paragraph boundaries:

```python
class Tier2Segmenter:
    def __init__(self, model: str = "gpt-4o"):
        self.model = model
        self.prompt_template = self._load_prompt()

    async def segment(self, oversized_chunk: Chunk) -> list[Chunk]:
        """Split an oversized paragraph using LLM."""

        response = await self._call_llm(
            self.prompt_template.format(text=oversized_chunk.text)
        )

        segments = self._parse_segments(response)

        # Validate zero-deletion
        original_words = len(oversized_chunk.text.split())
        segmented_words = sum(len(s.split()) for s in segments)

        if abs(original_words - segmented_words) > 5:  # Allow tiny variance
            raise SegmentationError(
                f"Word count mismatch: {original_words} -> {segmented_words}"
            )

        return [
            Chunk(text=s, requires_tier2=False, word_count=len(s.split()))
            for s in segments
        ]

    def _load_prompt(self) -> str:
        return """Segment this text into excerpts of minimum 300-350 words.

Requirements:
- Each excerpt must be grammatically complete from start
- Each excerpt must not feel abruptly cut off
- Zero deletion - maintain original word count exactly
- Break at grammatically natural places:
  * After complete dialogue exchanges
  * At scene transitions
  * After complete thoughts or descriptions
  * Where a paragraph break would naturally occur
- Avoid breaking into too many small excerpts
- Start directly with the excerpts
- Separate excerpts with ===SEGMENT===

Text to segment:
{text}
"""

    def _parse_segments(self, response: str) -> list[str]:
        segments = response.split("===SEGMENT===")
        return [s.strip() for s in segments if s.strip()]
```

## Scene-Aware Segmentation

For higher-quality results, detect scene boundaries:

```python
class SceneAwareSegmenter:
    """Prefer breaking at scene boundaries when within word limits."""

    SCENE_MARKERS = [
        r'\n\n\* \* \*\n\n',   # Asterisk dividers
        r'\n\n---\n\n',        # Dash dividers
        r'\n\n###\n\n',        # Hash dividers
        r'\n\nCHAPTER \d+',    # Chapter headings
        r'\n\n[A-Z]{3,}\n\n',  # All-caps scene breaks
    ]

    def find_scene_breaks(self, text: str) -> list[int]:
        """Find character positions of scene breaks."""
        breaks = []
        for pattern in self.SCENE_MARKERS:
            for match in re.finditer(pattern, text):
                breaks.append(match.start())
        return sorted(set(breaks))

    def segment_with_scenes(self, text: str) -> list[Chunk]:
        scene_breaks = self.find_scene_breaks(text)

        # If scene breaks exist, prefer them over arbitrary paragraph breaks
        if scene_breaks:
            return self._segment_at_scenes(text, scene_breaks)
        else:
            return Tier1Segmenter().segment(text)
```

## Dialogue Handling

Dialogue-heavy sections require special handling:

```python
class DialogueAwareSegmenter:
    """Group dialogue exchanges to maintain conversation coherence."""

    def is_dialogue_paragraph(self, para: str) -> bool:
        """Check if paragraph is primarily dialogue."""
        # Count dialogue markers
        quote_count = para.count('"') + para.count("'")
        word_count = len(para.split())

        # If more than 20% of words are in quotes, it's dialogue-heavy
        return quote_count > word_count * 0.2

    def segment(self, text: str) -> list[Chunk]:
        paragraphs = text.split('\n\n')
        chunks = []
        current = ChunkBuilder()
        in_dialogue_block = False

        for para in paragraphs:
            is_dialogue = self.is_dialogue_paragraph(para)

            # Don't break in the middle of a dialogue exchange
            if is_dialogue:
                in_dialogue_block = True
                current.add(para)
            else:
                if in_dialogue_block:
                    # End of dialogue block - good break point
                    in_dialogue_block = False
                    if current.word_count >= 250:
                        chunks.append(current.build())
                        current = ChunkBuilder()

                current.add(para)

            # Check if we've exceeded max
            if current.word_count > 650:
                chunks.append(current.build())
                current = ChunkBuilder()

        if current.word_count > 0:
            chunks.append(current.build())

        return chunks
```

## Validation Pipeline

Every segmentation result should pass validation:

```python
class SegmentationValidator:
    def validate(self, chunks: list[Chunk]) -> ValidationResult:
        errors = []
        warnings = []

        for i, chunk in enumerate(chunks):
            # Check word count bounds
            if chunk.word_count < 200:
                warnings.append(f"Chunk {i}: Only {chunk.word_count} words")
            if chunk.word_count > 700:
                errors.append(f"Chunk {i}: {chunk.word_count} words exceeds max")

            # Check sentence completeness
            if not self._ends_with_terminal(chunk.text):
                errors.append(f"Chunk {i}: Ends mid-sentence")

            if not self._starts_grammatically(chunk.text):
                errors.append(f"Chunk {i}: Starts mid-sentence")

            # Check for orphaned dialogue
            if chunk.text.count('"') % 2 != 0:
                warnings.append(f"Chunk {i}: Unbalanced quotes")

        return ValidationResult(
            valid=len(errors) == 0,
            errors=errors,
            warnings=warnings
        )

    def _ends_with_terminal(self, text: str) -> bool:
        text = text.strip()
        return text[-1] in '.!?"\'—'

    def _starts_grammatically(self, text: str) -> bool:
        text = text.strip()
        # Should start with capital or quote
        return text[0].isupper() or text[0] in '"\'—'
```

## Performance Considerations

| Strategy | Speed | Quality | Use Case |
|----------|-------|---------|----------|
| Tier 1 only | Fast | Moderate | Well-structured prose |
| Tier 1 + Tier 2 | Moderate | High | Mixed paragraph lengths |
| Scene-aware | Fast | High | Novels with clear scene breaks |
| Dialogue-aware | Moderate | High | Dialogue-heavy fiction |

## Edge Cases

**1. Stream-of-consciousness writing**
- Single "paragraphs" spanning pages
- Solution: Force Tier 2 with explicit sentence boundary detection

**2. Poetry or verse**
- Line breaks are semantic, not formatting
- Solution: Treat each stanza as atomic unit

**3. Non-fiction with lists/bullets**
- Bullet points break paragraph detection
- Solution: Pre-process to convert bullets to prose

**4. Multiple narrators**
- Voice shifts within chapters
- Solution: Detect narrator markers and prefer breaking there

## Integration with Pipeline

```python
class SegmentationAgent:
    def __init__(self, config: SegmentationConfig):
        self.tier1 = Tier1Segmenter(
            min_words=config.min_words,
            max_words=config.max_words
        )
        self.tier2 = Tier2Segmenter(model=config.tier2_model)
        self.validator = SegmentationValidator()

    async def segment(self, text: str) -> list[Chunk]:
        # Phase 1: Tier 1 segmentation
        chunks = self.tier1.segment(text)

        # Phase 2: Process oversized chunks with Tier 2
        final_chunks = []
        for chunk in chunks:
            if chunk.requires_tier2:
                sub_chunks = await self.tier2.segment(chunk)
                final_chunks.extend(sub_chunks)
            else:
                final_chunks.append(chunk)

        # Phase 3: Validate
        result = self.validator.validate(final_chunks)
        if not result.valid:
            raise SegmentationError(result.errors)

        if result.warnings:
            logger.warning(f"Segmentation warnings: {result.warnings}")

        return final_chunks
```