Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Deploy, evaluate, and manage AI agents end-to-end on Microsoft Azure AI Foundry
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
finetuning/scripts/validate/data_stats.py
#!/usr/bin/env python3
"""Compute dataset statistics for any fine-tuning JSONL file.

Adapted from foundry-ft agent. Auto-detects SFT/DPO/RFT format and reports
token estimates, per-format details (role distribution, preferred vs.
non-preferred lengths, grader fields) and dataset size guidance.
"""
import json
import sys
from collections import Counter

try:
    sys.stdout.reconfigure(encoding="utf-8")
    sys.stderr.reconfigure(encoding="utf-8")
except (AttributeError, OSError, ValueError):
    pass  # Stream not reconfigurable (older Python or non-tty); default encoding is fine

# Top-level keys that hold message structure rather than free-form extra data.
_STRUCTURAL_FIELDS = frozenset(
    {"messages", "input", "preferred_output", "non_preferred_output"}
)
# Keys holding the two completions compared in a preference (DPO) record.
_PAIR_FIELDS = ("preferred_output", "non_preferred_output")


def estimate_tokens(text: str) -> int:
    """Return a rough token estimate: ~4 characters per token for English text.

    The result is never below 1, so an empty string still counts as one token.
    """
    return max(1, len(text) // 4)


def _dict_messages(value) -> list:
    """Return the dict entries of *value* when it is a list, else an empty list."""
    if not isinstance(value, list):
        return []
    return [msg for msg in value if isinstance(msg, dict)]


def _message_texts(value) -> list:
    """Return every non-empty message ``content`` in *value*, converted to ``str``."""
    return [str(msg["content"]) for msg in _dict_messages(value) if msg.get("content")]


def _joined_content(value) -> str:
    """Join the ``content`` of all messages in *value*, treating null/missing as ""."""
    return " ".join(str(msg.get("content") or "") for msg in _dict_messages(value))


def extract_text(record: dict) -> str:
    """Extract all text content from a record regardless of format.

    Collects, in order: ``messages`` contents, ``input.messages`` contents,
    ``preferred_output`` / ``non_preferred_output`` contents, then every other
    top-level field whose value is a str, int or float. Malformed pieces
    (an ``input`` that is not an object, messages that are not objects, null
    contents) are skipped instead of raising.

    Returns:
        The collected pieces joined by single spaces ("" when there are none).
    """
    texts = _message_texts(record.get("messages"))
    prompt = record.get("input")
    # Only an object-valued "input" can carry messages; on a plain string the
    # `in` operator would be a substring test and indexing it would raise.
    if isinstance(prompt, dict):
        texts.extend(_message_texts(prompt.get("messages")))
    for field in _PAIR_FIELDS:
        texts.extend(_message_texts(record.get(field)))
    # Include any extra scalar fields beyond the structural ones.
    for field, value in record.items():
        if field not in _STRUCTURAL_FIELDS and isinstance(value, (str, int, float)):
            texts.append(str(value))
    return " ".join(texts)


def _load_records(filepath: str) -> tuple:
    """Read *filepath* as JSONL and return ``(records, parse_errors)``.

    Blank lines are ignored. A line counts as a parse error when it is not
    valid JSON or when it decodes to something other than a JSON object.
    """
    records = []
    parse_errors = 0
    # utf-8-sig reads plain UTF-8 unchanged and drops a leading BOM, which
    # would otherwise make json.loads reject the first record.
    with open(filepath, "r", encoding="utf-8-sig") as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                parse_errors += 1
                continue
            if isinstance(obj, dict):
                records.append(obj)
            else:
                parse_errors += 1
    return records, parse_errors


def _detect_format(first: dict) -> str:
    """Classify the dataset as "DPO", "RFT", "SFT" or "unknown" from one record."""
    if "input" in first and "preferred_output" in first:
        return "DPO"
    if "messages" not in first:
        return "unknown"
    msgs = first["messages"]
    extra_fields = set(first) - {"messages"}
    last = msgs[-1] if isinstance(msgs, list) and msgs else None
    last_role = last.get("role") if isinstance(last, dict) else None
    # Extra top-level fields plus a conversation ending on the user turn is
    # treated as RFT (the extra fields are reported as grader fields).
    if extra_fields and last_role == "user":
        return "RFT"
    return "SFT"


def _print_sft_stats(records: list) -> None:
    """Print the per-role message counts and how many records have a system message."""
    role_counts = Counter()
    has_system = 0
    for record in records:
        msgs = _dict_messages(record.get("messages"))
        role_counts.update(msg.get("role", "unknown") for msg in msgs)
        if any(msg.get("role") == "system" for msg in msgs):
            has_system += 1
    print("\nRole Distribution:")
    for role, count in role_counts.most_common():
        print(f" {role}: {count}")
    print(f"\nRecords with system message: {has_system}/{len(records)}")


def _print_dpo_stats(records: list) -> None:
    """Print the average estimated tokens of preferred and non-preferred outputs."""
    pref_lens = [
        estimate_tokens(_joined_content(r.get("preferred_output"))) for r in records
    ]
    non_pref_lens = [
        estimate_tokens(_joined_content(r.get("non_preferred_output"))) for r in records
    ]
    print(f"\nPreferred output avg tokens: {sum(pref_lens)/len(pref_lens):,.0f}")
    print(f"Non-preferred output avg tokens: {sum(non_pref_lens)/len(non_pref_lens):,.0f}")


def _print_rft_stats(records: list) -> None:
    """Print which non-``messages`` fields occur and how varied their values are."""
    grader_field_counts = Counter()
    grader_values = []
    for record in records:
        # Sorted so that ties in most_common() print in a stable order instead
        # of depending on set iteration order.
        extra = sorted(set(record) - {"messages"})
        grader_field_counts.update(extra)
        grader_values.extend(str(record[field]) for field in extra)
    unique = len(set(grader_values))
    avg_val_len = (
        sum(len(v) for v in grader_values) / len(grader_values) if grader_values else 0
    )
    print("\nGrader fields found:")
    for field, count in grader_field_counts.most_common():
        print(f" • '{field}' — in {count}/{len(records)} records")
    print(f"Unique grader values: {unique}/{len(grader_values)}")
    print(f"Avg grader value length: {avg_val_len:.0f} chars")


def _print_size_guidance(count: int) -> None:
    """Print a short recommendation based on the number of records."""
    print("\n📊 Dataset size guidance:")
    if count < 50:
        print(f" ⚠️ Very small dataset ({count} records). May only learn format, not domain knowledge.")
    elif count < 200:
        print(" ⚠️ Small dataset. Good for initial experiments — evaluate results and add more data if needed.")
    elif count <= 500:
        print(" ✅ Sweet spot for getting started (200-500). Evaluate results to decide if you need more.")
    elif count <= 2000:
        print(" ✅ Good dataset size. Watch for diminishing returns — check if quality beats quantity.")
    else:
        print(f" ⚠️ Large dataset ({count:,}). Larger isn't always better — especially for OSS models where 335-500 examples outperformed 4K.")


def data_stats(filepath: str) -> None:
    """Print statistics for the fine-tuning JSONL file at *filepath*.

    The report contains the detected format (taken from the first valid
    record), record and parse-error counts, approximate token totals, a
    format-specific section and dataset size guidance.

    Args:
        filepath: Path to a JSONL file, UTF-8 with or without a BOM.

    Raises:
        SystemExit: With status 1 when the file contains no valid records.
        OSError: When the file cannot be opened.
    """
    records, parse_errors = _load_records(filepath)
    if not records:
        print(f"No valid records found in {filepath}")
        sys.exit(1)

    format_type = _detect_format(records[0])

    token_counts = [estimate_tokens(extract_text(r)) for r in records]
    total_tokens = sum(token_counts)
    avg_tokens = total_tokens / len(records)
    min_tokens = min(token_counts)
    max_tokens = max(token_counts)

    rule = "=" * 60
    print(f"\n{rule}")
    print(f"Dataset Statistics: {filepath}")
    print(rule)
    print(f"Format: {format_type}")
    print(f"Total records: {len(records)}")
    print(f"Parse errors: {parse_errors}")
    print()
    print("Token Estimates (approx):")
    print(f" Total: {total_tokens:,}")
    print(f" Average/record: {avg_tokens:,.0f}")
    print(f" Min: {min_tokens:,}")
    print(f" Max: {max_tokens:,}")

    if format_type == "SFT":
        _print_sft_stats(records)
    elif format_type == "DPO":
        _print_dpo_stats(records)
    elif format_type == "RFT":
        _print_rft_stats(records)

    _print_size_guidance(len(records))


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python data_stats.py <path-to-jsonl>")
        sys.exit(1)
    try:
        data_stats(sys.argv[1])
    except (OSError, UnicodeDecodeError) as err:
        # Missing/unreadable/non-UTF-8 file: report it instead of a traceback.
        print(f"Cannot read {sys.argv[1]}: {err}", file=sys.stderr)
        sys.exit(1)