Loading source
Pulling the file list, source metadata, and syntax-aware rendering for this listing.
Source from repo
Build and deploy AI applications on Azure AI Foundry using Microsoft's model catalog and AI services
Files
Skill
Size
Entrypoint
Format
Open file
Syntax-highlighted preview of this file as included in the skill package.
finetuning/scripts/validate/data_stats.py
#!/usr/bin/env python3
"""Compute dataset statistics for any fine-tuning JSONL file.

Adapted from foundry-ft agent. Auto-detects SFT/DPO/RFT format (from the first
record only) and reports approximate token counts plus format-specific
details: role distribution for SFT, preferred/non-preferred output lengths for
DPO, and grader-field summaries for RFT. It also prints dataset-size guidance.

Token counts are a character-based heuristic, not real tokenizer output.
"""
import json
import sys
from collections import Counter

try:
    sys.stdout.reconfigure(encoding="utf-8")
    sys.stderr.reconfigure(encoding="utf-8")
except (AttributeError, OSError):
    pass  # Stream not reconfigurable (older Python or non-tty); default encoding is fine

# Top-level keys that hold message lists rather than plain scalar values.
_STRUCTURAL_FIELDS = frozenset(
    {"messages", "input", "preferred_output", "non_preferred_output"}
)


def estimate_tokens(text: str) -> int:
    """Rough token estimate: ~4 chars per token for English text.

    Never returns less than 1, so an empty string still counts as one token.
    """
    return max(1, len(text) // 4)


def _dict_messages(value) -> list:
    """Return the dict entries of *value* when it is a message list, else []."""
    if not isinstance(value, (list, tuple)):
        return []
    return [msg for msg in value if isinstance(msg, dict)]


def _message_texts(value) -> list:
    """Return each message's truthy ``content`` as a string, in order."""
    return [str(msg["content"]) for msg in _dict_messages(value) if msg.get("content")]


def _joined_content(value) -> str:
    """Space-join every message's ``content``; missing/null content becomes ''."""
    return " ".join(str(msg.get("content") or "") for msg in _dict_messages(value))


def extract_text(record: dict) -> str:
    """Extract all text content from a record regardless of format.

    Collects, in this order: ``messages`` contents, ``input.messages``
    contents, ``preferred_output`` / ``non_preferred_output`` contents, then
    every other top-level field whose value is a str, int or float.
    Malformed parts (a non-dict ``input``, a non-list message container,
    non-dict messages, empty/null content) are skipped instead of raising.

    Args:
        record: One parsed JSONL object.

    Returns:
        The collected pieces joined by single spaces ('' if nothing is found).
    """
    texts = _message_texts(record.get("messages"))

    # ``input`` is only inspected when it is an object; on a plain string the
    # ``in`` test would be a substring match and the lookup would fail.
    input_block = record.get("input")
    if isinstance(input_block, dict):
        texts.extend(_message_texts(input_block.get("messages")))

    for field in ("preferred_output", "non_preferred_output"):
        texts.extend(_message_texts(record.get(field)))

    # Include any extra fields beyond messages/input/preferred_output/non_preferred_output
    for field, value in record.items():
        if field not in _STRUCTURAL_FIELDS and isinstance(value, (str, int, float)):
            texts.append(str(value))
    return " ".join(texts)


def _load_records(filepath: str) -> tuple:
    """Read a JSONL file and return ``(records, rejected_line_count)``.

    Blank lines are ignored. A line is rejected when it is not valid JSON or
    when it parses to something other than a JSON object, because every later
    step needs dict records.
    """
    records = []
    rejected = 0
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                parsed = json.loads(line)
            except json.JSONDecodeError:
                rejected += 1
                continue
            if isinstance(parsed, dict):
                records.append(parsed)
            else:
                rejected += 1
    return records, rejected


def _detect_format(first: dict) -> str:
    """Classify the dataset as 'DPO', 'RFT', 'SFT' or 'unknown' from one record."""
    if "input" in first and "preferred_output" in first:
        return "DPO"
    if "messages" not in first:
        return "unknown"

    msgs = first["messages"]
    extra_fields = set(first) - {"messages"}
    last = msgs[-1] if isinstance(msgs, list) and msgs else None
    last_role = last.get("role") if isinstance(last, dict) else None
    # RFT here means: conversation ends on a user turn and the record carries
    # extra top-level fields (reported later as grader fields).
    if extra_fields and last_role == "user":
        return "RFT"
    return "SFT"


def _report_sft(records: list) -> None:
    """Print role counts and how many records contain a system message."""
    role_counts = Counter()
    has_system = 0
    for r in records:
        roles = [msg.get("role", "unknown") for msg in _dict_messages(r.get("messages"))]
        role_counts.update(roles)
        if "system" in roles:
            has_system += 1
    print("\nRole Distribution:")
    for role, count in role_counts.most_common():
        print(f" {role}: {count}")
    print(f"\nRecords with system message: {has_system}/{len(records)}")


def _report_dpo(records: list) -> None:
    """Print average estimated tokens of preferred vs non-preferred outputs."""
    pref_lens = [estimate_tokens(_joined_content(r.get("preferred_output"))) for r in records]
    non_pref_lens = [
        estimate_tokens(_joined_content(r.get("non_preferred_output"))) for r in records
    ]
    print(f"\nPreferred output avg tokens: {sum(pref_lens)/len(pref_lens):,.0f}")
    print(f"Non-preferred output avg tokens: {sum(non_pref_lens)/len(non_pref_lens):,.0f}")


def _report_rft(records: list) -> None:
    """Print which non-``messages`` fields appear and how varied their values are."""
    grader_field_counts = Counter()
    grader_values = []
    for r in records:
        extra = set(r) - {"messages"}
        grader_field_counts.update(extra)
        grader_values.extend(str(r[field]) for field in sorted(extra))
    unique = len(set(grader_values))
    avg_val_len = sum(len(v) for v in grader_values) / len(grader_values) if grader_values else 0
    print("\nGrader fields found:")
    for field, count in grader_field_counts.most_common():
        print(f" • '{field}' — in {count}/{len(records)} records")
    print(f"Unique grader values: {unique}/{len(grader_values)}")
    print(f"Avg grader value length: {avg_val_len:.0f} chars")


def _print_size_guidance(n_records: int) -> None:
    """Print a one-line recommendation based on the number of records."""
    print("\n📊 Dataset size guidance:")
    if n_records < 50:
        print(f" ⚠️ Very small dataset ({n_records} records). May only learn format, not domain knowledge.")
    elif n_records < 200:
        print(" ⚠️ Small dataset. Good for initial experiments — evaluate results and add more data if needed.")
    elif n_records <= 500:
        print(" ✅ Sweet spot for getting started (200-500). Evaluate results to decide if you need more.")
    elif n_records <= 2000:
        print(" ✅ Good dataset size. Watch for diminishing returns — check if quality beats quantity.")
    else:
        print(f" ⚠️ Large dataset ({n_records:,}). Larger isn't always better — especially for OSS models where 335-500 examples outperformed 4K.")


def data_stats(filepath: str) -> None:
    """Print statistics for the JSONL fine-tuning dataset at *filepath*.

    The format is detected from the first valid record only; the matching
    format-specific section is then printed for the whole file. Lines that
    are not valid JSON, or are valid JSON but not objects, are counted under
    "Parse errors" and excluded from all statistics.

    Args:
        filepath: Path to a UTF-8 encoded JSONL file.

    Raises:
        OSError: If the file cannot be opened.
        SystemExit: With status 1 when the file holds no usable records.
    """
    records, parse_errors = _load_records(filepath)

    if not records:
        print(f"No valid records found in {filepath}")
        sys.exit(1)

    # Detect format
    format_type = _detect_format(records[0])

    # Compute stats
    token_counts = [estimate_tokens(extract_text(r)) for r in records]
    total_tokens = sum(token_counts)
    avg_tokens = total_tokens / len(records)
    min_tokens = min(token_counts)
    max_tokens = max(token_counts)

    print(f"\n{'='*60}")
    print(f"Dataset Statistics: {filepath}")
    print(f"{'='*60}")
    print(f"Format: {format_type}")
    print(f"Total records: {len(records)}")
    print(f"Parse errors: {parse_errors}")
    print("")
    print("Token Estimates (approx):")
    print(f" Total: {total_tokens:,}")
    print(f" Average/record: {avg_tokens:,.0f}")
    print(f" Min: {min_tokens:,}")
    print(f" Max: {max_tokens:,}")

    if format_type == "SFT":
        _report_sft(records)
    elif format_type == "DPO":
        _report_dpo(records)
    elif format_type == "RFT":
        _report_rft(records)

    # Dataset size guidance
    _print_size_guidance(len(records))


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python data_stats.py <path-to-jsonl>")
        sys.exit(1)
    try:
        data_stats(sys.argv[1])
    except (OSError, UnicodeDecodeError) as exc:
        # Missing/unreadable/non-UTF-8 file: report briefly instead of a traceback.
        print(f"Cannot read {sys.argv[1]}: {exc}", file=sys.stderr)
        sys.exit(1)