Source from repo
Microsoft Foundry Skill

Build and deploy AI applications on Azure AI Foundry using Microsoft's model catalog and AI services
microsoftGitHub microsoftOfficialSource repo Original GitHub link Publisher page
Files
155
Skill
n/a
Size
976.3 KB
Entrypoint
SKILL.md
Format
git-repo
Open file
finetuning/scripts/validate/data_stats.py

Syntax-highlighted preview of this file as included in the skill package.
Rendered Source
code157 linesFree
finetuning/scripts/validate/data_stats.py
1#!/usr/bin/env python3
2"""Compute dataset statistics for any fine-tuning JSONL file.
3 
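Adapted from foundry-ft agent. Auto-detects SFT/DPO/RFT format (from the first
record) and reports approximate token counts, per-format statistics (role
distribution, preferred/non-preferred lengths, grader fields), and dataset-size
guidance.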
6"""
7import json
8import sys
9 
# Switch the standard streams to UTF-8 so the non-ASCII glyphs this script
# prints (bullets, dashes, emoji) can be encoded regardless of the console's
# default encoding. Each stream is handled on its own: a failure to
# reconfigure stdout must not prevent stderr from being reconfigured.
for _stream in (sys.stdout, sys.stderr):
    try:
        _stream.reconfigure(encoding="utf-8")
    except (AttributeError, OSError):
        # Stream not reconfigurable (older Python, replaced/None stream, or
        # non-tty); its default encoding is kept.
        pass
del _stream  # don't leak the loop variable into the module namespace
15from collections import Counter
16 
17 
def estimate_tokens(text: str) -> int:
    """Approximate the token count of *text* using a 4-characters-per-token rule.

    The estimate is ``len(text) // 4`` and is never reported as less than 1,
    so even an empty string counts as a single token.
    """
    approx = len(text) // 4
    if approx < 1:
        return 1
    return approx
21 
22 
def _message_contents(messages) -> list:
    """Return the stringified, truthy ``content`` of every dict in *messages*.

    Anything that is not a list/tuple of message dicts (``None``, a string,
    a malformed entry) contributes nothing instead of raising.
    """
    if not isinstance(messages, (list, tuple)):
        return []
    return [
        str(msg["content"])
        for msg in messages
        if isinstance(msg, dict) and msg.get("content")
    ]


def extract_text(record: dict) -> str:
    """Extract all text content from a record regardless of format.

    Text is gathered, in this order, from:

    1. ``record["messages"]``
    2. ``record["input"]["messages"]`` (only when ``input`` is a dict)
    3. ``record["preferred_output"]`` and ``record["non_preferred_output"]``
    4. any other top-level field whose value is a ``str``, ``int`` or ``float``

    Message content that is missing, ``None`` or empty is skipped; other
    content is converted with ``str()``. Malformed structures (null or
    non-list message collections, non-dict messages, non-dict ``input``) are
    ignored rather than raising.

    Args:
        record: One parsed JSONL record.

    Returns:
        The collected pieces joined by single spaces ("" if nothing was found).
    """
    texts = []
    texts.extend(_message_contents(record.get("messages")))

    # Only a mapping can carry nested messages; a plain-string ``input``
    # must not be probed with ``in`` (that would be a substring test).
    nested_input = record.get("input")
    if isinstance(nested_input, dict):
        texts.extend(_message_contents(nested_input.get("messages")))

    for field in ("preferred_output", "non_preferred_output"):
        texts.extend(_message_contents(record.get(field)))

    # Include any scalar extra fields beyond the structural ones handled above.
    known_structural = {"messages", "input", "preferred_output", "non_preferred_output"}
    for field, value in record.items():
        if field not in known_structural and isinstance(value, (str, int, float)):
            texts.append(str(value))
    return " ".join(texts)
45 
46 
def _message_list(value) -> list:
    """Return *value* if it is a list of messages, otherwise an empty list."""
    return value if isinstance(value, list) else []


def _content_text(msg) -> str:
    """Return a message's content as text ('' for null/missing content or a non-dict message)."""
    if not isinstance(msg, dict):
        return ""
    content = msg.get("content")
    return "" if content is None else str(content)


def data_stats(filepath: str) -> None:
    """Print summary statistics for a fine-tuning JSONL file.

    Every non-blank line is parsed as JSON. Lines that fail to parse, or that
    parse to something other than a JSON object, are counted as parse errors
    and excluded. The dataset format is detected from the first valid record:

    * ``DPO`` - has both ``input`` and ``preferred_output``
    * ``RFT`` - has ``messages`` plus extra fields, and the last message's
      role is ``user``
    * ``SFT`` - any other record with ``messages``
    * ``unknown`` - none of the above

    The report contains approximate token totals (via ``estimate_tokens`` and
    ``extract_text``), format-specific details, and dataset-size guidance.

    Args:
        filepath: Path to a UTF-8 encoded JSONL file.

    Raises:
        SystemExit: With code 1 when the file holds no valid records.
        OSError: Propagated from ``open`` if the file cannot be read.
    """
    records = []
    parse_errors = 0

    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                parsed = json.loads(line)
            except json.JSONDecodeError:
                parse_errors += 1
                continue
            if isinstance(parsed, dict):
                records.append(parsed)
            else:
                # Valid JSON but not an object (e.g. `42` or `[...]`): it cannot
                # be a training record and would crash the key lookups below.
                parse_errors += 1

    if not records:
        print(f"No valid records found in {filepath}")
        sys.exit(1)

    # Detect format from the first record only.
    first = records[0]
    format_type = "unknown"
    if "input" in first and "preferred_output" in first:
        format_type = "DPO"
    elif "messages" in first:
        msgs = _message_list(first["messages"])
        last_msg = msgs[-1] if msgs else None
        last_role = last_msg.get("role") if isinstance(last_msg, dict) else None
        extra_fields = set(first.keys()) - {"messages"}
        format_type = "RFT" if extra_fields and last_role == "user" else "SFT"

    # Compute stats
    token_counts = [estimate_tokens(extract_text(r)) for r in records]
    total_tokens = sum(token_counts)
    avg_tokens = total_tokens / len(records)
    min_tokens = min(token_counts)
    max_tokens = max(token_counts)

    print(f"\n{'='*60}")
    print(f"Dataset Statistics: {filepath}")
    print(f"{'='*60}")
    print(f"Format:           {format_type}")
    print(f"Total records:    {len(records)}")
    print(f"Parse errors:     {parse_errors}")
    print("")
    print("Token Estimates (approx):")
    print(f"  Total:          {total_tokens:,}")
    print(f"  Average/record: {avg_tokens:,.0f}")
    print(f"  Min:            {min_tokens:,}")
    print(f"  Max:            {max_tokens:,}")

    if format_type == "SFT":
        role_counts = Counter()
        has_system = 0
        for r in records:
            # Null `messages` or non-dict entries are tolerated; a malformed
            # entry is tallied under "unknown", like a message without a role.
            roles = [
                m.get("role", "unknown") if isinstance(m, dict) else "unknown"
                for m in _message_list(r.get("messages"))
            ]
            role_counts.update(roles)
            if "system" in roles:
                has_system += 1
        print("\nRole Distribution:")
        for role, count in role_counts.most_common():
            print(f"  {role}: {count}")

        print(f"\nRecords with system message: {has_system}/{len(records)}")

    elif format_type == "DPO":
        pref_lens = []
        non_pref_lens = []
        for r in records:
            # `_content_text` maps an explicit null content to "" so that
            # str.join never receives None.
            pref_text = " ".join(_content_text(m) for m in _message_list(r.get("preferred_output")))
            non_pref_text = " ".join(_content_text(m) for m in _message_list(r.get("non_preferred_output")))
            pref_lens.append(estimate_tokens(pref_text))
            non_pref_lens.append(estimate_tokens(non_pref_text))
        print(f"\nPreferred output avg tokens:     {sum(pref_lens)/len(pref_lens):,.0f}")
        print(f"Non-preferred output avg tokens: {sum(non_pref_lens)/len(non_pref_lens):,.0f}")

    elif format_type == "RFT":
        # Every top-level field other than `messages` is treated as grader data.
        grader_field_counts = Counter()
        grader_values = []
        for r in records:
            extra = set(r.keys()) - {"messages"}
            grader_field_counts.update(extra)
            for field in sorted(extra):
                grader_values.append(str(r[field]))
        unique = len(set(grader_values))
        avg_val_len = sum(len(v) for v in grader_values) / len(grader_values) if grader_values else 0
        print("\nGrader fields found:")
        for field, count in grader_field_counts.most_common():
            print(f"  • '{field}' — in {count}/{len(records)} records")
        print(f"Unique grader values: {unique}/{len(grader_values)}")
        print(f"Avg grader value length: {avg_val_len:.0f} chars")

    # Dataset size guidance
    print("\n📊 Dataset size guidance:")
    if len(records) < 50:
        print(f"  ⚠️ Very small dataset ({len(records)} records). May only learn format, not domain knowledge.")
    elif len(records) < 200:
        print("  ⚠️ Small dataset. Good for initial experiments — evaluate results and add more data if needed.")
    elif len(records) <= 500:
        print("  ✅ Sweet spot for getting started (200-500). Evaluate results to decide if you need more.")
    elif len(records) <= 2000:
        print("  ✅ Good dataset size. Watch for diminishing returns — check if quality beats quantity.")
    else:
        print(f"  ⚠️ Large dataset ({len(records):,}). Larger isn't always better — especially for OSS models where 335-500 examples outperformed 4K.")
150 
151 
# Script entry point: expects exactly one argument, the JSONL file to analyse.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    if len(cli_args) == 1:
        data_stats(cli_args[0])
    else:
        print("Usage: python data_stats.py <path-to-jsonl>")
        sys.exit(1)
157
Preparing the source view

Microsoft Foundry Skill

finetuning/scripts/validate/data_stats.py