Source from repo

Microsoft Foundry Skill

Deploy, evaluate, and manage AI agents end-to-end on Microsoft Azure AI Foundry

microsoftGitHub microsoftOfficialSource repo Original GitHub link Publisher page

Files

151

Skill

n/a

Size

940.9 KB

Entrypoint

SKILL.md

Format

git-repo

Open file

finetuning/scripts/check_training.py

Syntax-highlighted preview of this file as included in the skill package.

Rendered Source

code · 187 lines · Free

finetuning/scripts/check_training.py

1# /// script
2# dependencies = [
3#   "openai>=1.0",
4#   "azure-identity",
5#   "azure-ai-projects",
6# ]
7# ///
8"""
9check_training.py — Analyze training curves, detect overfitting, list checkpoints.
10 
11Usage:
12  python check_training.py --job-id ftjob-abc123
13  python check_training.py --job-id ftjob-abc123 --download-csv results.csv
14  python check_training.py --base-url https://<resource>.services.ai.azure.com/api/projects/<project>/openai/v1/ --api-key KEY --job-id ftjob-abc123
15"""
16 
17import csv
18import io
19import os
20import sys
21 
22try:
23    sys.stdout.reconfigure(encoding="utf-8")
24    sys.stderr.reconfigure(encoding="utf-8")
25except (AttributeError, OSError):
26    pass  # Stream not reconfigurable (older Python or non-tty); default encoding is fine
27sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
28from common import HelpOnErrorParser, get_clients
29 
30 
def _cell_text(row, column):
    """Return the stripped text stored under *column* in *row*, or "" if empty.

    csv.DictReader stores None for cells missing from a short row, so the
    value is normalised here instead of relying on a dict default.
    """
    value = row.get(column)
    return value.strip() if value else ""


def _cell_float(row, column):
    """Return the cell under *column* parsed as a float, or None when blank."""
    text = _cell_text(row, column)
    return float(text) if text else None


def _cell_step(row):
    """Return the row's "step" cell as an int, defaulting to 0 when blank."""
    text = _cell_text(row, "step")
    if not text:
        return 0
    try:
        return int(text)
    except ValueError:
        # Tolerate float-formatted step numbers such as "10.0".
        return int(float(text))


def analyze_job(client, job_id, download_csv=None):
    """Pull training results, analyze curves, detect overfitting.

    Retrieves the job, prints its summary, downloads the first result file,
    parses it as CSV, prints a table of every row that carries a validation
    loss, reports whether the curve looks overfit, lists the job's
    checkpoints, and (when overfitting was detected) prints a recommendation.

    Args:
        client: Object exposing ``fine_tuning.jobs.retrieve``,
            ``fine_tuning.jobs.checkpoints.list`` and ``files.content`` as
            they are called below.
        job_id: Identifier passed to the job and checkpoint lookups.
        download_csv: Optional path; when given, the raw result-file bytes
            are written there before parsing.

    Returns:
        None. All findings are printed to stdout.
    """
    job = client.fine_tuning.jobs.retrieve(job_id)

    print(f"Job: {job.id}")
    print(f"  Model: {job.model}")
    print(f"  Status: {job.status}")
    print(f"  Fine-tuned model: {job.fine_tuned_model}")

    if job.hyperparameters:
        hp = job.hyperparameters
        print(f"  Epochs: {getattr(hp, 'n_epochs', 'N/A')}")
        print(f"  LR multiplier: {getattr(hp, 'learning_rate_multiplier', 'N/A')}")
        print(f"  Batch size: {getattr(hp, 'batch_size', 'N/A')}")

    # Allow analysis while still running if result files exist
    if job.status not in ("succeeded", "running"):
        print(f"\n  Job status is '{job.status}'. Cannot analyze curves.")
        return

    if not job.result_files:
        if job.status == "running":
            print("\n  Job is still running and no result files available yet. Check back later.")
        else:
            print("\n  No result files available.")
        return

    # Download results CSV
    csv_data = client.files.content(job.result_files[0]).read()

    if download_csv:
        with open(download_csv, "wb") as f:
            f.write(csv_data)
        print(f"\n  Results CSV saved to {download_csv}")

    # "utf-8-sig" drops a leading byte-order mark if one is present; with plain
    # "utf-8" the mark would stay glued to the first header name and the
    # "step" column would never be found.
    rows = list(csv.DictReader(io.StringIO(csv_data.decode("utf-8-sig"))))

    if job.status == "running":
        print(f"\n  ⚡ Job still running — showing partial results ({len(rows)} steps so far)")

    # Keep only rows that report a validation loss; the first non-blank
    # candidate column wins.
    val_points = []
    for row in rows:
        val_loss = None
        for col in ("valid_loss", "full_valid_loss", "eval_loss"):
            val_loss = _cell_float(row, col)
            if val_loss is not None:
                break
        if val_loss is not None:
            val_points.append((_cell_step(row), val_loss, _cell_float(row, "train_loss")))

    if not val_points:
        print("\n  No validation loss data found in results CSV.")
        return

    # Find best validation checkpoint
    best_step, best_val, _ = min(val_points, key=lambda point: point[1])
    final_step, final_val, final_train = val_points[-1]

    print("\n  Training Curve Analysis:")
    print(f"  {'Step':>6} {'Val Loss':>10} {'Train Loss':>12} {'Ratio':>8}")
    print(f"  {'─'*6} {'─'*10} {'─'*12} {'─'*8}")
    for step, val, train in val_points:
        train_str = f"{train:12.4f}" if train is not None else "         N/A"
        # Show N/A rather than a made-up 0.00 when no ratio can be computed.
        ratio_str = f"{val / train:>8.2f}" if train and train > 0 else f"{'N/A':>8}"
        marker = " ← best" if step == best_step else ""
        print(f"  {step:>6} {val:>10.4f} {train_str} {ratio_str}{marker}")

    print(f"\n  Best val_loss: {best_val:.4f} at step {best_step}")
    print(f"  Final val_loss: {final_val:.4f} at step {final_step}")

    # Overfitting detection: final validation loss more than 20% above the best.
    overfit = best_val > 0 and final_val > best_val * 1.2
    if overfit:
        pct = (final_val - best_val) / best_val * 100
        print(f"\n  ⚠️  OVERFITTING DETECTED: Final val_loss is {pct:.0f}% above best.")
    elif best_val == 0 and final_val > 0:
        print(f"\n  ⚠️  Best val_loss was 0.0; final val_loss is {final_val:.4f} — possible overfitting from a near-perfect early checkpoint.")
    elif final_train and final_val / final_train > 1.5:
        ratio = final_val / final_train
        print(f"\n  ⚠️  MODERATE OVERFITTING: val/train ratio = {ratio:.2f}")
    else:
        print("\n  ✅ Training looks healthy. No significant overfitting detected.")

    # List checkpoints and recommend best deployable one
    print("\n  Checkpoints:")
    available_checkpoints = []
    try:
        cps = client.fine_tuning.jobs.checkpoints.list(job_id)
        if cps.data:
            for cp in sorted(cps.data, key=lambda c: c.step_number):
                vl = cp.metrics.valid_loss if cp.metrics and cp.metrics.valid_loss is not None else None
                model_id = cp.fine_tuned_model_checkpoint or "N/A"
                vl_str = f"{vl:.4f}" if vl is not None else "N/A"
                available_checkpoints.append((cp.step_number, vl, model_id))
                print(f"    Step {cp.step_number}: val_loss={vl_str}, model={model_id}")
        else:
            print("    No checkpoints available.")
    except Exception as e:
        # Best effort: a failed checkpoint lookup must not discard the
        # curve analysis already printed above.
        print(f"    Could not retrieve checkpoints: {e}")

    if not overfit:
        return

    if not available_checkpoints:
        # Nothing deployable to fall back to, so the only advice is to retrain.
        print(f"\n  Recommendation: Retrain with fewer epochs (best val_loss was at step {best_step}).")
        return

    # Recommend the best deployable checkpoint
    scored_cps = [cp for cp in available_checkpoints if cp[1] is not None]
    if scored_cps:
        # Use checkpoint with lowest val_loss
        best_cp = min(scored_cps, key=lambda cp: cp[1])
    else:
        # No val_loss on checkpoints — pick the one nearest to (but not
        # exceeding) best_step, else the earliest checkpoint.
        earlier_cps = [cp for cp in available_checkpoints if cp[0] <= best_step]
        if earlier_cps:
            best_cp = max(earlier_cps, key=lambda cp: cp[0])
        else:
            best_cp = available_checkpoints[0]

    cp_step, cp_vl, cp_model = best_cp
    vl_info = f" (val_loss={cp_vl:.4f})" if cp_vl is not None else ""
    print(f"\n  🎯 Recommended checkpoint: step {cp_step}{vl_info}")
    print(f"     Model ID: {cp_model}")
    print(f"     (Best val_loss was at step {best_step}, nearest deployable checkpoint is step {cp_step})")
    print("     Alternatively, retrain with fewer epochs to avoid overfitting.")
163 
164 
def main():
    """Parse the command line, build a client, and analyze the requested job.

    Connection options default to environment variables; ``--job-id`` is
    required. Only the first of the two values returned by ``get_clients``
    is used here.
    """
    env = os.environ
    cli = HelpOnErrorParser(description="Analyze fine-tuning training curves")
    cli.add_argument(
        "--base-url",
        default=env.get("OPENAI_BASE_URL"),
        help="Project /v1/ URL (preferred)",
    )
    cli.add_argument(
        "--endpoint",
        default=env.get("AZURE_OPENAI_ENDPOINT"),
        help="Azure OpenAI endpoint (fallback)",
    )
    cli.add_argument(
        "--project-endpoint",
        default=env.get("AZURE_AI_PROJECT_ENDPOINT"),
        help="Azure AI project endpoint (Foundry SDK)",
    )
    cli.add_argument("--api-key", default=env.get("AZURE_OPENAI_API_KEY"))
    cli.add_argument("--job-id", required=True, help="Fine-tuning job ID")
    cli.add_argument("--download-csv", help="Save results CSV to this path")
    opts = cli.parse_args()

    connection = {
        "base_url": opts.base_url,
        "azure_endpoint": opts.endpoint,
        "project_endpoint": opts.project_endpoint,
        "api_key": opts.api_key,
    }
    client, _unused = get_clients(**connection)
    analyze_job(client, opts.job_id, opts.download_csv)
183 
184 
185if __name__ == "__main__":
186    main()
187

Loading source

Preparing the source view

Pulling the file list, source metadata, and syntax-aware rendering for this listing.

Marketplace

Source from repo

Microsoft Foundry Skill

Deploy, evaluate, and manage AI agents end-to-end on Microsoft Azure AI Foundry

microsoftGitHub microsoftOfficialSource repo Original GitHub link Publisher page

Files

151

Skill

n/a

Size

940.9 KB

Entrypoint

SKILL.md

Format

git-repo

Open file

finetuning/scripts/check_training.py

Syntax-highlighted preview of this file as included in the skill package.

Rendered Source

code · 187 lines · Free

finetuning/scripts/check_training.py

1# /// script
2# dependencies = [
3#   "openai>=1.0",
4#   "azure-identity",
5#   "azure-ai-projects",
6# ]
7# ///
8"""
9check_training.py — Analyze training curves, detect overfitting, list checkpoints.
10 
11Usage:
12  python check_training.py --job-id ftjob-abc123
13  python check_training.py --job-id ftjob-abc123 --download-csv results.csv
14  python check_training.py --base-url https://<resource>.services.ai.azure.com/api/projects/<project>/openai/v1/ --api-key KEY --job-id ftjob-abc123
15"""
16 
17import csv
18import io
19import os
20import sys
21 
22try:
23    sys.stdout.reconfigure(encoding="utf-8")
24    sys.stderr.reconfigure(encoding="utf-8")
25except (AttributeError, OSError):
26    pass  # Stream not reconfigurable (older Python or non-tty); default encoding is fine
27sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
28from common import HelpOnErrorParser, get_clients
29 
30 
def _cell_text(row, column):
    """Return the stripped text stored under *column* in *row*, or "" if empty.

    csv.DictReader stores None for cells missing from a short row, so the
    value is normalised here instead of relying on a dict default.
    """
    value = row.get(column)
    return value.strip() if value else ""


def _cell_float(row, column):
    """Return the cell under *column* parsed as a float, or None when blank."""
    text = _cell_text(row, column)
    return float(text) if text else None


def _cell_step(row):
    """Return the row's "step" cell as an int, defaulting to 0 when blank."""
    text = _cell_text(row, "step")
    if not text:
        return 0
    try:
        return int(text)
    except ValueError:
        # Tolerate float-formatted step numbers such as "10.0".
        return int(float(text))


def analyze_job(client, job_id, download_csv=None):
    """Pull training results, analyze curves, detect overfitting.

    Retrieves the job, prints its summary, downloads the first result file,
    parses it as CSV, prints a table of every row that carries a validation
    loss, reports whether the curve looks overfit, lists the job's
    checkpoints, and (when overfitting was detected) prints a recommendation.

    Args:
        client: Object exposing ``fine_tuning.jobs.retrieve``,
            ``fine_tuning.jobs.checkpoints.list`` and ``files.content`` as
            they are called below.
        job_id: Identifier passed to the job and checkpoint lookups.
        download_csv: Optional path; when given, the raw result-file bytes
            are written there before parsing.

    Returns:
        None. All findings are printed to stdout.
    """
    job = client.fine_tuning.jobs.retrieve(job_id)

    print(f"Job: {job.id}")
    print(f"  Model: {job.model}")
    print(f"  Status: {job.status}")
    print(f"  Fine-tuned model: {job.fine_tuned_model}")

    if job.hyperparameters:
        hp = job.hyperparameters
        print(f"  Epochs: {getattr(hp, 'n_epochs', 'N/A')}")
        print(f"  LR multiplier: {getattr(hp, 'learning_rate_multiplier', 'N/A')}")
        print(f"  Batch size: {getattr(hp, 'batch_size', 'N/A')}")

    # Allow analysis while still running if result files exist
    if job.status not in ("succeeded", "running"):
        print(f"\n  Job status is '{job.status}'. Cannot analyze curves.")
        return

    if not job.result_files:
        if job.status == "running":
            print("\n  Job is still running and no result files available yet. Check back later.")
        else:
            print("\n  No result files available.")
        return

    # Download results CSV
    csv_data = client.files.content(job.result_files[0]).read()

    if download_csv:
        with open(download_csv, "wb") as f:
            f.write(csv_data)
        print(f"\n  Results CSV saved to {download_csv}")

    # "utf-8-sig" drops a leading byte-order mark if one is present; with plain
    # "utf-8" the mark would stay glued to the first header name and the
    # "step" column would never be found.
    rows = list(csv.DictReader(io.StringIO(csv_data.decode("utf-8-sig"))))

    if job.status == "running":
        print(f"\n  ⚡ Job still running — showing partial results ({len(rows)} steps so far)")

    # Keep only rows that report a validation loss; the first non-blank
    # candidate column wins.
    val_points = []
    for row in rows:
        val_loss = None
        for col in ("valid_loss", "full_valid_loss", "eval_loss"):
            val_loss = _cell_float(row, col)
            if val_loss is not None:
                break
        if val_loss is not None:
            val_points.append((_cell_step(row), val_loss, _cell_float(row, "train_loss")))

    if not val_points:
        print("\n  No validation loss data found in results CSV.")
        return

    # Find best validation checkpoint
    best_step, best_val, _ = min(val_points, key=lambda point: point[1])
    final_step, final_val, final_train = val_points[-1]

    print("\n  Training Curve Analysis:")
    print(f"  {'Step':>6} {'Val Loss':>10} {'Train Loss':>12} {'Ratio':>8}")
    print(f"  {'─'*6} {'─'*10} {'─'*12} {'─'*8}")
    for step, val, train in val_points:
        train_str = f"{train:12.4f}" if train is not None else "         N/A"
        # Show N/A rather than a made-up 0.00 when no ratio can be computed.
        ratio_str = f"{val / train:>8.2f}" if train and train > 0 else f"{'N/A':>8}"
        marker = " ← best" if step == best_step else ""
        print(f"  {step:>6} {val:>10.4f} {train_str} {ratio_str}{marker}")

    print(f"\n  Best val_loss: {best_val:.4f} at step {best_step}")
    print(f"  Final val_loss: {final_val:.4f} at step {final_step}")

    # Overfitting detection: final validation loss more than 20% above the best.
    overfit = best_val > 0 and final_val > best_val * 1.2
    if overfit:
        pct = (final_val - best_val) / best_val * 100
        print(f"\n  ⚠️  OVERFITTING DETECTED: Final val_loss is {pct:.0f}% above best.")
    elif best_val == 0 and final_val > 0:
        print(f"\n  ⚠️  Best val_loss was 0.0; final val_loss is {final_val:.4f} — possible overfitting from a near-perfect early checkpoint.")
    elif final_train and final_val / final_train > 1.5:
        ratio = final_val / final_train
        print(f"\n  ⚠️  MODERATE OVERFITTING: val/train ratio = {ratio:.2f}")
    else:
        print("\n  ✅ Training looks healthy. No significant overfitting detected.")

    # List checkpoints and recommend best deployable one
    print("\n  Checkpoints:")
    available_checkpoints = []
    try:
        cps = client.fine_tuning.jobs.checkpoints.list(job_id)
        if cps.data:
            for cp in sorted(cps.data, key=lambda c: c.step_number):
                vl = cp.metrics.valid_loss if cp.metrics and cp.metrics.valid_loss is not None else None
                model_id = cp.fine_tuned_model_checkpoint or "N/A"
                vl_str = f"{vl:.4f}" if vl is not None else "N/A"
                available_checkpoints.append((cp.step_number, vl, model_id))
                print(f"    Step {cp.step_number}: val_loss={vl_str}, model={model_id}")
        else:
            print("    No checkpoints available.")
    except Exception as e:
        # Best effort: a failed checkpoint lookup must not discard the
        # curve analysis already printed above.
        print(f"    Could not retrieve checkpoints: {e}")

    if not overfit:
        return

    if not available_checkpoints:
        # Nothing deployable to fall back to, so the only advice is to retrain.
        print(f"\n  Recommendation: Retrain with fewer epochs (best val_loss was at step {best_step}).")
        return

    # Recommend the best deployable checkpoint
    scored_cps = [cp for cp in available_checkpoints if cp[1] is not None]
    if scored_cps:
        # Use checkpoint with lowest val_loss
        best_cp = min(scored_cps, key=lambda cp: cp[1])
    else:
        # No val_loss on checkpoints — pick the one nearest to (but not
        # exceeding) best_step, else the earliest checkpoint.
        earlier_cps = [cp for cp in available_checkpoints if cp[0] <= best_step]
        if earlier_cps:
            best_cp = max(earlier_cps, key=lambda cp: cp[0])
        else:
            best_cp = available_checkpoints[0]

    cp_step, cp_vl, cp_model = best_cp
    vl_info = f" (val_loss={cp_vl:.4f})" if cp_vl is not None else ""
    print(f"\n  🎯 Recommended checkpoint: step {cp_step}{vl_info}")
    print(f"     Model ID: {cp_model}")
    print(f"     (Best val_loss was at step {best_step}, nearest deployable checkpoint is step {cp_step})")
    print("     Alternatively, retrain with fewer epochs to avoid overfitting.")
163 
164 
def main():
    """Parse the command line, build a client, and analyze the requested job.

    Connection options default to environment variables; ``--job-id`` is
    required. Only the first of the two values returned by ``get_clients``
    is used here.
    """
    env = os.environ
    cli = HelpOnErrorParser(description="Analyze fine-tuning training curves")
    cli.add_argument(
        "--base-url",
        default=env.get("OPENAI_BASE_URL"),
        help="Project /v1/ URL (preferred)",
    )
    cli.add_argument(
        "--endpoint",
        default=env.get("AZURE_OPENAI_ENDPOINT"),
        help="Azure OpenAI endpoint (fallback)",
    )
    cli.add_argument(
        "--project-endpoint",
        default=env.get("AZURE_AI_PROJECT_ENDPOINT"),
        help="Azure AI project endpoint (Foundry SDK)",
    )
    cli.add_argument("--api-key", default=env.get("AZURE_OPENAI_API_KEY"))
    cli.add_argument("--job-id", required=True, help="Fine-tuning job ID")
    cli.add_argument("--download-csv", help="Save results CSV to this path")
    opts = cli.parse_args()

    connection = {
        "base_url": opts.base_url,
        "azure_endpoint": opts.endpoint,
        "project_endpoint": opts.project_endpoint,
        "api_key": opts.api_key,
    }
    client, _unused = get_clients(**connection)
    analyze_job(client, opts.job_id, opts.download_csv)
183 
184 
185if __name__ == "__main__":
186    main()
187