Source from repo

Microsoft Foundry Skill

Build and deploy AI applications on Azure AI Foundry using Microsoft's model catalog and AI services

microsoftGitHub microsoftOfficialSource repo Original GitHub link Publisher page

Files

155

Skill

n/a

Size

976.3 KB

Entrypoint

SKILL.md

Format

git-repo

Open file

finetuning/scripts/check_training.py

Syntax-highlighted preview of this file as included in the skill package.

Rendered Source

code · 187 lines · Free

finetuning/scripts/check_training.py

1# /// script
2# dependencies = [
3#   "openai>=1.0",
4#   "azure-identity",
5#   "azure-ai-projects",
6# ]
7# ///
8"""
9check_training.py — Analyze training curves, detect overfitting, list checkpoints.
10 
11Usage:
12  python check_training.py --job-id ftjob-abc123
13  python check_training.py --job-id ftjob-abc123 --download-csv results.csv
14  python check_training.py --base-url https://<resource>.services.ai.azure.com/api/projects/<project>/openai/v1/ --api-key KEY --job-id ftjob-abc123
15"""
16 
17import csv
18import io
19import os
20import sys
21 
# The report printed below contains non-ASCII characters (box-drawing rules,
# arrows, emoji), so switch both standard streams to UTF-8 when possible.
try:
    for _stream in (sys.stdout, sys.stderr):
        _stream.reconfigure(encoding="utf-8")
except (AttributeError, OSError):
    # Stream not reconfigurable (older Python or non-tty); default encoding is fine
    pass

# Put this script's own directory first on the import path so the sibling
# `common` module resolves regardless of the current working directory.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
28from common import HelpOnErrorParser, get_clients
29 
30 
def _parse_float(cell):
    """Return ``cell`` as a finite float, or None when it is missing, blank, or unusable."""
    import math  # function-scope import keeps this block self-contained

    if not isinstance(cell, str):
        # csv.DictReader pads short rows with None rather than "".
        return None
    text = cell.strip()
    if not text:
        return None
    try:
        value = float(text)
    except ValueError:
        return None
    # NaN/inf losses (diverged training) would corrupt min() and the ratio maths.
    return value if math.isfinite(value) else None


def _extract_val_points(rows):
    """Return ``(step, val_loss, train_loss)`` for each CSV row carrying a validation loss."""
    val_columns = ("valid_loss", "full_valid_loss", "eval_loss")
    points = []
    for row in rows:
        val_loss = None
        for col in val_columns:
            val_loss = _parse_float(row.get(col))
            if val_loss is not None:
                break
        if val_loss is None:
            continue
        step = _parse_float(row.get("step"))
        points.append((
            int(step) if step is not None else 0,
            val_loss,
            _parse_float(row.get("train_loss")),
        ))
    return points


def _list_checkpoints(client, job_id):
    """Print the job's checkpoints and return them as ``(step, val_loss, model_id)`` tuples."""
    found = []
    try:
        cps = client.fine_tuning.jobs.checkpoints.list(job_id)
        if cps.data:
            for cp in sorted(cps.data, key=lambda c: c.step_number):
                vl = cp.metrics.valid_loss if cp.metrics else None
                model_id = cp.fine_tuned_model_checkpoint or "N/A"
                vl_str = f"{vl:.4f}" if vl is not None else "N/A"
                found.append((cp.step_number, vl, model_id))
                print(f"    Step {cp.step_number}: val_loss={vl_str}, model={model_id}")
        else:
            print("    No checkpoints available.")
    except Exception as e:
        # Deliberately broad: the checkpoint listing is a best-effort extra and
        # must not abort a report whose curve analysis already succeeded.
        print(f"    Could not retrieve checkpoints: {e}")
    return found


def _pick_checkpoint(checkpoints, best_step):
    """Choose the checkpoint to recommend, or None when ``checkpoints`` is empty."""
    scored = [cp for cp in checkpoints if cp[1] is not None]
    if scored:
        # Prefer the checkpoint with the lowest reported val_loss.
        return min(scored, key=lambda cp: cp[1])
    # No val_loss on checkpoints: pick the one nearest to (but not exceeding) best_step.
    earlier = [cp for cp in checkpoints if cp[0] <= best_step]
    if earlier:
        return max(earlier, key=lambda cp: cp[0])
    return checkpoints[0] if checkpoints else None


def analyze_job(client, job_id, download_csv=None):
    """Pull training results, analyze curves, detect overfitting.

    Prints a human-readable report to stdout and returns None.

    Args:
        client: Object exposing ``fine_tuning.jobs.retrieve``,
            ``fine_tuning.jobs.checkpoints.list`` and ``files.content``.
        job_id: ID of the fine-tuning job to inspect.
        download_csv: Optional path; when given, the raw results CSV bytes are
            written there before analysis.
    """
    job = client.fine_tuning.jobs.retrieve(job_id)

    print(f"Job: {job.id}")
    print(f"  Model: {job.model}")
    print(f"  Status: {job.status}")
    print(f"  Fine-tuned model: {job.fine_tuned_model}")

    if job.hyperparameters:
        hp = job.hyperparameters
        print(f"  Epochs: {getattr(hp, 'n_epochs', 'N/A')}")
        print(f"  LR multiplier: {getattr(hp, 'learning_rate_multiplier', 'N/A')}")
        print(f"  Batch size: {getattr(hp, 'batch_size', 'N/A')}")

    # Allow analysis while still running if result files exist
    if job.status not in ("succeeded", "running"):
        print(f"\n  Job status is '{job.status}'. Cannot analyze curves.")
        return

    if not job.result_files:
        if job.status == "running":
            print("\n  Job is still running and no result files available yet. Check back later.")
        else:
            print("\n  No result files available.")
        return

    # Download results CSV
    csv_data = client.files.content(job.result_files[0]).read()

    if download_csv:
        with open(download_csv, "wb") as f:
            f.write(csv_data)
        print(f"\n  Results CSV saved to {download_csv}")

    # "utf-8-sig" strips a leading BOM so the first header is "step", not "\ufeffstep".
    rows = list(csv.DictReader(io.StringIO(csv_data.decode("utf-8-sig"))))

    if job.status == "running":
        print(f"\n  ⚡ Job still running — showing partial results ({len(rows)} steps so far)")

    val_points = _extract_val_points(rows)
    if not val_points:
        print("\n  No validation loss data found in results CSV.")
        return

    # Track the best row by position so duplicate step values mark only one row.
    best_idx = min(range(len(val_points)), key=lambda i: val_points[i][1])
    best_step, best_val, _ = val_points[best_idx]
    final_step, final_val, final_train = val_points[-1]

    print("\n  Training Curve Analysis:")
    print(f"  {'Step':>6} {'Val Loss':>10} {'Train Loss':>12} {'Ratio':>8}")
    print(f"  {'─'*6} {'─'*10} {'─'*12} {'─'*8}")
    for idx, (step, val, train) in enumerate(val_points):
        # A ratio is only meaningful with a positive train loss; show N/A
        # rather than a misleading 0.00 otherwise.
        if train is not None and train > 0:
            ratio_str = f"{val / train:>8.2f}"
        else:
            ratio_str = f"{'N/A':>8}"
        train_str = f"{train:12.4f}" if train is not None else "         N/A"
        marker = " ← best" if idx == best_idx else ""
        print(f"  {step:>6} {val:>10.4f} {train_str} {ratio_str}{marker}")

    print(f"\n  Best val_loss: {best_val:.4f} at step {best_step}")
    print(f"  Final val_loss: {final_val:.4f} at step {final_step}")

    # Overfitting detection: final val_loss more than 20% above the best one.
    overfit = best_val > 0 and final_val > best_val * 1.2
    if overfit:
        pct = (final_val - best_val) / best_val * 100
        print(f"\n  ⚠️  OVERFITTING DETECTED: Final val_loss is {pct:.0f}% above best.")
    elif best_val == 0 and final_val > 0:
        print(f"\n  ⚠️  Best val_loss was 0.0; final val_loss is {final_val:.4f} — possible overfitting from a near-perfect early checkpoint.")
    elif final_train and final_val / final_train > 1.5:
        ratio = final_val / final_train
        print(f"\n  ⚠️  MODERATE OVERFITTING: val/train ratio = {ratio:.2f}")
    else:
        print("\n  ✅ Training looks healthy. No significant overfitting detected.")

    # List checkpoints and recommend best deployable one
    print("\n  Checkpoints:")
    available_checkpoints = _list_checkpoints(client, job_id)

    if not overfit:
        return

    best_cp = _pick_checkpoint(available_checkpoints, best_step)
    if best_cp:
        cp_step, cp_vl, cp_model = best_cp
        vl_info = f" (val_loss={cp_vl:.4f})" if cp_vl is not None else ""
        print(f"\n  🎯 Recommended checkpoint: step {cp_step}{vl_info}")
        print(f"     Model ID: {cp_model}")
        print(f"     (Best val_loss was at step {best_step}, nearest deployable checkpoint is step {cp_step})")
        print("     Alternatively, retrain with fewer epochs to avoid overfitting.")
    else:
        # Overfitting with no usable checkpoint: retraining is the only remedy.
        print(f"\n  Recommendation: Retrain with fewer epochs (best val_loss was at step {best_step}).")
163 
164 
def main():
    """Parse command-line options, build the API client, and run the analysis."""
    env = os.environ.get
    # (flag, add_argument keyword arguments), registered in this order.
    option_specs = (
        ("--base-url", {"default": env("OPENAI_BASE_URL"),
                        "help": "Project /v1/ URL (preferred)"}),
        ("--endpoint", {"default": env("AZURE_OPENAI_ENDPOINT"),
                        "help": "Azure OpenAI endpoint (fallback)"}),
        ("--project-endpoint", {"default": env("AZURE_AI_PROJECT_ENDPOINT"),
                                "help": "Azure AI project endpoint (Foundry SDK)"}),
        ("--api-key", {"default": env("AZURE_OPENAI_API_KEY")}),
        ("--job-id", {"required": True, "help": "Fine-tuning job ID"}),
        ("--download-csv", {"help": "Save results CSV to this path"}),
    )

    parser = HelpOnErrorParser(description="Analyze fine-tuning training curves")
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    # get_clients returns a pair; only the client itself is needed here.
    client, _auth_method = get_clients(
        base_url=args.base_url,
        azure_endpoint=args.endpoint,
        project_endpoint=args.project_endpoint,
        api_key=args.api_key,
    )
    analyze_job(client, args.job_id, args.download_csv)


if __name__ == "__main__":
    main()
187

Loading source

Preparing the source view

Pulling the file list, source metadata, and syntax-aware rendering for this listing.

Marketplace

Source from repo

Microsoft Foundry Skill

Build and deploy AI applications on Azure AI Foundry using Microsoft's model catalog and AI services

microsoftGitHub microsoftOfficialSource repo Original GitHub link Publisher page

Files

155

Skill

n/a

Size

976.3 KB

Entrypoint

SKILL.md

Format

git-repo

Open file

finetuning/scripts/check_training.py

Syntax-highlighted preview of this file as included in the skill package.

Rendered Source

code · 187 lines · Free

finetuning/scripts/check_training.py

1# /// script
2# dependencies = [
3#   "openai>=1.0",
4#   "azure-identity",
5#   "azure-ai-projects",
6# ]
7# ///
8"""
9check_training.py — Analyze training curves, detect overfitting, list checkpoints.
10 
11Usage:
12  python check_training.py --job-id ftjob-abc123
13  python check_training.py --job-id ftjob-abc123 --download-csv results.csv
14  python check_training.py --base-url https://<resource>.services.ai.azure.com/api/projects/<project>/openai/v1/ --api-key KEY --job-id ftjob-abc123
15"""
16 
17import csv
18import io
19import os
20import sys
21 
# The report printed below contains non-ASCII characters (box-drawing rules,
# arrows, emoji), so switch both standard streams to UTF-8 when possible.
try:
    for _stream in (sys.stdout, sys.stderr):
        _stream.reconfigure(encoding="utf-8")
except (AttributeError, OSError):
    # Stream not reconfigurable (older Python or non-tty); default encoding is fine
    pass

# Put this script's own directory first on the import path so the sibling
# `common` module resolves regardless of the current working directory.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
28from common import HelpOnErrorParser, get_clients
29 
30 
def _parse_float(cell):
    """Return ``cell`` as a finite float, or None when it is missing, blank, or unusable."""
    import math  # function-scope import keeps this block self-contained

    if not isinstance(cell, str):
        # csv.DictReader pads short rows with None rather than "".
        return None
    text = cell.strip()
    if not text:
        return None
    try:
        value = float(text)
    except ValueError:
        return None
    # NaN/inf losses (diverged training) would corrupt min() and the ratio maths.
    return value if math.isfinite(value) else None


def _extract_val_points(rows):
    """Return ``(step, val_loss, train_loss)`` for each CSV row carrying a validation loss."""
    val_columns = ("valid_loss", "full_valid_loss", "eval_loss")
    points = []
    for row in rows:
        val_loss = None
        for col in val_columns:
            val_loss = _parse_float(row.get(col))
            if val_loss is not None:
                break
        if val_loss is None:
            continue
        step = _parse_float(row.get("step"))
        points.append((
            int(step) if step is not None else 0,
            val_loss,
            _parse_float(row.get("train_loss")),
        ))
    return points


def _list_checkpoints(client, job_id):
    """Print the job's checkpoints and return them as ``(step, val_loss, model_id)`` tuples."""
    found = []
    try:
        cps = client.fine_tuning.jobs.checkpoints.list(job_id)
        if cps.data:
            for cp in sorted(cps.data, key=lambda c: c.step_number):
                vl = cp.metrics.valid_loss if cp.metrics else None
                model_id = cp.fine_tuned_model_checkpoint or "N/A"
                vl_str = f"{vl:.4f}" if vl is not None else "N/A"
                found.append((cp.step_number, vl, model_id))
                print(f"    Step {cp.step_number}: val_loss={vl_str}, model={model_id}")
        else:
            print("    No checkpoints available.")
    except Exception as e:
        # Deliberately broad: the checkpoint listing is a best-effort extra and
        # must not abort a report whose curve analysis already succeeded.
        print(f"    Could not retrieve checkpoints: {e}")
    return found


def _pick_checkpoint(checkpoints, best_step):
    """Choose the checkpoint to recommend, or None when ``checkpoints`` is empty."""
    scored = [cp for cp in checkpoints if cp[1] is not None]
    if scored:
        # Prefer the checkpoint with the lowest reported val_loss.
        return min(scored, key=lambda cp: cp[1])
    # No val_loss on checkpoints: pick the one nearest to (but not exceeding) best_step.
    earlier = [cp for cp in checkpoints if cp[0] <= best_step]
    if earlier:
        return max(earlier, key=lambda cp: cp[0])
    return checkpoints[0] if checkpoints else None


def analyze_job(client, job_id, download_csv=None):
    """Pull training results, analyze curves, detect overfitting.

    Prints a human-readable report to stdout and returns None.

    Args:
        client: Object exposing ``fine_tuning.jobs.retrieve``,
            ``fine_tuning.jobs.checkpoints.list`` and ``files.content``.
        job_id: ID of the fine-tuning job to inspect.
        download_csv: Optional path; when given, the raw results CSV bytes are
            written there before analysis.
    """
    job = client.fine_tuning.jobs.retrieve(job_id)

    print(f"Job: {job.id}")
    print(f"  Model: {job.model}")
    print(f"  Status: {job.status}")
    print(f"  Fine-tuned model: {job.fine_tuned_model}")

    if job.hyperparameters:
        hp = job.hyperparameters
        print(f"  Epochs: {getattr(hp, 'n_epochs', 'N/A')}")
        print(f"  LR multiplier: {getattr(hp, 'learning_rate_multiplier', 'N/A')}")
        print(f"  Batch size: {getattr(hp, 'batch_size', 'N/A')}")

    # Allow analysis while still running if result files exist
    if job.status not in ("succeeded", "running"):
        print(f"\n  Job status is '{job.status}'. Cannot analyze curves.")
        return

    if not job.result_files:
        if job.status == "running":
            print("\n  Job is still running and no result files available yet. Check back later.")
        else:
            print("\n  No result files available.")
        return

    # Download results CSV
    csv_data = client.files.content(job.result_files[0]).read()

    if download_csv:
        with open(download_csv, "wb") as f:
            f.write(csv_data)
        print(f"\n  Results CSV saved to {download_csv}")

    # "utf-8-sig" strips a leading BOM so the first header is "step", not "\ufeffstep".
    rows = list(csv.DictReader(io.StringIO(csv_data.decode("utf-8-sig"))))

    if job.status == "running":
        print(f"\n  ⚡ Job still running — showing partial results ({len(rows)} steps so far)")

    val_points = _extract_val_points(rows)
    if not val_points:
        print("\n  No validation loss data found in results CSV.")
        return

    # Track the best row by position so duplicate step values mark only one row.
    best_idx = min(range(len(val_points)), key=lambda i: val_points[i][1])
    best_step, best_val, _ = val_points[best_idx]
    final_step, final_val, final_train = val_points[-1]

    print("\n  Training Curve Analysis:")
    print(f"  {'Step':>6} {'Val Loss':>10} {'Train Loss':>12} {'Ratio':>8}")
    print(f"  {'─'*6} {'─'*10} {'─'*12} {'─'*8}")
    for idx, (step, val, train) in enumerate(val_points):
        # A ratio is only meaningful with a positive train loss; show N/A
        # rather than a misleading 0.00 otherwise.
        if train is not None and train > 0:
            ratio_str = f"{val / train:>8.2f}"
        else:
            ratio_str = f"{'N/A':>8}"
        train_str = f"{train:12.4f}" if train is not None else "         N/A"
        marker = " ← best" if idx == best_idx else ""
        print(f"  {step:>6} {val:>10.4f} {train_str} {ratio_str}{marker}")

    print(f"\n  Best val_loss: {best_val:.4f} at step {best_step}")
    print(f"  Final val_loss: {final_val:.4f} at step {final_step}")

    # Overfitting detection: final val_loss more than 20% above the best one.
    overfit = best_val > 0 and final_val > best_val * 1.2
    if overfit:
        pct = (final_val - best_val) / best_val * 100
        print(f"\n  ⚠️  OVERFITTING DETECTED: Final val_loss is {pct:.0f}% above best.")
    elif best_val == 0 and final_val > 0:
        print(f"\n  ⚠️  Best val_loss was 0.0; final val_loss is {final_val:.4f} — possible overfitting from a near-perfect early checkpoint.")
    elif final_train and final_val / final_train > 1.5:
        ratio = final_val / final_train
        print(f"\n  ⚠️  MODERATE OVERFITTING: val/train ratio = {ratio:.2f}")
    else:
        print("\n  ✅ Training looks healthy. No significant overfitting detected.")

    # List checkpoints and recommend best deployable one
    print("\n  Checkpoints:")
    available_checkpoints = _list_checkpoints(client, job_id)

    if not overfit:
        return

    best_cp = _pick_checkpoint(available_checkpoints, best_step)
    if best_cp:
        cp_step, cp_vl, cp_model = best_cp
        vl_info = f" (val_loss={cp_vl:.4f})" if cp_vl is not None else ""
        print(f"\n  🎯 Recommended checkpoint: step {cp_step}{vl_info}")
        print(f"     Model ID: {cp_model}")
        print(f"     (Best val_loss was at step {best_step}, nearest deployable checkpoint is step {cp_step})")
        print("     Alternatively, retrain with fewer epochs to avoid overfitting.")
    else:
        # Overfitting with no usable checkpoint: retraining is the only remedy.
        print(f"\n  Recommendation: Retrain with fewer epochs (best val_loss was at step {best_step}).")
163 
164 
def main():
    """Parse command-line options, build the API client, and run the analysis."""
    env = os.environ.get
    # (flag, add_argument keyword arguments), registered in this order.
    option_specs = (
        ("--base-url", {"default": env("OPENAI_BASE_URL"),
                        "help": "Project /v1/ URL (preferred)"}),
        ("--endpoint", {"default": env("AZURE_OPENAI_ENDPOINT"),
                        "help": "Azure OpenAI endpoint (fallback)"}),
        ("--project-endpoint", {"default": env("AZURE_AI_PROJECT_ENDPOINT"),
                                "help": "Azure AI project endpoint (Foundry SDK)"}),
        ("--api-key", {"default": env("AZURE_OPENAI_API_KEY")}),
        ("--job-id", {"required": True, "help": "Fine-tuning job ID"}),
        ("--download-csv", {"help": "Save results CSV to this path"}),
    )

    parser = HelpOnErrorParser(description="Analyze fine-tuning training curves")
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    # get_clients returns a pair; only the client itself is needed here.
    client, _auth_method = get_clients(
        base_url=args.base_url,
        azure_endpoint=args.endpoint,
        project_endpoint=args.project_endpoint,
        api_key=args.api_key,
    )
    analyze_job(client, args.job_id, args.download_csv)


if __name__ == "__main__":
    main()
187