Source from repo

Microsoft Foundry Skill

Deploy, evaluate, and manage AI agents end-to-end on Microsoft Azure AI Foundry

microsoftGitHub microsoftOfficialSource repo Original GitHub link Publisher page

Files

154

Skill

n/a

Size

976.2 KB

Entrypoint

SKILL.md

Format

git-repo

Open file

finetuning/scripts/score_dataset.py

Syntax-highlighted preview of this file as included in the skill package.

Rendered Source

code215 linesFree

finetuning/scripts/score_dataset.py

1# /// script
2# dependencies = [
3#   "openai>=1.0",
4#   "azure-identity",
5# ]
6# ///
7"""
8score_dataset.py — Assess training data quality using an LLM judge.
9 
10Scores each example on correctness, relevance, and quality by default (or on
11custom --dimensions), and optionally filters out low-quality examples.
12 
13Usage:
14  # Score all examples
15  python score_dataset.py --input training.jsonl --output scored.jsonl
16 
17  # Score and filter (keep only score >= 7)
18  python score_dataset.py --input training.jsonl --output filtered.jsonl --min-score 7
19 
20  # Custom scoring dimensions
21  python score_dataset.py --input training.jsonl --output scored.jsonl \
22      --dimensions "correctness,clarity,completeness"
23"""
24 
25import json
26import os
27import re
28import sys
29 
# Force UTF-8 on the standard streams so non-ASCII status output (e.g. the
# warning emoji printed while loading) does not fail on consoles with a legacy
# encoding. Each stream is handled on its own so that a stdout that cannot be
# reconfigured does not prevent stderr from being switched.
for _stream in (sys.stdout, sys.stderr):
    try:
        _stream.reconfigure(encoding="utf-8")
    except (AttributeError, OSError):
        pass  # Stream not reconfigurable (older Python or non-tty); default encoding is fine
del _stream  # keep the module namespace unchanged
35import time
36from concurrent.futures import ThreadPoolExecutor, as_completed
37sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
38from common import HelpOnErrorParser, get_clients, _clamp_score
39 
40 
# Judge prompt template. The placeholders user_content, assistant_content,
# dimensions_text and example_json are filled in with str.format() by
# score_example().
QUALITY_PROMPT = "\n".join((
    "You are a data quality assessor for machine learning training data.",
    "",
    "## Task",
    "Evaluate this training example for quality.",
    "",
    "## User input (what the model receives)",
    "{user_content}",
    "",
    "## Assistant output (what the model should learn to produce)",
    "{assistant_content}",
    "",
    "## Scoring dimensions",
    "{dimensions_text}",
    "",
    "Rate each dimension on a scale of 1-10.",
    "",
    "Return ONLY a JSON object with dimension names as keys and integer scores as values.",
    "Example: {example_json}",
))


# Dimensions used when --dimensions is not given: name -> question shown to the judge.
DEFAULT_DIMENSIONS = dict(
    correctness="Is the assistant's output factually/functionally correct?",
    relevance="Does the output directly address the user's request?",
    quality="Is the output well-written, well-formatted, and professional?",
)
66 
67 
def score_example(client, model, user_content, assistant_content, dimensions,
                  max_chars=2000, attempts=3, retry_delay=2):
    """Score a single training example with the LLM judge.

    Args:
        client: Object exposing ``chat.completions.create(...)``.
        model: Judge model name passed through to the API call.
        user_content: User-side content of the example. ``None`` is treated as
            an empty string; other non-string values are converted with ``str()``.
        assistant_content: Assistant-side content; normalised the same way.
        dimensions: Mapping of dimension name -> description shown to the judge.
        max_chars: Each side is truncated to this many characters in the prompt.
        attempts: Total number of tries before giving up.
        retry_delay: Seconds to wait after an attempt that raised, before retrying.

    Returns:
        A dict with one entry per dimension. On success each value is the
        judge's value passed through ``_clamp_score`` (imported from ``common``;
        presumably bounds it to the valid range -- confirm there). If no attempt
        yields a parseable JSON object, every dimension maps to 0.
    """
    def _as_text(value):
        # Training records may carry a null content (e.g. tool-call turns) or
        # structured content; slicing those directly would raise or truncate
        # by element instead of by character.
        if value is None:
            return ""
        return value if isinstance(value, str) else str(value)

    dims_text = "\n".join(f"**{k}** (1-10): {v}" for k, v in dimensions.items())
    example = {k: 8 for k in dimensions}

    prompt = QUALITY_PROMPT.format(
        user_content=_as_text(user_content)[:max_chars],
        assistant_content=_as_text(assistant_content)[:max_chars],
        dimensions_text=dims_text,
        example_json=json.dumps(example),
    )

    last_error = None
    for attempt in range(attempts):
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0,
                max_completion_tokens=200,
            )
            text = (resp.choices[0].message.content or "").strip()
            # The judge is asked for a flat JSON object; take the first {...} span
            # so surrounding prose or code fences do not break parsing.
            match = re.search(r'\{[^}]+\}', text)
            if match:
                scores = json.loads(match.group())
                return {k: _clamp_score(scores.get(k)) for k in dimensions}
            last_error = "no JSON object in judge response"
        except Exception as exc:
            # Best-effort: API and parse errors are retried, but remembered so
            # the final failure is reported instead of silently becoming zeros.
            last_error = exc
            if attempt < attempts - 1:
                time.sleep(retry_delay)

    if last_error is not None:
        print(f"⚠️ Scoring failed after {attempts} attempts: {last_error}", file=sys.stderr)
    return {k: 0 for k in dimensions}
98 
99 
def _first_message_content(messages, role):
    """Return the content of the first message with ``role``; "" when absent or null."""
    for message in messages:
        if isinstance(message, dict) and message.get("role") == role:
            content = message.get("content")
            return "" if content is None else content
    return ""


def main():
    """Command-line entry point: load JSONL examples, score them, report, write output.

    Parses the arguments, builds the client via get_clients(), reads --input,
    scores every example concurrently with score_example(), prints distribution
    statistics, and writes --output (filtered by --min-score when given).
    """
    import statistics  # local import: only needed for the median in the summary

    parser = HelpOnErrorParser(description="Score training data quality with LLM judge")
    parser.add_argument("--base-url", default=os.environ.get("OPENAI_BASE_URL"),
                        help="Project /v1/ URL (preferred)")
    parser.add_argument("--endpoint", default=os.environ.get("AZURE_OPENAI_ENDPOINT"),
                        help="Azure OpenAI endpoint (fallback)")
    parser.add_argument("--project-endpoint", default=os.environ.get("AZURE_AI_PROJECT_ENDPOINT"),
                        help="Azure AI project endpoint (Foundry SDK)")
    parser.add_argument("--api-key", default=os.environ.get("AZURE_OPENAI_API_KEY"))
    parser.add_argument("--model", default="gpt-4o", help="Judge model")
    parser.add_argument("--input", required=True, help="Input JSONL file")
    parser.add_argument("--output", required=True, help="Output JSONL file (with scores)")
    parser.add_argument("--min-score", type=float, default=None,
                        help="Minimum average score to keep (filters below this)")
    parser.add_argument("--dimensions", default=None,
                        help="Comma-separated dimension names (default: correctness,relevance,quality)")
    parser.add_argument("--concurrency", type=int, default=4, help="Parallel scoring workers")
    parser.add_argument("--strip-metadata", action="store_true",
                        help="Remove _quality_scores and _avg_quality from output (safe for training input)")
    args = parser.parse_args()

    # The second return value is not used by this script.
    client, _method = get_clients(
        base_url=args.base_url, azure_endpoint=args.endpoint,
        project_endpoint=args.project_endpoint, api_key=args.api_key
    )

    # Parse dimensions; blank names (e.g. from a trailing comma) are dropped.
    if args.dimensions:
        dim_names = [d.strip() for d in args.dimensions.split(",") if d.strip()]
        if not dim_names:
            sys.exit("--dimensions must name at least one dimension")
        dimensions = {d: f"Rate the {d} of the output" for d in dim_names}
    else:
        dimensions = DEFAULT_DIMENSIONS

    # Load data
    examples = []
    with open(args.input, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                ex = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"⚠️ Skipping malformed JSON on line {line_no}: {e}")
                continue
            if not isinstance(ex, dict):
                print(f"⚠️ Skipping non-object JSON on line {line_no}")
                continue
            msgs = ex.get("messages")
            if not isinstance(msgs, list):
                msgs = []
            examples.append({
                "data": ex,
                "user": _first_message_content(msgs, "user"),
                "assistant": _first_message_content(msgs, "assistant"),
            })

    total = len(examples)
    print(f"Loaded {total} examples. Scoring with {args.model}...")

    # Score in parallel
    def score_one(idx):
        ex = examples[idx]
        scores = score_example(client, args.model, ex["user"], ex["assistant"], dimensions)
        return idx, scores

    # max(1, ...) because ThreadPoolExecutor rejects max_workers <= 0.
    with ThreadPoolExecutor(max_workers=max(1, args.concurrency)) as pool:
        futures = {pool.submit(score_one, i): i for i in range(total)}
        done = 0
        for future in as_completed(futures):
            try:
                idx, scores = future.result()
            except Exception as exc:
                # One failing example must not discard the work already done;
                # record zeros, the same value score_example uses for failures.
                idx = futures[future]
                scores = {k: 0 for k in dimensions}
                print(f"⚠️ Scoring example #{idx + 1} failed: {exc}", file=sys.stderr)
            examples[idx]["scores"] = scores
            done += 1
            if done % 25 == 0 or done == total:
                print(f"  Scored {done}/{total}")

    # Calculate stats. Examples whose scores are all 0 (scoring failed) get no
    # "avg_score" and are left out of the distribution.
    all_avgs = []
    for ex in examples:
        scores = ex.get("scores", {})
        if scores and any(v > 0 for v in scores.values()):
            avg = sum(scores.values()) / len(scores)
            ex["avg_score"] = avg
            all_avgs.append(avg)

    if all_avgs:
        print("\nQuality Distribution:")
        print(f"  Mean:   {sum(all_avgs)/len(all_avgs):.1f}")
        print(f"  Min:    {min(all_avgs):.1f}")
        print(f"  Max:    {max(all_avgs):.1f}")
        print(f"  Median: {statistics.median(all_avgs):.1f}")

    # Filter and write
    kept = 0
    filtered = 0
    threshold = args.min_score
    with open(args.output, "w", encoding="utf-8") as f:
        for ex in examples:
            # Unscored examples default to 0, so any positive threshold drops them.
            avg = ex.get("avg_score", 0)
            if threshold is not None and avg < threshold:
                filtered += 1
                continue

            record = ex["data"]
            if args.strip_metadata:
                # Actually remove the fields: the input may be a previously
                # scored file that already carries them.
                record.pop("_quality_scores", None)
                record.pop("_avg_quality", None)
            else:
                record["_quality_scores"] = ex.get("scores", {})
                record["_avg_quality"] = avg

            f.write(json.dumps(record, ensure_ascii=False) + "\n")
            kept += 1

    print(f"\nKept: {kept}, Filtered: {filtered}")
    if threshold is not None:
        print(f"(min_score threshold: {threshold})")
    if args.strip_metadata:
        print("(metadata stripped — output is safe for training input)")
    print(f"Output: {args.output}")
211 
212 
213if __name__ == "__main__":
214    main()
215

Loading source

Preparing the source view

Pulling the file list, source metadata, and syntax-aware rendering for this listing.

Marketplace

Source from repo

Microsoft Foundry Skill

Deploy, evaluate, and manage AI agents end-to-end on Microsoft Azure AI Foundry

microsoftGitHub microsoftOfficialSource repo Original GitHub link Publisher page

Files

154

Skill

n/a

Size

976.2 KB

Entrypoint

SKILL.md

Format

git-repo

Open file

finetuning/scripts/score_dataset.py

Syntax-highlighted preview of this file as included in the skill package.

Rendered Source

code215 linesFree

finetuning/scripts/score_dataset.py

1# /// script
2# dependencies = [
3#   "openai>=1.0",
4#   "azure-identity",
5# ]
6# ///
7"""
8score_dataset.py — Assess training data quality using an LLM judge.
9 
10Scores each example on correctness, relevance, and quality by default (or on
11custom --dimensions), and optionally filters out low-quality examples.
12 
13Usage:
14  # Score all examples
15  python score_dataset.py --input training.jsonl --output scored.jsonl
16 
17  # Score and filter (keep only score >= 7)
18  python score_dataset.py --input training.jsonl --output filtered.jsonl --min-score 7
19 
20  # Custom scoring dimensions
21  python score_dataset.py --input training.jsonl --output scored.jsonl \
22      --dimensions "correctness,clarity,completeness"
23"""
24 
25import json
26import os
27import re
28import sys
29 
# Force UTF-8 on the standard streams so non-ASCII status output (e.g. the
# warning emoji printed while loading) does not fail on consoles with a legacy
# encoding. Each stream is handled on its own so that a stdout that cannot be
# reconfigured does not prevent stderr from being switched.
for _stream in (sys.stdout, sys.stderr):
    try:
        _stream.reconfigure(encoding="utf-8")
    except (AttributeError, OSError):
        pass  # Stream not reconfigurable (older Python or non-tty); default encoding is fine
del _stream  # keep the module namespace unchanged
35import time
36from concurrent.futures import ThreadPoolExecutor, as_completed
37sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
38from common import HelpOnErrorParser, get_clients, _clamp_score
39 
40 
# Judge prompt template. The placeholders user_content, assistant_content,
# dimensions_text and example_json are filled in with str.format() by
# score_example().
QUALITY_PROMPT = "\n".join((
    "You are a data quality assessor for machine learning training data.",
    "",
    "## Task",
    "Evaluate this training example for quality.",
    "",
    "## User input (what the model receives)",
    "{user_content}",
    "",
    "## Assistant output (what the model should learn to produce)",
    "{assistant_content}",
    "",
    "## Scoring dimensions",
    "{dimensions_text}",
    "",
    "Rate each dimension on a scale of 1-10.",
    "",
    "Return ONLY a JSON object with dimension names as keys and integer scores as values.",
    "Example: {example_json}",
))


# Dimensions used when --dimensions is not given: name -> question shown to the judge.
DEFAULT_DIMENSIONS = dict(
    correctness="Is the assistant's output factually/functionally correct?",
    relevance="Does the output directly address the user's request?",
    quality="Is the output well-written, well-formatted, and professional?",
)
66 
67 
def score_example(client, model, user_content, assistant_content, dimensions,
                  max_chars=2000, attempts=3, retry_delay=2):
    """Score a single training example with the LLM judge.

    Args:
        client: Object exposing ``chat.completions.create(...)``.
        model: Judge model name passed through to the API call.
        user_content: User-side content of the example. ``None`` is treated as
            an empty string; other non-string values are converted with ``str()``.
        assistant_content: Assistant-side content; normalised the same way.
        dimensions: Mapping of dimension name -> description shown to the judge.
        max_chars: Each side is truncated to this many characters in the prompt.
        attempts: Total number of tries before giving up.
        retry_delay: Seconds to wait after an attempt that raised, before retrying.

    Returns:
        A dict with one entry per dimension. On success each value is the
        judge's value passed through ``_clamp_score`` (imported from ``common``;
        presumably bounds it to the valid range -- confirm there). If no attempt
        yields a parseable JSON object, every dimension maps to 0.
    """
    def _as_text(value):
        # Training records may carry a null content (e.g. tool-call turns) or
        # structured content; slicing those directly would raise or truncate
        # by element instead of by character.
        if value is None:
            return ""
        return value if isinstance(value, str) else str(value)

    dims_text = "\n".join(f"**{k}** (1-10): {v}" for k, v in dimensions.items())
    example = {k: 8 for k in dimensions}

    prompt = QUALITY_PROMPT.format(
        user_content=_as_text(user_content)[:max_chars],
        assistant_content=_as_text(assistant_content)[:max_chars],
        dimensions_text=dims_text,
        example_json=json.dumps(example),
    )

    last_error = None
    for attempt in range(attempts):
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0,
                max_completion_tokens=200,
            )
            text = (resp.choices[0].message.content or "").strip()
            # The judge is asked for a flat JSON object; take the first {...} span
            # so surrounding prose or code fences do not break parsing.
            match = re.search(r'\{[^}]+\}', text)
            if match:
                scores = json.loads(match.group())
                return {k: _clamp_score(scores.get(k)) for k in dimensions}
            last_error = "no JSON object in judge response"
        except Exception as exc:
            # Best-effort: API and parse errors are retried, but remembered so
            # the final failure is reported instead of silently becoming zeros.
            last_error = exc
            if attempt < attempts - 1:
                time.sleep(retry_delay)

    if last_error is not None:
        print(f"⚠️ Scoring failed after {attempts} attempts: {last_error}", file=sys.stderr)
    return {k: 0 for k in dimensions}
98 
99 
def _first_message_content(messages, role):
    """Return the content of the first message with ``role``; "" when absent or null."""
    for message in messages:
        if isinstance(message, dict) and message.get("role") == role:
            content = message.get("content")
            return "" if content is None else content
    return ""


def main():
    """Command-line entry point: load JSONL examples, score them, report, write output.

    Parses the arguments, builds the client via get_clients(), reads --input,
    scores every example concurrently with score_example(), prints distribution
    statistics, and writes --output (filtered by --min-score when given).
    """
    import statistics  # local import: only needed for the median in the summary

    parser = HelpOnErrorParser(description="Score training data quality with LLM judge")
    parser.add_argument("--base-url", default=os.environ.get("OPENAI_BASE_URL"),
                        help="Project /v1/ URL (preferred)")
    parser.add_argument("--endpoint", default=os.environ.get("AZURE_OPENAI_ENDPOINT"),
                        help="Azure OpenAI endpoint (fallback)")
    parser.add_argument("--project-endpoint", default=os.environ.get("AZURE_AI_PROJECT_ENDPOINT"),
                        help="Azure AI project endpoint (Foundry SDK)")
    parser.add_argument("--api-key", default=os.environ.get("AZURE_OPENAI_API_KEY"))
    parser.add_argument("--model", default="gpt-4o", help="Judge model")
    parser.add_argument("--input", required=True, help="Input JSONL file")
    parser.add_argument("--output", required=True, help="Output JSONL file (with scores)")
    parser.add_argument("--min-score", type=float, default=None,
                        help="Minimum average score to keep (filters below this)")
    parser.add_argument("--dimensions", default=None,
                        help="Comma-separated dimension names (default: correctness,relevance,quality)")
    parser.add_argument("--concurrency", type=int, default=4, help="Parallel scoring workers")
    parser.add_argument("--strip-metadata", action="store_true",
                        help="Remove _quality_scores and _avg_quality from output (safe for training input)")
    args = parser.parse_args()

    # The second return value is not used by this script.
    client, _method = get_clients(
        base_url=args.base_url, azure_endpoint=args.endpoint,
        project_endpoint=args.project_endpoint, api_key=args.api_key
    )

    # Parse dimensions; blank names (e.g. from a trailing comma) are dropped.
    if args.dimensions:
        dim_names = [d.strip() for d in args.dimensions.split(",") if d.strip()]
        if not dim_names:
            sys.exit("--dimensions must name at least one dimension")
        dimensions = {d: f"Rate the {d} of the output" for d in dim_names}
    else:
        dimensions = DEFAULT_DIMENSIONS

    # Load data
    examples = []
    with open(args.input, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                ex = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"⚠️ Skipping malformed JSON on line {line_no}: {e}")
                continue
            if not isinstance(ex, dict):
                print(f"⚠️ Skipping non-object JSON on line {line_no}")
                continue
            msgs = ex.get("messages")
            if not isinstance(msgs, list):
                msgs = []
            examples.append({
                "data": ex,
                "user": _first_message_content(msgs, "user"),
                "assistant": _first_message_content(msgs, "assistant"),
            })

    total = len(examples)
    print(f"Loaded {total} examples. Scoring with {args.model}...")

    # Score in parallel
    def score_one(idx):
        ex = examples[idx]
        scores = score_example(client, args.model, ex["user"], ex["assistant"], dimensions)
        return idx, scores

    # max(1, ...) because ThreadPoolExecutor rejects max_workers <= 0.
    with ThreadPoolExecutor(max_workers=max(1, args.concurrency)) as pool:
        futures = {pool.submit(score_one, i): i for i in range(total)}
        done = 0
        for future in as_completed(futures):
            try:
                idx, scores = future.result()
            except Exception as exc:
                # One failing example must not discard the work already done;
                # record zeros, the same value score_example uses for failures.
                idx = futures[future]
                scores = {k: 0 for k in dimensions}
                print(f"⚠️ Scoring example #{idx + 1} failed: {exc}", file=sys.stderr)
            examples[idx]["scores"] = scores
            done += 1
            if done % 25 == 0 or done == total:
                print(f"  Scored {done}/{total}")

    # Calculate stats. Examples whose scores are all 0 (scoring failed) get no
    # "avg_score" and are left out of the distribution.
    all_avgs = []
    for ex in examples:
        scores = ex.get("scores", {})
        if scores and any(v > 0 for v in scores.values()):
            avg = sum(scores.values()) / len(scores)
            ex["avg_score"] = avg
            all_avgs.append(avg)

    if all_avgs:
        print("\nQuality Distribution:")
        print(f"  Mean:   {sum(all_avgs)/len(all_avgs):.1f}")
        print(f"  Min:    {min(all_avgs):.1f}")
        print(f"  Max:    {max(all_avgs):.1f}")
        print(f"  Median: {statistics.median(all_avgs):.1f}")

    # Filter and write
    kept = 0
    filtered = 0
    threshold = args.min_score
    with open(args.output, "w", encoding="utf-8") as f:
        for ex in examples:
            # Unscored examples default to 0, so any positive threshold drops them.
            avg = ex.get("avg_score", 0)
            if threshold is not None and avg < threshold:
                filtered += 1
                continue

            record = ex["data"]
            if args.strip_metadata:
                # Actually remove the fields: the input may be a previously
                # scored file that already carries them.
                record.pop("_quality_scores", None)
                record.pop("_avg_quality", None)
            else:
                record["_quality_scores"] = ex.get("scores", {})
                record["_avg_quality"] = avg

            f.write(json.dumps(record, ensure_ascii=False) + "\n")
            kept += 1

    print(f"\nKept: {kept}, Filtered: {filtered}")
    if threshold is not None:
        print(f"(min_score threshold: {threshold})")
    if args.strip_metadata:
        print("(metadata stripped — output is safe for training input)")
    print(f"Output: {args.output}")
211 
212 
213if __name__ == "__main__":
214    main()
215