Source from repo

Microsoft Foundry Skill

Build and deploy AI applications on Azure AI Foundry using Microsoft's model catalog and AI services

microsoftGitHub microsoftOfficialSource repo Original GitHub link Publisher page

Files

155

Skill

n/a

Size

976.3 KB

Entrypoint

SKILL.md

Format

git-repo

Open file

finetuning/scripts/score_dataset.py

Syntax-highlighted preview of this file as included in the skill package.

Rendered Source

code215 linesFree

finetuning/scripts/score_dataset.py

1# /// script
2# dependencies = [
3#   "openai>=1.0",
4#   "azure-identity",
5# ]
6# ///
7"""
8score_dataset.py — Assess training data quality using an LLM judge.
9 
10Scores each example on the configured dimensions (by default correctness,
11relevance, and quality) and optionally filters out low-quality examples.
12 
13Usage:
14  # Score all examples
15  python score_dataset.py --input training.jsonl --output scored.jsonl
16 
17  # Score and filter (keep only score >= 7)
18  python score_dataset.py --input training.jsonl --output filtered.jsonl --min-score 7
19 
20  # Custom scoring dimensions
21  python score_dataset.py --input training.jsonl --output scored.jsonl \
22      --dimensions "correctness,clarity,completeness"
23"""
24 
25import json
26import os
27import re
28import sys
29 
30try:
31    sys.stdout.reconfigure(encoding="utf-8")
32    sys.stderr.reconfigure(encoding="utf-8")
33except (AttributeError, OSError):
34    pass  # Stream not reconfigurable (older Python or non-tty); default encoding is fine
35import time
36from concurrent.futures import ThreadPoolExecutor, as_completed
37sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
38from common import HelpOnErrorParser, get_clients, _clamp_score
39 
40 
# Prompt template sent to the judge model. It is filled in with str.format()
# using the fields user_content, assistant_content, dimensions_text and
# example_json, and asks the judge to reply with a JSON object only.
QUALITY_PROMPT = """You are a data quality assessor for machine learning training data.

## Task
Evaluate this training example for quality.

## User input (what the model receives)
{user_content}

## Assistant output (what the model should learn to produce)
{assistant_content}

## Scoring dimensions
{dimensions_text}

Rate each dimension on a scale of 1-10.

Return ONLY a JSON object with dimension names as keys and integer scores as values.
Example: {example_json}"""
59 
60 
# Scoring dimensions used when --dimensions is not given.
# Maps each dimension name to the question shown to the judge for it.
DEFAULT_DIMENSIONS = {
    "correctness": "Is the assistant's output factually/functionally correct?",
    "relevance": "Does the output directly address the user's request?",
    "quality": "Is the output well-written, well-formatted, and professional?",
}
66 
67 
def score_example(client, model, user_content, assistant_content, dimensions):
    """Score a single training example with the LLM judge.

    Builds a prompt from QUALITY_PROMPT, asks ``model`` for a JSON object of
    per-dimension scores, and retries (up to three attempts in total) when the
    request raises or the reply contains no parseable JSON object.

    Args:
        client: Object exposing ``chat.completions.create``.
        model: Name of the judge model passed to the API.
        user_content: Content of the user message. ``None`` is treated as an
            empty string; other non-string content is serialized to JSON text.
        assistant_content: Content of the assistant message, normalized the
            same way as ``user_content``.
        dimensions: Mapping of dimension name -> description shown to the judge.

    Returns:
        A dict with one entry per key of ``dimensions``. On success each value
        is ``_clamp_score(...)`` of what the judge returned for that key; when
        every attempt fails, every value is 0.
    """

    def as_text(content):
        # Message content can legitimately be None (e.g. tool-call messages)
        # or structured; slicing such values directly would raise or misbehave.
        if content is None:
            return ""
        if isinstance(content, str):
            return content
        return json.dumps(content, ensure_ascii=False, default=str)

    dims_text = "\n".join(f"**{k}** (1-10): {v}" for k, v in dimensions.items())
    example = {k: 8 for k in dimensions}

    # Both sides are truncated to 2000 characters to bound the prompt size.
    prompt = QUALITY_PROMPT.format(
        user_content=as_text(user_content)[:2000],
        assistant_content=as_text(assistant_content)[:2000],
        dimensions_text=dims_text,
        example_json=json.dumps(example),
    )

    max_attempts = 3
    last_error = None
    for attempt in range(max_attempts):
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0,
                max_completion_tokens=200,
            )
            text = (resp.choices[0].message.content or "").strip()
            # The judge may wrap the JSON in prose or code fences; pick out the
            # first flat {...} object.
            match = re.search(r'\{[^}]+\}', text)
            if match:
                scores = json.loads(match.group())
                return {k: _clamp_score(scores.get(k)) for k in dimensions}
            last_error = "no JSON object found in judge reply"
        except Exception as exc:  # best-effort boundary: API and parse errors
            last_error = exc
            if attempt < max_attempts - 1:
                time.sleep(2)

    # Report the failure instead of hiding it: an all-zero result would
    # otherwise be indistinguishable from a silently broken configuration.
    print(f"⚠️ Scoring failed after {max_attempts} attempts: {last_error}",
          file=sys.stderr)
    return {k: 0 for k in dimensions}
98 
99 
def main():
    """Command-line entry point: score a JSONL dataset and write the result.

    Steps:
      1. Parse arguments and build the API client via ``get_clients``.
      2. Load ``--input`` (one JSON object per line, each with a "messages"
         list); the first user and first assistant message are scored.
      3. Score all examples in a thread pool with ``score_example``.
      4. Print mean/min/max/median of the per-example average scores
         (examples whose scores are all 0 are excluded from the statistics).
      5. Write ``--output``, dropping examples below ``--min-score`` and
         either attaching or stripping the ``_quality_scores`` /
         ``_avg_quality`` metadata keys.
    """
    from statistics import median  # local import: only needed for the stats

    parser = HelpOnErrorParser(description="Score training data quality with LLM judge")
    parser.add_argument("--base-url", default=os.environ.get("OPENAI_BASE_URL"),
                        help="Project /v1/ URL (preferred)")
    parser.add_argument("--endpoint", default=os.environ.get("AZURE_OPENAI_ENDPOINT"),
                        help="Azure OpenAI endpoint (fallback)")
    parser.add_argument("--project-endpoint", default=os.environ.get("AZURE_AI_PROJECT_ENDPOINT"),
                        help="Azure AI project endpoint (Foundry SDK)")
    parser.add_argument("--api-key", default=os.environ.get("AZURE_OPENAI_API_KEY"))
    parser.add_argument("--model", default="gpt-4o", help="Judge model")
    parser.add_argument("--input", required=True, help="Input JSONL file")
    parser.add_argument("--output", required=True, help="Output JSONL file (with scores)")
    parser.add_argument("--min-score", type=float, default=None,
                        help="Minimum average score to keep (filters below this)")
    parser.add_argument("--dimensions", default=None,
                        help="Comma-separated dimension names (default: correctness,relevance,quality)")
    parser.add_argument("--concurrency", type=int, default=4, help="Parallel scoring workers")
    parser.add_argument("--strip-metadata", action="store_true",
                        help="Remove _quality_scores and _avg_quality from output (safe for training input)")
    args = parser.parse_args()

    # The second return value is not needed here.
    client, _method = get_clients(
        base_url=args.base_url, azure_endpoint=args.endpoint,
        project_endpoint=args.project_endpoint, api_key=args.api_key
    )

    # Parse dimensions; blank entries (e.g. from a trailing comma) are ignored
    # so they cannot become an empty-named dimension.
    dim_names = [d.strip() for d in (args.dimensions or "").split(",") if d.strip()]
    if dim_names:
        dimensions = {d: f"Rate the {d} of the output" for d in dim_names}
    else:
        dimensions = DEFAULT_DIMENSIONS

    # Load data
    examples = []
    with open(args.input, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                ex = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"⚠️ Skipping malformed JSON on line {line_no}: {e}")
                continue
            if not isinstance(ex, dict):
                print(f"⚠️ Skipping non-object JSON on line {line_no}")
                continue
            msgs = ex.get("messages")
            if not isinstance(msgs, list):
                msgs = []
            # Tolerate messages that are not objects or lack role/content.
            msgs = [m for m in msgs if isinstance(m, dict)]
            user = next((m.get("content") for m in msgs if m.get("role") == "user"), "")
            asst = next((m.get("content") for m in msgs if m.get("role") == "assistant"), "")
            # Content may be null (e.g. tool-call messages); score it as empty.
            examples.append({"data": ex, "user": user or "", "assistant": asst or ""})

    print(f"Loaded {len(examples)} examples. Scoring with {args.model}...")

    # Score in parallel
    def score_one(idx):
        ex = examples[idx]
        scores = score_example(client, args.model, ex["user"], ex["assistant"], dimensions)
        return idx, scores

    # ThreadPoolExecutor rejects max_workers <= 0, so clamp to at least one.
    workers = max(1, args.concurrency)
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = [pool.submit(score_one, i) for i in range(len(examples))]
        done = 0
        for future in as_completed(futures):
            idx, scores = future.result()
            examples[idx]["scores"] = scores
            done += 1
            if done % 25 == 0:
                print(f"  Scored {done}/{len(examples)}")

    # Calculate stats; all-zero score sets get no "avg_score" and are left out.
    all_avgs = []
    for ex in examples:
        scores = ex.get("scores", {})
        if scores and any(v > 0 for v in scores.values()):
            avg = sum(scores.values()) / len(scores)
            ex["avg_score"] = avg
            all_avgs.append(avg)

    if all_avgs:
        print("\nQuality Distribution:")
        print(f"  Mean:   {sum(all_avgs)/len(all_avgs):.1f}")
        print(f"  Min:    {min(all_avgs):.1f}")
        print(f"  Max:    {max(all_avgs):.1f}")
        print(f"  Median: {median(all_avgs):.1f}")

    # Filter and write
    kept = 0
    filtered = 0
    filtering = args.min_score is not None
    with open(args.output, "w", encoding="utf-8") as f:
        for ex in examples:
            record = ex["data"]
            if args.strip_metadata:
                # Also drop keys inherited from a previously scored input file,
                # otherwise the output would not actually be metadata-free.
                record.pop("_quality_scores", None)
                record.pop("_avg_quality", None)
            else:
                record["_quality_scores"] = ex.get("scores", {})
                record["_avg_quality"] = ex.get("avg_score", 0)

            # Examples without an average count as 0 and fall below any
            # positive threshold.
            if filtering and ex.get("avg_score", 0) < args.min_score:
                filtered += 1
                continue

            f.write(json.dumps(record, ensure_ascii=False) + "\n")
            kept += 1

    print(f"\nKept: {kept}, Filtered: {filtered}")
    if filtering:
        print(f"(min_score threshold: {args.min_score})")
    if args.strip_metadata:
        print("(metadata stripped — output is safe for training input)")
    print(f"Output: {args.output}")
211 
212 
213if __name__ == "__main__":
214    main()
215

Loading source

Preparing the source view

Pulling the file list, source metadata, and syntax-aware rendering for this listing.

Marketplace

Source from repo

Microsoft Foundry Skill

Build and deploy AI applications on Azure AI Foundry using Microsoft's model catalog and AI services

microsoftGitHub microsoftOfficialSource repo Original GitHub link Publisher page

Files

155

Skill

n/a

Size

976.3 KB

Entrypoint

SKILL.md

Format

git-repo

Open file

finetuning/scripts/score_dataset.py

Syntax-highlighted preview of this file as included in the skill package.

Rendered Source

code215 linesFree

finetuning/scripts/score_dataset.py

1# /// script
2# dependencies = [
3#   "openai>=1.0",
4#   "azure-identity",
5# ]
6# ///
7"""
8score_dataset.py — Assess training data quality using an LLM judge.
9 
10Scores each example on the configured dimensions (by default correctness,
11relevance, and quality) and optionally filters out low-quality examples.
12 
13Usage:
14  # Score all examples
15  python score_dataset.py --input training.jsonl --output scored.jsonl
16 
17  # Score and filter (keep only score >= 7)
18  python score_dataset.py --input training.jsonl --output filtered.jsonl --min-score 7
19 
20  # Custom scoring dimensions
21  python score_dataset.py --input training.jsonl --output scored.jsonl \
22      --dimensions "correctness,clarity,completeness"
23"""
24 
25import json
26import os
27import re
28import sys
29 
30try:
31    sys.stdout.reconfigure(encoding="utf-8")
32    sys.stderr.reconfigure(encoding="utf-8")
33except (AttributeError, OSError):
34    pass  # Stream not reconfigurable (older Python or non-tty); default encoding is fine
35import time
36from concurrent.futures import ThreadPoolExecutor, as_completed
37sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
38from common import HelpOnErrorParser, get_clients, _clamp_score
39 
40 
# Prompt template sent to the judge model. It is filled in with str.format()
# using the fields user_content, assistant_content, dimensions_text and
# example_json, and asks the judge to reply with a JSON object only.
QUALITY_PROMPT = """You are a data quality assessor for machine learning training data.

## Task
Evaluate this training example for quality.

## User input (what the model receives)
{user_content}

## Assistant output (what the model should learn to produce)
{assistant_content}

## Scoring dimensions
{dimensions_text}

Rate each dimension on a scale of 1-10.

Return ONLY a JSON object with dimension names as keys and integer scores as values.
Example: {example_json}"""
59 
60 
# Scoring dimensions used when --dimensions is not given.
# Maps each dimension name to the question shown to the judge for it.
DEFAULT_DIMENSIONS = {
    "correctness": "Is the assistant's output factually/functionally correct?",
    "relevance": "Does the output directly address the user's request?",
    "quality": "Is the output well-written, well-formatted, and professional?",
}
66 
67 
def score_example(client, model, user_content, assistant_content, dimensions):
    """Score a single training example with the LLM judge.

    Builds a prompt from QUALITY_PROMPT, asks ``model`` for a JSON object of
    per-dimension scores, and retries (up to three attempts in total) when the
    request raises or the reply contains no parseable JSON object.

    Args:
        client: Object exposing ``chat.completions.create``.
        model: Name of the judge model passed to the API.
        user_content: Content of the user message. ``None`` is treated as an
            empty string; other non-string content is serialized to JSON text.
        assistant_content: Content of the assistant message, normalized the
            same way as ``user_content``.
        dimensions: Mapping of dimension name -> description shown to the judge.

    Returns:
        A dict with one entry per key of ``dimensions``. On success each value
        is ``_clamp_score(...)`` of what the judge returned for that key; when
        every attempt fails, every value is 0.
    """

    def as_text(content):
        # Message content can legitimately be None (e.g. tool-call messages)
        # or structured; slicing such values directly would raise or misbehave.
        if content is None:
            return ""
        if isinstance(content, str):
            return content
        return json.dumps(content, ensure_ascii=False, default=str)

    dims_text = "\n".join(f"**{k}** (1-10): {v}" for k, v in dimensions.items())
    example = {k: 8 for k in dimensions}

    # Both sides are truncated to 2000 characters to bound the prompt size.
    prompt = QUALITY_PROMPT.format(
        user_content=as_text(user_content)[:2000],
        assistant_content=as_text(assistant_content)[:2000],
        dimensions_text=dims_text,
        example_json=json.dumps(example),
    )

    max_attempts = 3
    last_error = None
    for attempt in range(max_attempts):
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0,
                max_completion_tokens=200,
            )
            text = (resp.choices[0].message.content or "").strip()
            # The judge may wrap the JSON in prose or code fences; pick out the
            # first flat {...} object.
            match = re.search(r'\{[^}]+\}', text)
            if match:
                scores = json.loads(match.group())
                return {k: _clamp_score(scores.get(k)) for k in dimensions}
            last_error = "no JSON object found in judge reply"
        except Exception as exc:  # best-effort boundary: API and parse errors
            last_error = exc
            if attempt < max_attempts - 1:
                time.sleep(2)

    # Report the failure instead of hiding it: an all-zero result would
    # otherwise be indistinguishable from a silently broken configuration.
    print(f"⚠️ Scoring failed after {max_attempts} attempts: {last_error}",
          file=sys.stderr)
    return {k: 0 for k in dimensions}
98 
99 
def main():
    """Command-line entry point: score a JSONL dataset and write the result.

    Steps:
      1. Parse arguments and build the API client via ``get_clients``.
      2. Load ``--input`` (one JSON object per line, each with a "messages"
         list); the first user and first assistant message are scored.
      3. Score all examples in a thread pool with ``score_example``.
      4. Print mean/min/max/median of the per-example average scores
         (examples whose scores are all 0 are excluded from the statistics).
      5. Write ``--output``, dropping examples below ``--min-score`` and
         either attaching or stripping the ``_quality_scores`` /
         ``_avg_quality`` metadata keys.
    """
    from statistics import median  # local import: only needed for the stats

    parser = HelpOnErrorParser(description="Score training data quality with LLM judge")
    parser.add_argument("--base-url", default=os.environ.get("OPENAI_BASE_URL"),
                        help="Project /v1/ URL (preferred)")
    parser.add_argument("--endpoint", default=os.environ.get("AZURE_OPENAI_ENDPOINT"),
                        help="Azure OpenAI endpoint (fallback)")
    parser.add_argument("--project-endpoint", default=os.environ.get("AZURE_AI_PROJECT_ENDPOINT"),
                        help="Azure AI project endpoint (Foundry SDK)")
    parser.add_argument("--api-key", default=os.environ.get("AZURE_OPENAI_API_KEY"))
    parser.add_argument("--model", default="gpt-4o", help="Judge model")
    parser.add_argument("--input", required=True, help="Input JSONL file")
    parser.add_argument("--output", required=True, help="Output JSONL file (with scores)")
    parser.add_argument("--min-score", type=float, default=None,
                        help="Minimum average score to keep (filters below this)")
    parser.add_argument("--dimensions", default=None,
                        help="Comma-separated dimension names (default: correctness,relevance,quality)")
    parser.add_argument("--concurrency", type=int, default=4, help="Parallel scoring workers")
    parser.add_argument("--strip-metadata", action="store_true",
                        help="Remove _quality_scores and _avg_quality from output (safe for training input)")
    args = parser.parse_args()

    # The second return value is not needed here.
    client, _method = get_clients(
        base_url=args.base_url, azure_endpoint=args.endpoint,
        project_endpoint=args.project_endpoint, api_key=args.api_key
    )

    # Parse dimensions; blank entries (e.g. from a trailing comma) are ignored
    # so they cannot become an empty-named dimension.
    dim_names = [d.strip() for d in (args.dimensions or "").split(",") if d.strip()]
    if dim_names:
        dimensions = {d: f"Rate the {d} of the output" for d in dim_names}
    else:
        dimensions = DEFAULT_DIMENSIONS

    # Load data
    examples = []
    with open(args.input, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                ex = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"⚠️ Skipping malformed JSON on line {line_no}: {e}")
                continue
            if not isinstance(ex, dict):
                print(f"⚠️ Skipping non-object JSON on line {line_no}")
                continue
            msgs = ex.get("messages")
            if not isinstance(msgs, list):
                msgs = []
            # Tolerate messages that are not objects or lack role/content.
            msgs = [m for m in msgs if isinstance(m, dict)]
            user = next((m.get("content") for m in msgs if m.get("role") == "user"), "")
            asst = next((m.get("content") for m in msgs if m.get("role") == "assistant"), "")
            # Content may be null (e.g. tool-call messages); score it as empty.
            examples.append({"data": ex, "user": user or "", "assistant": asst or ""})

    print(f"Loaded {len(examples)} examples. Scoring with {args.model}...")

    # Score in parallel
    def score_one(idx):
        ex = examples[idx]
        scores = score_example(client, args.model, ex["user"], ex["assistant"], dimensions)
        return idx, scores

    # ThreadPoolExecutor rejects max_workers <= 0, so clamp to at least one.
    workers = max(1, args.concurrency)
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = [pool.submit(score_one, i) for i in range(len(examples))]
        done = 0
        for future in as_completed(futures):
            idx, scores = future.result()
            examples[idx]["scores"] = scores
            done += 1
            if done % 25 == 0:
                print(f"  Scored {done}/{len(examples)}")

    # Calculate stats; all-zero score sets get no "avg_score" and are left out.
    all_avgs = []
    for ex in examples:
        scores = ex.get("scores", {})
        if scores and any(v > 0 for v in scores.values()):
            avg = sum(scores.values()) / len(scores)
            ex["avg_score"] = avg
            all_avgs.append(avg)

    if all_avgs:
        print("\nQuality Distribution:")
        print(f"  Mean:   {sum(all_avgs)/len(all_avgs):.1f}")
        print(f"  Min:    {min(all_avgs):.1f}")
        print(f"  Max:    {max(all_avgs):.1f}")
        print(f"  Median: {median(all_avgs):.1f}")

    # Filter and write
    kept = 0
    filtered = 0
    filtering = args.min_score is not None
    with open(args.output, "w", encoding="utf-8") as f:
        for ex in examples:
            record = ex["data"]
            if args.strip_metadata:
                # Also drop keys inherited from a previously scored input file,
                # otherwise the output would not actually be metadata-free.
                record.pop("_quality_scores", None)
                record.pop("_avg_quality", None)
            else:
                record["_quality_scores"] = ex.get("scores", {})
                record["_avg_quality"] = ex.get("avg_score", 0)

            # Examples without an average count as 0 and fall below any
            # positive threshold.
            if filtering and ex.get("avg_score", 0) < args.min_score:
                filtered += 1
                continue

            f.write(json.dumps(record, ensure_ascii=False) + "\n")
            kept += 1

    print(f"\nKept: {kept}, Filtered: {filtered}")
    if filtering:
        print(f"(min_score threshold: {args.min_score})")
    if args.strip_metadata:
        print("(metadata stripped — output is safe for training input)")
    print(f"Output: {args.output}")
211 
212 
213if __name__ == "__main__":
214    main()
215