| |
| """ |
| Generate a synthetic UltraChat-style SFT dataset. |
| |
| Each JSONL line has: |
| { |
| "prompt": "<first user message>", |
| "messages": [{"content": "...", "role": "user"}, {"content": "...", "role": "assistant"}, ...], |
| "prompt_id": "<sha256 hex of prompt>" |
| } |
| |
| Usage: |
| python scripts/generate_ultrachat_sft.py --out sample_data/train_sft.jsonl --n 500 |
| |
| Only uses Python standard library. |
| """ |
| from __future__ import annotations |
|
|
| import argparse |
| import hashlib |
| import json |
| import os |
| import random |
| import sys |
| from typing import Any, Dict, List, Sequence |
|
|
# Subject areas a synthetic conversation can be about. make_dialogue draws one
# per dialogue and substitutes it into the "{topic}" placeholder of the opener
# and into the canned assistant replies.
# NOTE: entries are selected by index, so inserting, removing, or reordering
# items changes which topics a given seed produces.
TOPICS: List[str] = [
    "travel planning",
    "cooking and recipes",
    "software engineering",
    "data science",
    "machine learning",
    "mathematics",
    "history",
    "literature",
    "productivity",
    "fitness and health",
    "economics",
    "marketing",
    "photography",
    "music theory",
    "language learning",
    "gardening",
    "home improvement",
    "career advice",
    "public speaking",
    "time management",
    "networking",
    "resume writing",
    "interview prep",
    "cloud computing",
    "kubernetes",
    "devops",
    "cybersecurity",
    "robotics",
    "electronics",
    "astronomy",
    "climate and environment",
    "education",
    "parenting",
    "pet care",
    "mental wellness",
    "philosophy",
    "ethics",
    "UX design",
    "UI design",
    "copywriting",
    "game design",
    "board games",
    "coffee brewing",
    "tea brewing",
    "urban planning",
    "transportation",
    "sports science",
    "statistics",
]
|
|
# Templates for the first user message of a dialogue. Each one carries a
# "{topic}" placeholder that make_dialogue fills in with str.format().
USER_OPENERS: List[str] = [
    "Could you help me with {topic}?",
    "What are the key steps to get started with {topic}?",
    "Give me a concise plan to improve at {topic} over 4 weeks.",
    "Explain a few common pitfalls in {topic} and how to avoid them.",
    "Draft a checklist for beginners in {topic}.",
    "Compare two approaches commonly used in {topic} and when to choose each.",
    "I have 2 hours to learn about {topic} today—what should I do?",
    "What metrics matter most in {topic} and how do I track them?",
    "Summarize an actionable framework for {topic} with examples.",
]
|
|
# Candidate second user turns. make_dialogue inspects the lower-cased text of
# the one it picks: a follow-up containing "example" is answered with
# make_examples, one containing "template" with make_template, and every other
# follow-up with a generic step-by-step reply.
FOLLOW_UPS: List[str] = [
    "Nice. Can you add 3 concrete examples?",
    "Could you turn that into a step-by-step guide?",
    "What are likely failure modes and mitigations?",
    "How would a beginner apply this in a weekend project?",
    "Please give a small template I can reuse.",
    "How do I measure progress over a month?",
]
|
|
# Candidate third user turns. make_dialogue appends one of these (followed by
# a fixed assistant reply) only when its random draw falls below 0.6.
DEEPENERS: List[str] = [
    "Great. Any tips to make it resilient under constraints?",
    "How would you adapt this for a small team?",
    "What are ethical considerations I should keep in mind?",
    "What's an underrated practice here and why?",
]
|
|
def make_answer(topic: str, opener: str) -> str:
    """Build the assistant's first reply: a five-step plan plus common pitfalls.

    Args:
        topic: Subject of the conversation; interpolated into the heading and
            into the first two plan steps.
        opener: The user's opening message. Kept for interface compatibility;
            the reply is derived from ``topic`` alone and does not read it.

    Returns:
        A multi-line string made of a heading, five "- " bullets, a blank
        line, and a fixed "Common pitfalls" list. There is no trailing newline.
    """
    steps = (
        f"Define your objective in {topic} (outcome, constraints, timeline).",
        f"Map key concepts and tools in {topic}; pick one stack and stick to it for 2-4 weeks.",
        # The remaining steps are topic-independent, so they are plain strings.
        "Practice with a small, scoped project; iterate with feedback.",
        "Track 1-3 metrics that reflect real progress; review weekly.",
        "Document what worked, what didn't, and the next experiment.",
    )
    pitfalls = (
        "Starting too big; keep scope small.",
        "Tool hopping; commit to one stack.",
        "No feedback loop; schedule reviews.",
        "Unclear metrics; define success upfront.",
    )

    heading = f"Here's a pragmatic path for {topic} based on your request:"
    plan = "\n".join(f"- {step}" for step in steps)
    warnings = "\n".join(f"- {pitfall}" for pitfall in pitfalls)
    return f"{heading}\n{plan}\n\nCommon pitfalls:\n{warnings}"
|
|
|
|
def make_examples(topic: str) -> str:
    """Return three short, numbered example activities for ``topic``.

    Args:
        topic: Subject of the conversation; interpolated into the first two
            examples.

    Returns:
        Three lines joined with newlines, with no trailing newline.
    """
    examples = (
        f"Example 1 — 90-minute sprint: Learn one core concept in {topic} and apply it to a toy task.",
        f"Example 2 — Weekend project: Build a tiny demo that proves a single capability in {topic}.",
        # The third example is topic-independent, so it is a plain string.
        "Example 3 — Peer review: Share results, gather feedback, and improve one dimension.",
    )
    return "\n".join(examples)
|
|
|
|
def make_template(topic: str) -> str:
    """Return a reusable one-page plan template for ``topic``.

    Args:
        topic: Subject of the conversation; interpolated into the title line.

    Returns:
        Seven lines (a title plus six "Field: <hint>" rows). Every line,
        including the last, ends with a newline.
    """
    rows = (
        f"Template — One-page plan for {topic}:",
        # Only the title depends on the topic; the field rows are fixed text.
        "Goal: <1-2 sentences>",
        "Scope: <what's in / out>",
        "Resources: <3 links or docs>",
        "Milestones (weekly): <targets>",
        "Metrics: <quant/qual measures>",
        "Risks & mitigations: <top 3>",
    )
    return "".join(f"{row}\n" for row in rows)
|
|
|
|
def rng_pick(r: random.Random, items: Sequence[str]) -> str:
    """Pick one element of ``items`` using the supplied random generator.

    Args:
        r: Generator that supplies the index; one ``randrange`` call is made.
        items: Sequence to choose from.

    Returns:
        The element at the randomly drawn index.
    """
    index = r.randrange(0, len(items))
    chosen = items[index]
    return chosen
|
|
|
|
def make_dialogue(r: random.Random) -> Dict[str, Any]:
    """Assemble one synthetic conversation record.

    A dialogue always has two user/assistant exchanges (an opener and a
    follow-up). A third exchange is appended when the generator's next draw
    is below 0.6.

    Args:
        r: Random generator that drives every choice. Draws happen in a fixed
            order: topic, opener template, follow-up, the third-turn coin
            flip, and (only when a third turn is added) the deepener.

    Returns:
        A dict with three keys:
        ``"prompt"`` is the first user message,
        ``"messages"`` is the list of ``{"content", "role"}`` dicts alternating
        user and assistant, and
        ``"prompt_id"`` is the SHA-256 hex digest of the UTF-8 encoded prompt.
        Records that share the same prompt text therefore share a prompt_id.
    """
    topic = rng_pick(r, TOPICS)
    opener = rng_pick(r, USER_OPENERS).format(topic=topic)
    first_reply = make_answer(topic, opener)

    # The follow-up is drawn exactly once; the reply below is chosen to match
    # its wording, so this same text must be the one placed in the transcript.
    follow_up = rng_pick(r, FOLLOW_UPS)
    wanted = follow_up.lower()
    if "example" in wanted:
        second_reply = make_examples(topic)
    elif "template" in wanted:
        second_reply = make_template(topic)
    else:
        second_reply = (
            "Here is a step-by-step variant focusing on small wins first, then scope up.\n1) Clarify outcome.\n2) Pick one method.\n3) Build tiny demo.\n4) Review metrics.\n5) Iterate."
        )

    messages: List[Dict[str, str]] = [
        {"content": opener, "role": "user"},
        {"content": first_reply, "role": "assistant"},
        {"content": follow_up, "role": "user"},
        {"content": second_reply, "role": "assistant"},
    ]

    # Optional third exchange. The assistant text is the same fixed reply
    # whichever deepener is picked.
    if r.random() < 0.6:
        deepener = rng_pick(r, DEEPENERS)
        third_reply = (
            "Resilience tips:\n- Use checklists and pre-mortems.\n- Automate a single weak link each week.\n- Keep a rollback plan.\n- Prefer boring, well-documented tools.\n\nEthics: stay transparent, reduce bias, and respect privacy."
        )
        messages.append({"content": deepener, "role": "user"})
        messages.append({"content": third_reply, "role": "assistant"})

    prompt = messages[0]["content"]
    prompt_id = hashlib.sha256(prompt.encode("utf-8")).hexdigest()
    return {"prompt": prompt, "messages": messages, "prompt_id": prompt_id}
|
|
|
|
def generate(n: int, seed: int) -> List[Dict[str, Any]]:
    """Produce ``n`` dialogue records from a single seeded generator.

    Args:
        n: Number of records to build; zero or a negative value yields an
            empty list.
        seed: Seed for the ``random.Random`` instance shared by all records.

    Returns:
        The records in generation order.
    """
    rng = random.Random(seed)
    rows: List[Dict[str, Any]] = []
    for _ in range(n):
        rows.append(make_dialogue(rng))
    return rows
|
|
|
|
def main(argv: List[str]) -> int:
    """Parse command-line options, generate the dataset, and write it as JSONL.

    Args:
        argv: Command-line arguments without the program name. Recognized
            options are ``--out`` (output path), ``--n`` (row count), and
            ``--seed`` (random seed).

    Returns:
        0 on success. argparse exits the process itself on invalid arguments.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", type=str, default="sample_data/train_sft.jsonl")
    parser.add_argument("--n", type=int, default=500)
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args(argv)

    data = generate(args.n, args.seed)
    out_path = args.out

    # A bare filename such as "train.jsonl" has an empty dirname, and
    # os.makedirs("") raises FileNotFoundError, so only create a parent
    # directory when the path actually names one.
    parent_dir = os.path.dirname(out_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(out_path, "w", encoding="utf-8") as f:
        # One JSON object per line; ensure_ascii=False keeps non-ASCII
        # characters (e.g. the em dashes in the canned text) readable.
        f.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in data)

    print(f"Wrote {len(data)} rows to {out_path}")
    return 0
|
|
|
|
# Script entry point: run main() on the command-line arguments (program name
# stripped) and use its return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
|
|