# Agent-JAE/default-skills/venice-chat-benchmark/scripts/benchmark.py

#!/usr/bin/env python3
"""Venice Chat Model Benchmark - Tests chat completions with tool_choice.

Usage:
    python benchmark.py --model minimax-m27 --runs 50 --output /path/to/output_dir
    python benchmark.py --model minimax-m27 --runs 50 --output /path/to/output_dir --infographic
"""

import argparse
import json
import os
import subprocess
import sys
import time
import statistics
from datetime import datetime

import requests

# Chat-completions endpoint; make_request() POSTs every benchmark request here.
API_URL = "https://api.venice.ai/api/v1/chat/completions"

# === COMPLEX TOOL_CHOICE PAYLOAD (Travel Planning) ===

# System message sent with every request. It instructs the model to answer
# with exactly one tool call and lists the same seven tool names that are
# defined in TOOLS below, so the two must be kept in sync when editing.
SYSTEM_PROMPT = """You are an expert travel planning assistant. You MUST call exactly ONE tool on every response. Never respond with plain text. Your response IS the tool call.

Available tools:
- set_travel_dates: Record travel dates
- set_secondary_destinations: Record destinations
- set_traveler_info: Record traveler details
- set_travel_priorities: Record priorities
- set_budget: Record budget
- present_choices: Show clickable choices
- suggest_primary_destinations: Show destination cards

Collect dates first, then travelers, then destinations. Pre-fill from conversation context.

Current itinerary context:
No itinerary data yet."""

# The single user turn sent with every request. It is identical on each run,
# so run-to-run differences come from the model/service, not from the input.
USER_MESSAGE = "My wife and I want to plan a 2-week trip to Japan this October. We love food, temples, and hiking. Mid-range budget around $6000."

# Tool definitions sent as the "tools" field of every request. Each entry is
# a {"type": "function", "function": {...}} object carrying the tool name, a
# description, and a JSON-Schema-style "parameters" object ("properties" plus
# the "required" list). The names match the list given in SYSTEM_PROMPT; the
# benchmark reports how often each of these names is called.
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "set_travel_dates",
            "description": "Set the travel dates for the trip. Opens an interactive date picker.",
            "parameters": {
                "type": "object",
                "properties": {
                    "start_date": {"type": "string", "description": "Trip start date YYYY-MM-DD"},
                    "end_date": {"type": "string", "description": "Trip end date YYYY-MM-DD"},
                    "flexible": {"type": "boolean", "description": "Whether dates are flexible"}
                },
                "required": ["start_date", "end_date"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "set_secondary_destinations",
            "description": "Set trip destinations with secondary options.",
            "parameters": {
                "type": "object",
                "properties": {
                    "description": {"type": "string", "description": "Overview of why these destinations fit"},
                    "primary": {"type": "string", "description": "Primary destination"},
                    "secondary": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "name": {"type": "string"},
                                "transit": {"type": "string"}
                            },
                            "required": ["name", "transit"]
                        },
                        "description": "4-5 nearby destinations"
                    }
                },
                "required": ["description", "primary", "secondary"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "set_traveler_info",
            "description": "Capture traveler information.",
            "parameters": {
                "type": "object",
                "properties": {
                    "description": {"type": "string", "description": "Trip vibe and goals"},
                    "count": {"type": "integer", "description": "Number of travelers"},
                    "interests": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "Interest IDs: adventure, hiking, culture, food, street_food, fine_dining, nature, romantic, etc."
                    }
                },
                "required": ["count"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "set_travel_priorities",
            "description": "Set what matters most for this trip.",
            "parameters": {
                "type": "object",
                "properties": {
                    "ranked": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "Priorities in order: comfort, budget, adventure, culture, food, nature, romantic"
                    }
                },
                "required": ["ranked"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "set_budget",
            "description": "Set the trip budget.",
            "parameters": {
                "type": "object",
                "properties": {
                    "total": {"type": "number", "description": "Total budget"},
                    "currency": {"type": "string", "description": "Currency code"}
                },
                "required": ["total", "currency"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "present_choices",
            "description": "Present clickable choices to the user.",
            "parameters": {
                "type": "object",
                "properties": {
                    "message": {"type": "string", "description": "Question to display"},
                    "choices": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "label": {"type": "string"},
                                "description": {"type": "string"}
                            },
                            "required": ["label"]
                        }
                    }
                },
                "required": ["message", "choices"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "suggest_primary_destinations",
            "description": "Present rich destination suggestions.",
            "parameters": {
                "type": "object",
                "properties": {
                    "message": {"type": "string", "description": "Heading above cards"},
                    "destinations": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "name": {"type": "string"},
                                "tagline": {"type": "string"}
                            },
                            "required": ["name", "tagline"]
                        }
                    }
                },
                "required": ["message", "destinations"]
            }
        }
    }
]


def make_request(api_key, model, timeout=120):
    """Send one non-streaming chat completion that offers TOOLS to the model.

    Args:
        api_key: Bearer token placed in the Authorization header.
        model: Model ID put in the request body.
        timeout: Value forwarded to ``requests.post(timeout=...)``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: via ``raise_for_status()`` when the
            response carries an error status.
    """
    # The conversation is fixed: the module-level system prompt followed by
    # the module-level user message.
    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_MESSAGE},
    ]
    body = {
        "model": model,
        "messages": conversation,
        "tools": TOOLS,
        "tool_choice": "auto",
        "temperature": 0.7,
        "stream": False,
    }

    response = requests.post(
        API_URL,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        json=body,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()


def parse_response(data):
    """Extract the fields the benchmark tracks from a chat-completion response.

    Only the first entry of ``choices`` is inspected. A missing, ``null`` or
    empty ``choices`` list, a ``null`` ``message`` and a ``null`` ``usage`` are
    all treated as "nothing there" instead of raising, so such a response is
    reported as a reply without tool calls.

    Args:
        data: The decoded JSON response body (a dict).

    Returns:
        A dict with:
          * ``finish_reason``  - the choice's finish reason, or ``"unknown"``.
          * ``has_tool_calls`` - True when the message has a non-empty
            ``tool_calls`` list.
          * ``tool_calls``     - one dict per tool call with ``id``, ``name``,
            ``arguments_raw``, ``arguments_parsed`` (None if undecodable),
            ``args_valid_json`` and, on failure, ``json_error``.
          * ``content``        - the message's ``content`` value (may be None).
          * ``usage``          - the response's ``usage`` dict (``{}`` if absent).

    Raises:
        KeyError: if a tool call lacks ``function``, ``name`` or ``arguments``.
    """
    # ``or`` (not a .get default) so that an explicit null or an empty list is
    # normalised the same way as a missing key.
    choices = data.get("choices") or [{}]
    choice = choices[0] or {}
    msg = choice.get("message") or {}
    raw_calls = msg.get("tool_calls") or []

    result = {
        "finish_reason": choice.get("finish_reason") or "unknown",
        "has_tool_calls": bool(raw_calls),
        "tool_calls": [],
        "content": msg.get("content"),
        "usage": data.get("usage") or {},
    }

    for tc in raw_calls:
        function = tc["function"]
        raw_args = function["arguments"]
        tool_info = {
            "id": tc.get("id", ""),
            "name": function["name"],
            "arguments_raw": raw_args,
        }
        try:
            tool_info["arguments_parsed"] = json.loads(raw_args)
            tool_info["args_valid_json"] = True
        except (json.JSONDecodeError, TypeError) as e:
            # TypeError covers arguments that are not a string at all.
            tool_info["arguments_parsed"] = None
            tool_info["args_valid_json"] = False
            tool_info["json_error"] = str(e)
        result["tool_calls"].append(tool_info)

    return result


def _save_results(results, output_dir):
    """Write *results* as indented JSON to ``{output_dir}/benchmark_results.json``."""
    with open(f"{output_dir}/benchmark_results.json", "w") as f:
        json.dump(results, f, indent=2)


def _classify_error(exc, timeout):
    """Translate an exception raised by one run into the data recorded for it.

    Returns a ``(run_fields, error_entry, display)`` tuple:
      * ``run_fields``  - keys to merge into the run's record,
      * ``error_entry`` - the entry (without ``"run"``) for the stats error list,
      * ``display``     - the short text printed after the elapsed time.

    The checks run from most to least specific; anything unrecognised is
    reported under the exception's class name.
    """
    text = str(exc)

    if isinstance(exc, requests.exceptions.HTTPError):
        response = getattr(exc, "response", None)
        status = getattr(response, "status_code", None)
        err_body = {}
        # Test against None explicitly rather than relying on truthiness.
        if response is not None:
            try:
                err_body = response.json()
            except ValueError:
                # Body is not JSON; the exception text stays the error message.
                err_body = {}
        # Prefer the server's JSON error body; fall back to the exception text
        # when there is no usable body.
        error = json.dumps(err_body)[:500] if err_body else text[:500]
        run_fields = {
            "error": error,
            "error_type": "http_error",
            "http_status": status,
            "error_body": err_body,
        }
        error_entry = {"type": "http_error", "status": status, "error": error[:200]}
        return run_fields, error_entry, f"HTTP {status or '???'} - {error[:100]}"

    if isinstance(exc, requests.exceptions.Timeout):
        error = f"Request timed out after {timeout}s"
        return (
            {"error": error, "error_type": "timeout"},
            {"type": "timeout", "error": error},
            "TIMEOUT",
        )

    if isinstance(exc, requests.exceptions.ConnectionError):
        return (
            {"error": text[:500], "error_type": "connection_error"},
            {"type": "connection_error", "error": text[:200]},
            f"CONNECTION ERROR - {text[:80]}",
        )

    if isinstance(exc, json.JSONDecodeError):
        return (
            {"error": f"Invalid JSON response: {text[:200]}", "error_type": "json_decode_error"},
            {"type": "json_decode_error", "error": text[:200]},
            "JSON DECODE ERROR",
        )

    kind = type(exc).__name__
    return (
        {"error": text[:500], "error_type": kind},
        {"type": kind, "error": text[:200]},
        f"{kind}: {text[:80]}",
    )


def _compute_stats(runs, num_runs, successful_times, tool_call_counts, finish_reasons, errors_list):
    """Aggregate the per-run records into the ``stats`` section of the results."""
    successful_runs = [r for r in runs if r["success"]]
    failed_runs = [r for r in runs if not r["success"]]
    tool_call_runs = [r for r in successful_runs if r["has_tool_calls"]]
    no_tool_runs = [r for r in successful_runs if not r["has_tool_calls"]]
    bad_json_runs = [r for r in successful_runs if not r["args_valid_json"]]

    stats = {
        "total_runs": num_runs,
        "successful_runs": len(successful_runs),
        "failed_runs": len(failed_runs),
        # Guard the division so a run count of 0 yields 0 instead of crashing.
        "success_rate": round(len(successful_runs) / num_runs * 100, 1) if num_runs else 0,
        "tool_call_runs": len(tool_call_runs),
        "tool_call_rate": round(len(tool_call_runs) / len(successful_runs) * 100, 1) if successful_runs else 0,
        "no_tool_runs": len(no_tool_runs),
        "bad_json_runs": len(bad_json_runs),
        "json_validity_rate": round((len(tool_call_runs) - len(bad_json_runs)) / len(tool_call_runs) * 100, 1) if tool_call_runs else 0,
        "content_with_tool_calls": len([r for r in tool_call_runs if r["content"]]),
        "tool_call_distribution": tool_call_counts,
        "finish_reasons": finish_reasons,
        "errors": errors_list,
    }

    if successful_times:
        ordered = sorted(successful_times)  # sort once for all percentiles
        n = len(ordered)

        def percentile(fraction, min_samples):
            # Reported only when there are at least min_samples timings.
            if n < min_samples:
                return None
            return round(ordered[int(n * fraction)], 3)

        stats["timing"] = {
            "avg": round(statistics.mean(successful_times), 3),
            "median": round(statistics.median(successful_times), 3),
            "min": round(ordered[0], 3),
            "max": round(ordered[-1], 3),
            "stdev": round(statistics.stdev(successful_times), 3) if n > 1 else 0,
            "p90": percentile(0.9, 10),
            "p95": percentile(0.95, 20),
            "p99": percentile(0.99, 100),
        }

    # Token usage is averaged over successful runs that reported any usage.
    usages = [r["usage"] for r in successful_runs if r["usage"]]
    if usages:
        prompt_tokens = [u.get("prompt_tokens", 0) for u in usages]
        completion_tokens = [u.get("completion_tokens", 0) for u in usages]
        total_tokens = [u.get("total_tokens", 0) for u in usages]
        stats["token_usage"] = {
            "avg_prompt_tokens": round(statistics.mean(prompt_tokens)),
            "avg_completion_tokens": round(statistics.mean(completion_tokens)),
            "avg_total_tokens": round(statistics.mean(total_tokens)),
            "total_prompt_tokens": sum(prompt_tokens),
            "total_completion_tokens": sum(completion_tokens),
            "total_all_tokens": sum(total_tokens),
        }

    return stats


def _print_summary(model, num_runs, stats, output_dir):
    """Print the human-readable end-of-benchmark report for *stats*."""
    tool_call_counts = stats["tool_call_distribution"]
    finish_reasons = stats["finish_reasons"]
    errors_list = stats["errors"]

    print(f"\n{'='*70}")
    print(f"BENCHMARK COMPLETE — {model}")
    print(f"{'='*70}")
    print("\n📊 Results Summary:")
    print(f"  Total runs:           {num_runs}")
    print(f"  Successful:           {stats['successful_runs']} ({stats['success_rate']}%)")
    print(f"  Failed:               {stats['failed_runs']}")
    print(f"  Tool call rate:       {stats['tool_call_rate']}% of successful runs")
    print(f"  JSON validity:        {stats['json_validity_rate']}% of tool calls")
    print(f"  Bad JSON args:        {stats['bad_json_runs']}")
    print(f"  Content + tool call:  {stats['content_with_tool_calls']} (ideally 0)")

    if "timing" in stats:
        t = stats["timing"]
        print("\n⏱️  Timing:")
        print(f"  Average:    {t['avg']}s")
        print(f"  Median:     {t['median']}s")
        print(f"  Min:        {t['min']}s")
        print(f"  Max:        {t['max']}s")
        print(f"  Std Dev:    {t['stdev']}s")
        if t.get("p90"):
            print(f"  P90:        {t['p90']}s")
        if t.get("p95"):
            print(f"  P95:        {t['p95']}s")

    if tool_call_counts:
        print("\n🔧 Tool Call Distribution:")
        total_calls = sum(tool_call_counts.values())
        for tn, count in sorted(tool_call_counts.items(), key=lambda x: x[1], reverse=True):
            pct = round(count / total_calls * 100, 1)
            bar = "█" * int(pct / 2)
            print(f"  {tn:<35} {count:3d} ({pct:5.1f}%) {bar}")

    if finish_reasons:
        print("\n🏁 Finish Reasons:")
        for fr, count in sorted(finish_reasons.items(), key=lambda x: x[1], reverse=True):
            print(f"  {str(fr or 'none'):<20} {count:3d}")

    if errors_list:
        print(f"\n⚠️  Errors ({len(errors_list)}):")
        # Group by type
        error_types = {}
        for e in errors_list:
            error_types[e["type"]] = error_types.get(e["type"], 0) + 1
        for et, count in sorted(error_types.items(), key=lambda x: x[1], reverse=True):
            print(f"  {et}: {count}")
        # Show first 5 unique errors (uniqueness judged on the first 100 chars)
        seen = set()
        for e in errors_list:
            key = e["error"][:100]
            if key not in seen:
                seen.add(key)
                print(f"  Run {e['run']}: [{e['type']}] {e['error'][:150]}")
                if len(seen) >= 5:
                    break

    if "token_usage" in stats:
        tu = stats["token_usage"]
        print("\n🪙 Token Usage:")
        print(f"  Avg prompt:     {tu['avg_prompt_tokens']}")
        print(f"  Avg completion: {tu['avg_completion_tokens']}")
        print(f"  Avg total:      {tu['avg_total_tokens']}")
        print(f"  Grand total:    {tu['total_all_tokens']}")

    print(f"\n📁 Results: {output_dir}/benchmark_results.json")


def run_benchmark(api_key, model, num_runs, output_dir, timeout=120):
    """Run the full benchmark.

    Sends ``num_runs`` sequential requests via ``make_request``, records one
    entry per run (timing, finish reason, tool calls, errors), rewrites
    ``{output_dir}/benchmark_results.json`` after every run so partial results
    survive an interruption, then computes aggregate stats and prints a summary.

    Args:
        api_key: API key forwarded to ``make_request``.
        model: Model ID forwarded to ``make_request``.
        num_runs: Number of requests to send.
        output_dir: Directory for the results file; created if missing.
        timeout: Per-request timeout in seconds.

    Returns:
        The results dict with ``metadata``, ``runs`` and ``stats`` sections
        (the same structure that is written to the JSON file).
    """
    os.makedirs(output_dir, exist_ok=True)

    print(f"{'='*70}")
    print("VENICE CHAT BENCHMARK — Tool Choice Stress Test")
    print(f"{'='*70}")
    print(f"Model:        {model}")
    print(f"Runs:         {num_runs}")
    print(f"Timeout:      {timeout}s per request")
    print(f"Tools:        {len(TOOLS)} tools defined")
    print("Tool choice:  auto")
    print(f"Started:      {datetime.now().isoformat()}")
    print(f"{'='*70}\n")

    results = {
        "metadata": {
            "model": model,
            "num_runs": num_runs,
            "timeout": timeout,
            "num_tools": len(TOOLS),
            "tool_names": [t["function"]["name"] for t in TOOLS],
            "tool_choice": "auto",
            "system_prompt": SYSTEM_PROMPT,
            "user_message": USER_MESSAGE,
            "start_time": datetime.now().isoformat(),
        },
        "runs": [],
        "stats": {},
    }

    successful_times = []
    tool_call_counts = {}  # tool name -> number of calls across all runs
    finish_reasons = {}    # finish reason -> number of successful runs
    errors_list = []

    for run_num in range(1, num_runs + 1):
        run_data = {
            "run": run_num,
            "start_time": datetime.now().isoformat(),
            "success": False,
            "duration_seconds": None,
            "error": None,
            "error_type": None,
            "http_status": None,
            "finish_reason": None,
            "has_tool_calls": False,
            "tool_calls": [],
            "content": None,
            "usage": {},
            "args_valid_json": True,
        }

        start = time.time()
        try:
            raw_response = make_request(api_key, model, timeout=timeout)
            # Stop the clock before parsing so only the request is timed.
            elapsed = time.time() - start
            parsed = parse_response(raw_response)
        except Exception as e:
            # Deliberately broad: one failed request must be recorded as a
            # failed run, not abort the remaining runs.
            elapsed = time.time() - start
            run_fields, error_entry, display = _classify_error(e, timeout)
            run_data["duration_seconds"] = round(elapsed, 3)
            run_data.update(run_fields)
            errors_list.append({"run": run_num, **error_entry})
            print(f"  ❌ Run {run_num:3d}/{num_runs}: {elapsed:6.2f}s | {display}")
        else:
            tool_calls = parsed["tool_calls"]
            fr = parsed["finish_reason"] or "none"
            # A run without tool calls counts as valid (all() of nothing).
            all_valid = all(tc.get("args_valid_json", False) for tc in tool_calls)

            run_data["success"] = True
            run_data["duration_seconds"] = round(elapsed, 3)
            # make_request() only returns after raise_for_status() passed and
            # does not expose the exact status code, so 200 is recorded.
            run_data["http_status"] = 200
            run_data["finish_reason"] = fr
            run_data["has_tool_calls"] = parsed["has_tool_calls"]
            run_data["tool_calls"] = tool_calls
            run_data["content"] = parsed["content"]
            run_data["usage"] = parsed["usage"]
            run_data["args_valid_json"] = all_valid

            successful_times.append(elapsed)

            # Count under the string form so the key is always hashable and
            # JSON-serialisable.
            fr_key = str(fr)
            finish_reasons[fr_key] = finish_reasons.get(fr_key, 0) + 1
            for tc in tool_calls:
                tool_call_counts[tc["name"]] = tool_call_counts.get(tc["name"], 0) + 1

            # Display
            tool_names = ", ".join(tc["name"] for tc in tool_calls) if tool_calls else "NONE"
            json_ok = "✓" if all_valid else "✗ BAD JSON"
            content_flag = " +content" if parsed["content"] else ""
            print(f"  ✅ Run {run_num:3d}/{num_runs}: {elapsed:6.2f}s | {str(fr):<12} | tools: {tool_names} | json: {json_ok}{content_flag}")

        run_data["end_time"] = datetime.now().isoformat()
        results["runs"].append(run_data)

        # Save intermediate results
        _save_results(results, output_dir)

    results["stats"] = _compute_stats(
        results["runs"], num_runs, successful_times,
        tool_call_counts, finish_reasons, errors_list,
    )
    results["metadata"]["end_time"] = datetime.now().isoformat()

    # Save final results
    _save_results(results, output_dir)

    _print_summary(model, num_runs, results["stats"], output_dir)
    return results


def generate_infographic(output_dir, api_key):
    """Generate a 4K infographic from benchmark results.

    Reads ``{output_dir}/benchmark_results.json``, turns its stats into a text
    prompt and runs the venice-image-gen ``generate_image.py`` script as a
    subprocess, asking it to write ``{output_dir}/benchmark_infographic``.

    Args:
        output_dir: Directory holding ``benchmark_results.json``; also the
            destination passed to the image script.
        api_key: Exported to the subprocess as ``VENICE_API_KEY``.

    Returns:
        True when the image script exits with code 0; False when it exits
        non-zero, cannot be started, or does not finish within 120 seconds.

    Raises:
        OSError / json.JSONDecodeError / KeyError: if the results file is
            missing, unreadable, or lacks the ``stats``/``metadata`` sections.
    """
    with open(f"{output_dir}/benchmark_results.json") as f:
        data = json.load(f)

    stats = data["stats"]
    meta = data["metadata"]
    timing = stats.get("timing", {})
    tool_dist = stats.get("tool_call_distribution", {})
    token_usage = stats.get("token_usage", {})
    errors = stats.get("errors", [])
    finish_reasons = stats.get("finish_reasons", {})

    # The p90 key can be present with a null value (too few samples), which a
    # .get() default would not catch.
    p90 = timing.get("p90")
    p90_text = "N/A" if p90 is None else p90

    # Build tool distribution text
    tool_lines = []
    if tool_dist:
        total_calls = sum(tool_dist.values())
        for tn, count in sorted(tool_dist.items(), key=lambda x: x[1], reverse=True):
            pct = round(count / total_calls * 100, 1)
            tool_lines.append(f"{tn}: {count} calls ({pct}%)")
    tool_text = ", ".join(tool_lines) if tool_lines else "No tool calls"

    # Finish reasons text
    fr_text = ", ".join(f"{k}: {v}" for k, v in sorted(finish_reasons.items(), key=lambda x: x[1], reverse=True))

    # Error summary
    error_types = {}
    for e in errors:
        error_types[e["type"]] = error_types.get(e["type"], 0) + 1
    error_text = ", ".join(f"{k}: {v}" for k, v in error_types.items()) if error_types else "No errors"

    prompt = f"""Premium dark-themed data infographic titled 'VENICE AI CHAT BENCHMARK' with subtitle 'Tool Choice Stress Test — {meta["model"]} — {stats["total_runs"]} Runs — {meta.get("start_time","")[:10]}'. Sleek modern design with dark navy-black background, neon green and electric cyan accent colors, glowing AI circuit patterns.

Layout: TOP SECTION: Large glowing title banner with AI brain icon. Key stats row: '{stats["total_runs"]} Total Runs' '{stats["success_rate"]}% Success Rate' '{stats["tool_call_rate"]}% Tool Call Rate' '{stats["json_validity_rate"]}% JSON Valid' '{len(meta.get("tool_names",[]))} Tools Defined'.

MIDDLE LEFT: Performance gauge showing Average Response Time {timing.get("avg","N/A")}s, Median {timing.get("median","N/A")}s, Min {timing.get("min","N/A")}s, Max {timing.get("max","N/A")}s, StdDev {timing.get("stdev","N/A")}s, P90 {p90_text}s.

MIDDLE RIGHT: Horizontal bar chart of Tool Call Distribution: {tool_text}. Bars in gradient neon colors.

BOTTOM LEFT: Reliability metrics: {stats["successful_runs"]} successful, {stats["failed_runs"]} failed, {stats["bad_json_runs"]} bad JSON responses, {stats["content_with_tool_calls"]} responses had content alongside tool calls. Finish reasons: {fr_text}.

BOTTOM CENTER: Token usage stats: Avg prompt {token_usage.get("avg_prompt_tokens","N/A")} tokens, Avg completion {token_usage.get("avg_completion_tokens","N/A")} tokens, Total {token_usage.get("total_all_tokens","N/A")} tokens across all runs.

BOTTOM RIGHT: Error breakdown: {error_text}.

All text crisp and legible, professional data dashboard style, glowing neon data points, subtle encryption circuit patterns in background. Model name '{meta["model"]}' prominently displayed."""

    print("\n🎨 Generating 4K infographic...")
    img_output = f"{output_dir}/benchmark_infographic"

    # No shell is involved, so "~" must be expanded here or the script path
    # would be taken literally.
    script = os.path.expanduser("~/.jae/agent/skills/venice-image-gen/scripts/generate_image.py")
    cmd = [
        # Run the script with the interpreter running this benchmark; a bare
        # "python" is not guaranteed to be on PATH.
        sys.executable or "python", script,
        prompt,
        "--resolution", "4K",
        "--aspect_ratio", "16:9",
        "--format", "png",
        "--output", img_output
    ]

    env = os.environ.copy()
    env["VENICE_API_KEY"] = api_key

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, env=env, timeout=120)
    except subprocess.TimeoutExpired:
        print("❌ Infographic generation failed (timed out after 120s)")
        return False
    except (OSError, subprocess.SubprocessError) as exc:
        # e.g. the interpreter could not be started.
        print(f"❌ Infographic generation failed ({type(exc).__name__}: {exc})")
        return False

    print(result.stdout)
    if result.stderr:
        print(result.stderr)

    if result.returncode == 0:
        print(f"✅ Infographic saved to: {img_output}.png")
    else:
        print(f"❌ Infographic generation failed (exit code {result.returncode})")

    return result.returncode == 0


def main():
    """Parse the command line, run the benchmark and optionally the infographic.

    Reads the API key from the ``VENICE_API_KEY`` environment variable and
    exits with status 1 when it is not set.
    """
    parser = argparse.ArgumentParser(description="Venice Chat Model Benchmark")
    parser.add_argument("--model", default="minimax-m27", help="Model ID to test")
    parser.add_argument("--runs", type=int, default=50, help="Number of runs")
    parser.add_argument("--timeout", type=int, default=120, help="Request timeout in seconds")
    parser.add_argument("--output", default="~/chat_benchmark", help="Output directory")
    parser.add_argument("--infographic", action="store_true", help="Generate 4K infographic")
    args = parser.parse_args()

    api_key = os.environ.get("VENICE_API_KEY", "")
    if not api_key:
        print("ERROR: VENICE_API_KEY environment variable not set")
        sys.exit(1)

    # The default value (and any quoted argument) reaches us with a literal
    # "~"; expand it once here so both steps use the same real directory
    # instead of creating a directory named "~" in the working directory.
    output_dir = os.path.expanduser(args.output)

    run_benchmark(api_key, args.model, args.runs, output_dir, args.timeout)

    if args.infographic:
        generate_infographic(output_dir, api_key)


if __name__ == "__main__":
    main()