Agent-JAE/default-skills/venice-chat-benchmark/scripts/benchmark.py
jae 19b25341bd
Some checks are pending
CI / build-check-test (push) Waiting to run
feat: add 11 Venice AI skills as bundled defaults
Skills included:
- venice-chat: Chat with Venice LLM models, vision, reasoning
- venice-chat-benchmark: Benchmark chat models with infographics
- venice-image-gen: Generate images via Venice API
- venice-list-image-models: List available image models
- venice-list-text-models: List available text models
- venice-list-video-models: List available video models
- venice-tts: Text-to-speech via Venice API
- venice-video-generate: Generate videos from text/images
- venice-video-queue: Queue video generation jobs
- venice-video-quote: Get video generation cost quotes
- venice-video-retrieve: Retrieve completed videos

All rebranded from Agent Zero paths to Agent JAE (~/.jae/agent/skills/).
Requires VENICE_API_KEY environment variable.
2026-03-23 18:46:23 +01:00

618 lines
25 KiB
Python

#!/usr/bin/env python3
"""Venice Chat Model Benchmark - Tests chat completions with tool_choice.
Usage:
python benchmark.py --model minimax-m27 --runs 50 --output /path/to/output_dir
python benchmark.py --model minimax-m27 --runs 50 --output /path/to/output_dir --infographic
"""
import argparse
import json
import os
import subprocess
import sys
import time
import statistics
from datetime import datetime
import requests
# Venice chat-completions endpoint that every benchmark request is POSTed to.
API_URL = "https://api.venice.ai/api/v1/chat/completions"

# === COMPLEX TOOL_CHOICE PAYLOAD (Travel Planning) ===
# The system prompt is assembled one line per tuple entry; joining with "\n"
# yields a single multi-line string with no trailing newline.
SYSTEM_PROMPT = "\n".join((
    "You are an expert travel planning assistant. You MUST call exactly ONE tool on every response. Never respond with plain text. Your response IS the tool call.",
    "Available tools:",
    "- set_travel_dates: Record travel dates",
    "- set_secondary_destinations: Record destinations",
    "- set_traveler_info: Record traveler details",
    "- set_travel_priorities: Record priorities",
    "- set_budget: Record budget",
    "- present_choices: Show clickable choices",
    "- suggest_primary_destinations: Show destination cards",
    "Collect dates first, then travelers, then destinations. Pre-fill from conversation context.",
    "Current itinerary context:",
    "No itinerary data yet.",
))

# The single user turn sent on every run; it mentions dates, travelers,
# interests and budget in one message.
USER_MESSAGE = (
    "My wife and I want to plan a 2-week trip to Japan this October. "
    "We love food, temples, and hiking. Mid-range budget around $6000."
)
def _function_tool(name, description, properties, required):
    """Wrap a parameter schema in the function-tool envelope used in the payload."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": required,
            },
        },
    }


# Tool catalogue attached to every benchmark request, in the same order as the
# tools are listed in SYSTEM_PROMPT.
TOOLS = [
    _function_tool(
        "set_travel_dates",
        "Set the travel dates for the trip. Opens an interactive date picker.",
        {
            "start_date": {"type": "string", "description": "Trip start date YYYY-MM-DD"},
            "end_date": {"type": "string", "description": "Trip end date YYYY-MM-DD"},
            "flexible": {"type": "boolean", "description": "Whether dates are flexible"},
        },
        ["start_date", "end_date"],
    ),
    _function_tool(
        "set_secondary_destinations",
        "Set trip destinations with secondary options.",
        {
            "description": {"type": "string", "description": "Overview of why these destinations fit"},
            "primary": {"type": "string", "description": "Primary destination"},
            "secondary": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "name": {"type": "string"},
                        "transit": {"type": "string"},
                    },
                    "required": ["name", "transit"],
                },
                "description": "4-5 nearby destinations",
            },
        },
        ["description", "primary", "secondary"],
    ),
    _function_tool(
        "set_traveler_info",
        "Capture traveler information.",
        {
            "description": {"type": "string", "description": "Trip vibe and goals"},
            "count": {"type": "integer", "description": "Number of travelers"},
            "interests": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Interest IDs: adventure, hiking, culture, food, street_food, fine_dining, nature, romantic, etc.",
            },
        },
        ["count"],
    ),
    _function_tool(
        "set_travel_priorities",
        "Set what matters most for this trip.",
        {
            "ranked": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Priorities in order: comfort, budget, adventure, culture, food, nature, romantic",
            },
        },
        ["ranked"],
    ),
    _function_tool(
        "set_budget",
        "Set the trip budget.",
        {
            "total": {"type": "number", "description": "Total budget"},
            "currency": {"type": "string", "description": "Currency code"},
        },
        ["total", "currency"],
    ),
    _function_tool(
        "present_choices",
        "Present clickable choices to the user.",
        {
            "message": {"type": "string", "description": "Question to display"},
            "choices": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "label": {"type": "string"},
                        "description": {"type": "string"},
                    },
                    "required": ["label"],
                },
            },
        },
        ["message", "choices"],
    ),
    _function_tool(
        "suggest_primary_destinations",
        "Present rich destination suggestions.",
        {
            "message": {"type": "string", "description": "Heading above cards"},
            "destinations": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "name": {"type": "string"},
                        "tagline": {"type": "string"},
                    },
                    "required": ["name", "tagline"],
                },
            },
        },
        ["message", "destinations"],
    ),
]
def make_request(api_key, model, timeout=120):
    """Send one non-streaming chat completion that offers the benchmark tools.

    The request carries the fixed SYSTEM_PROMPT / USER_MESSAGE conversation,
    the TOOLS catalogue, tool_choice "auto" and temperature 0.7.

    Args:
        api_key: Bearer token placed in the Authorization header.
        model: Model ID placed in the payload.
        timeout: Passed straight through to requests.post.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Whatever requests.post, Response.raise_for_status or Response.json
        raise; nothing is caught here.
    """
    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_MESSAGE},
    ]
    body = dict(
        model=model,
        messages=conversation,
        tools=TOOLS,
        tool_choice="auto",
        temperature=0.7,
        stream=False,
    )
    response = requests.post(
        API_URL,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        json=body,
        timeout=timeout,
    )
    # Surface HTTP error statuses as exceptions before trying to decode the body.
    response.raise_for_status()
    return response.json()
def parse_response(data):
    """Parse the API response and extract key info.

    Only the first entry of ``choices`` is inspected. Fields that are absent,
    null or empty are treated alike, so a structurally incomplete response
    yields a result with defaults instead of an IndexError/KeyError.

    Args:
        data: Decoded JSON body of a chat completion response (a dict).

    Returns:
        A dict with keys:
          finish_reason  -- the choice's finish_reason, or "unknown" if falsy
          has_tool_calls -- True when the message carries at least one tool call
          tool_calls     -- list of dicts with id, name, arguments_raw,
                            arguments_parsed, args_valid_json and, when the
                            arguments failed to parse, json_error
          content        -- the message content (may be None)
          usage          -- the usage dict, or {} when absent/null
    """
    # `or` instead of a .get() default: the default only applies when the key
    # is missing, but {"choices": []} or {"message": null} must not crash.
    choices = data.get("choices") or [{}]
    choice = choices[0] or {}
    msg = choice.get("message") or {}
    raw_tool_calls = msg.get("tool_calls") or []

    result = {
        "finish_reason": choice.get("finish_reason") or "unknown",
        "has_tool_calls": bool(raw_tool_calls),
        "tool_calls": [],
        "content": msg.get("content"),
        "usage": data.get("usage") or {},
    }

    for tc in raw_tool_calls:
        function = tc.get("function") or {}
        arguments = function.get("arguments")
        tool_info = {
            "id": tc.get("id", ""),
            "name": function.get("name") or "unknown",
            "arguments_raw": arguments,
        }
        try:
            tool_info["arguments_parsed"] = json.loads(arguments)
            tool_info["args_valid_json"] = True
        except (json.JSONDecodeError, TypeError) as e:
            # TypeError covers arguments that are missing or not a string.
            tool_info["arguments_parsed"] = None
            tool_info["args_valid_json"] = False
            tool_info["json_error"] = str(e)
        result["tool_calls"].append(tool_info)
    return result
def _nearest_rank_percentile(sorted_values, pct):
    """Return the pct-th percentile of an ascending list (nearest-rank method)."""
    # ceil(n * pct / 100) computed with integer arithmetic only.
    rank = -(-len(sorted_values) * pct // 100)
    return sorted_values[max(rank - 1, 0)]


def run_benchmark(api_key, model, num_runs, output_dir, timeout=120):
    """Run the full benchmark.

    Sends ``num_runs`` sequential requests through make_request(), records one
    entry per run, rewrites ``{output_dir}/benchmark_results.json`` after every
    run (so partial results survive an interruption) and once more after the
    aggregate statistics are computed, then prints a summary.

    Args:
        api_key: Venice API key, forwarded to make_request().
        model: Model ID to benchmark.
        num_runs: Number of requests to send.
        output_dir: Directory for benchmark_results.json; created if missing.
        timeout: Per-request timeout in seconds.

    Returns:
        The results dict with "metadata", "runs" and "stats" keys, identical to
        what is written to disk.
    """
    os.makedirs(output_dir, exist_ok=True)
    results_path = f"{output_dir}/benchmark_results.json"
    rule = "=" * 70

    print(rule)
    print("VENICE CHAT BENCHMARK — Tool Choice Stress Test")
    print(rule)
    print(f"Model: {model}")
    print(f"Runs: {num_runs}")
    print(f"Timeout: {timeout}s per request")
    print(f"Tools: {len(TOOLS)} tools defined")
    print("Tool choice: auto")
    print(f"Started: {datetime.now().isoformat()}")
    print(f"{rule}\n")

    results = {
        "metadata": {
            "model": model,
            "num_runs": num_runs,
            "timeout": timeout,
            "num_tools": len(TOOLS),
            "tool_names": [t["function"]["name"] for t in TOOLS],
            "tool_choice": "auto",
            "system_prompt": SYSTEM_PROMPT,
            "user_message": USER_MESSAGE,
            "start_time": datetime.now().isoformat(),
        },
        "runs": [],
        "stats": {},
    }
    successful_times = []
    tool_call_counts = {}  # which tools get called
    finish_reasons = {}
    errors_list = []

    for run_num in range(1, num_runs + 1):
        run_data = {
            "run": run_num,
            "start_time": datetime.now().isoformat(),
            "success": False,
            "duration_seconds": None,
            "error": None,
            "error_type": None,
            "http_status": None,
            "finish_reason": None,
            "has_tool_calls": False,
            "tool_calls": [],
            "content": None,
            "usage": {},
            "args_valid_json": True,
        }
        # perf_counter is monotonic, so a wall-clock adjustment during a
        # request cannot distort (or negate) the measured latency.
        start = time.perf_counter()
        try:
            raw_response = make_request(api_key, model, timeout=timeout)
            elapsed = time.perf_counter() - start
            parsed = parse_response(raw_response)
            tool_calls = parsed["tool_calls"]
            fr = parsed["finish_reason"] or "none"

            run_data["success"] = True
            run_data["duration_seconds"] = round(elapsed, 3)
            run_data["http_status"] = 200
            run_data["finish_reason"] = fr
            run_data["has_tool_calls"] = parsed["has_tool_calls"]
            run_data["tool_calls"] = tool_calls
            run_data["content"] = parsed["content"]
            run_data["usage"] = parsed["usage"]
            # all() over an empty list is True, so runs without tool calls
            # count as valid.
            all_valid = all(tc.get("args_valid_json", False) for tc in tool_calls)
            run_data["args_valid_json"] = all_valid
            successful_times.append(elapsed)

            # Track finish-reason and tool-call distribution.
            finish_reasons[fr] = finish_reasons.get(fr, 0) + 1
            for tc in tool_calls:
                tn = tc["name"]
                tool_call_counts[tn] = tool_call_counts.get(tn, 0) + 1

            # Display
            tool_names = ", ".join(tc["name"] for tc in tool_calls) if tool_calls else "NONE"
            json_ok = "✓" if all_valid else "✗ BAD JSON"
            content_flag = " +content" if parsed["content"] else ""
            print(f" ✅ Run {run_num:3d}/{num_runs}: {elapsed:6.2f}s | {str(fr):<12} | tools: {tool_names} | json: {json_ok}{content_flag}")
        except requests.exceptions.HTTPError as e:
            elapsed = time.perf_counter() - start
            response = e.response
            status = response.status_code if response is not None else None
            run_data["duration_seconds"] = round(elapsed, 3)
            run_data["error"] = str(e)[:500]
            run_data["error_type"] = "http_error"
            run_data["http_status"] = status
            try:
                err_body = response.json() if response is not None else {}
            except ValueError:
                # Error body is not JSON; keep the exception text captured above.
                err_body = {}
            run_data["error_body"] = err_body
            if err_body:
                # Prefer the structured error body over the generic message.
                run_data["error"] = json.dumps(err_body)[:500]
            errors_list.append({"run": run_num, "type": "http_error", "status": status, "error": run_data["error"][:200]})
            print(f" ❌ Run {run_num:3d}/{num_runs}: {elapsed:6.2f}s | HTTP {status or '???'} - {run_data['error'][:100]}")
        except requests.exceptions.Timeout:
            elapsed = time.perf_counter() - start
            run_data["duration_seconds"] = round(elapsed, 3)
            run_data["error"] = f"Request timed out after {timeout}s"
            run_data["error_type"] = "timeout"
            errors_list.append({"run": run_num, "type": "timeout", "error": run_data["error"]})
            print(f" ❌ Run {run_num:3d}/{num_runs}: {elapsed:6.2f}s | TIMEOUT")
        except requests.exceptions.ConnectionError as e:
            elapsed = time.perf_counter() - start
            run_data["duration_seconds"] = round(elapsed, 3)
            run_data["error"] = str(e)[:500]
            run_data["error_type"] = "connection_error"
            errors_list.append({"run": run_num, "type": "connection_error", "error": str(e)[:200]})
            print(f" ❌ Run {run_num:3d}/{num_runs}: {elapsed:6.2f}s | CONNECTION ERROR - {str(e)[:80]}")
        except json.JSONDecodeError as e:
            elapsed = time.perf_counter() - start
            run_data["duration_seconds"] = round(elapsed, 3)
            run_data["error"] = f"Invalid JSON response: {str(e)[:200]}"
            run_data["error_type"] = "json_decode_error"
            errors_list.append({"run": run_num, "type": "json_decode_error", "error": str(e)[:200]})
            print(f" ❌ Run {run_num:3d}/{num_runs}: {elapsed:6.2f}s | JSON DECODE ERROR")
        except Exception as e:
            # Last-resort boundary: one bad run must not abort the benchmark.
            elapsed = time.perf_counter() - start
            run_data["duration_seconds"] = round(elapsed, 3)
            run_data["error"] = str(e)[:500]
            run_data["error_type"] = type(e).__name__
            errors_list.append({"run": run_num, "type": type(e).__name__, "error": str(e)[:200]})
            print(f" ❌ Run {run_num:3d}/{num_runs}: {elapsed:6.2f}s | {type(e).__name__}: {str(e)[:80]}")

        run_data["end_time"] = datetime.now().isoformat()
        results["runs"].append(run_data)
        # Save intermediate results
        with open(results_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2)

    # === COMPUTE STATS ===
    successful_runs = [r for r in results["runs"] if r["success"]]
    failed_runs = [r for r in results["runs"] if not r["success"]]
    tool_call_runs = [r for r in successful_runs if r["has_tool_calls"]]
    no_tool_runs = [r for r in successful_runs if not r["has_tool_calls"]]
    bad_json_runs = [r for r in successful_runs if not r["args_valid_json"]]
    stats = {
        "total_runs": num_runs,
        "successful_runs": len(successful_runs),
        "failed_runs": len(failed_runs),
        # Guard the denominator: num_runs may be 0 when called programmatically.
        "success_rate": round(len(successful_runs) / num_runs * 100, 1) if num_runs else 0,
        "tool_call_runs": len(tool_call_runs),
        "tool_call_rate": round(len(tool_call_runs) / len(successful_runs) * 100, 1) if successful_runs else 0,
        "no_tool_runs": len(no_tool_runs),
        "bad_json_runs": len(bad_json_runs),
        "json_validity_rate": round((len(tool_call_runs) - len(bad_json_runs)) / len(tool_call_runs) * 100, 1) if tool_call_runs else 0,
        "content_with_tool_calls": len([r for r in tool_call_runs if r["content"]]),
        "tool_call_distribution": tool_call_counts,
        "finish_reasons": finish_reasons,
        "errors": errors_list,
    }
    if successful_times:
        ordered = sorted(successful_times)
        n = len(ordered)
        stats["timing"] = {
            "avg": round(statistics.mean(ordered), 3),
            "median": round(statistics.median(ordered), 3),
            "min": round(ordered[0], 3),
            "max": round(ordered[-1], 3),
            "stdev": round(statistics.stdev(ordered), 3) if n > 1 else 0,
            # Nearest-rank percentiles; indexing with int(n * p) would return
            # the maximum exactly at the sample-size thresholds below.
            "p90": round(_nearest_rank_percentile(ordered, 90), 3) if n >= 10 else None,
            "p95": round(_nearest_rank_percentile(ordered, 95), 3) if n >= 20 else None,
            "p99": round(_nearest_rank_percentile(ordered, 99), 3) if n >= 100 else None,
        }

    # Usage stats (`or 0` tolerates token counts reported as null)
    usages = [r["usage"] for r in successful_runs if r["usage"]]
    if usages:
        prompt_tokens = [u.get("prompt_tokens") or 0 for u in usages]
        completion_tokens = [u.get("completion_tokens") or 0 for u in usages]
        total_tokens = [u.get("total_tokens") or 0 for u in usages]
        stats["token_usage"] = {
            "avg_prompt_tokens": round(statistics.mean(prompt_tokens)),
            "avg_completion_tokens": round(statistics.mean(completion_tokens)),
            "avg_total_tokens": round(statistics.mean(total_tokens)),
            "total_prompt_tokens": sum(prompt_tokens),
            "total_completion_tokens": sum(completion_tokens),
            "total_all_tokens": sum(total_tokens),
        }

    results["stats"] = stats
    results["metadata"]["end_time"] = datetime.now().isoformat()
    # Save final results
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    # === PRINT SUMMARY ===
    print(f"\n{rule}")
    print(f"BENCHMARK COMPLETE — {model}")
    print(rule)
    print("\n📊 Results Summary:")
    print(f" Total runs: {num_runs}")
    print(f" Successful: {stats['successful_runs']} ({stats['success_rate']}%)")
    print(f" Failed: {stats['failed_runs']}")
    print(f" Tool call rate: {stats['tool_call_rate']}% of successful runs")
    print(f" JSON validity: {stats['json_validity_rate']}% of tool calls")
    print(f" Bad JSON args: {stats['bad_json_runs']}")
    print(f" Content + tool call: {stats['content_with_tool_calls']} (ideally 0)")

    if "timing" in stats:
        t = stats["timing"]
        print("\n⏱️ Timing:")
        print(f" Average: {t['avg']}s")
        print(f" Median: {t['median']}s")
        print(f" Min: {t['min']}s")
        print(f" Max: {t['max']}s")
        print(f" Std Dev: {t['stdev']}s")
        if t.get("p90"):
            print(f" P90: {t['p90']}s")
        if t.get("p95"):
            print(f" P95: {t['p95']}s")
        if t.get("p99"):
            print(f" P99: {t['p99']}s")

    if tool_call_counts:
        print("\n🔧 Tool Call Distribution:")
        total_calls = sum(tool_call_counts.values())
        for tn, count in sorted(tool_call_counts.items(), key=lambda x: x[1], reverse=True):
            pct = round(count / total_calls * 100, 1)
            bar = "█" * int(pct / 2)  # one block per 2 percentage points
            print(f" {tn:<35} {count:3d} ({pct:5.1f}%) {bar}")

    if finish_reasons:
        print("\n🏁 Finish Reasons:")
        for fr, count in sorted(finish_reasons.items(), key=lambda x: x[1], reverse=True):
            print(f" {str(fr or 'none'):<20} {count:3d}")

    if errors_list:
        print(f"\n⚠️ Errors ({len(errors_list)}):")
        # Group by type
        error_types = {}
        for e in errors_list:
            et = e["type"]
            error_types[et] = error_types.get(et, 0) + 1
        for et, count in sorted(error_types.items(), key=lambda x: x[1], reverse=True):
            print(f" {et}: {count}")
        # Show first 5 unique errors
        seen = set()
        for e in errors_list:
            key = e["error"][:100]
            if key not in seen:
                seen.add(key)
                print(f" Run {e['run']}: [{e['type']}] {e['error'][:150]}")
                if len(seen) >= 5:
                    break

    if "token_usage" in stats:
        tu = stats["token_usage"]
        print("\n🪙 Token Usage:")
        print(f" Avg prompt: {tu['avg_prompt_tokens']}")
        print(f" Avg completion: {tu['avg_completion_tokens']}")
        print(f" Avg total: {tu['avg_total_tokens']}")
        print(f" Grand total: {tu['total_all_tokens']}")

    print(f"\n📁 Results: {results_path}")
    return results
def generate_infographic(output_dir, api_key):
    """Generate a 4K infographic from benchmark results.

    Reads ``{output_dir}/benchmark_results.json``, turns its stats into an
    image prompt and runs the venice-image-gen skill script as a subprocess
    with VENICE_API_KEY set in its environment.

    Args:
        output_dir: Directory holding benchmark_results.json; the image is
            requested at ``{output_dir}/benchmark_infographic``.
        api_key: Venice API key exported to the subprocess.

    Returns:
        True if the image script exited with code 0; False if the script is
        missing, could not be started, timed out or exited non-zero.
    """
    with open(f"{output_dir}/benchmark_results.json", encoding="utf-8") as f:
        data = json.load(f)
    stats = data["stats"]
    meta = data["metadata"]
    timing = stats.get("timing", {})
    tool_dist = stats.get("tool_call_distribution", {})
    token_usage = stats.get("token_usage", {})
    errors = stats.get("errors", [])
    finish_reasons = stats.get("finish_reasons", {})

    def shown(mapping, key):
        """Render a stat for the prompt; missing and stored-None both read N/A."""
        value = mapping.get(key)
        return "N/A" if value is None else value

    # Build tool distribution text
    tool_lines = []
    if tool_dist:
        total_calls = sum(tool_dist.values())
        for tn, count in sorted(tool_dist.items(), key=lambda x: x[1], reverse=True):
            pct = round(count / total_calls * 100, 1)
            tool_lines.append(f"{tn}: {count} calls ({pct}%)")
    tool_text = ", ".join(tool_lines) if tool_lines else "No tool calls"

    # Finish reasons text
    fr_text = ", ".join(f"{k}: {v}" for k, v in sorted(finish_reasons.items(), key=lambda x: x[1], reverse=True))

    # Error summary
    error_types = {}
    for e in errors:
        error_types[e["type"]] = error_types.get(e["type"], 0) + 1
    error_text = ", ".join(f"{k}: {v}" for k, v in error_types.items()) if error_types else "No errors"

    model = meta["model"]
    total_runs = stats["total_runs"]
    run_date = meta.get("start_time", "")[:10]
    tools_defined = len(meta.get("tool_names", []))

    # Continuation lines stay at column 0 so no indentation leaks into the prompt.
    prompt = f"""Premium dark-themed data infographic titled 'VENICE AI CHAT BENCHMARK' with subtitle 'Tool Choice Stress Test — {model} — {total_runs} Runs — {run_date}'. Sleek modern design with dark navy-black background, neon green and electric cyan accent colors, glowing AI circuit patterns.
Layout: TOP SECTION: Large glowing title banner with AI brain icon. Key stats row: '{total_runs} Total Runs' '{stats["success_rate"]}% Success Rate' '{stats["tool_call_rate"]}% Tool Call Rate' '{stats["json_validity_rate"]}% JSON Valid' '{tools_defined} Tools Defined'.
MIDDLE LEFT: Performance gauge showing Average Response Time {shown(timing, "avg")}s, Median {shown(timing, "median")}s, Min {shown(timing, "min")}s, Max {shown(timing, "max")}s, StdDev {shown(timing, "stdev")}s, P90 {shown(timing, "p90")}s.
MIDDLE RIGHT: Horizontal bar chart of Tool Call Distribution: {tool_text}. Bars in gradient neon colors.
BOTTOM LEFT: Reliability metrics: {stats["successful_runs"]} successful, {stats["failed_runs"]} failed, {stats["bad_json_runs"]} bad JSON responses, {stats["content_with_tool_calls"]} responses had content alongside tool calls. Finish reasons: {fr_text}.
BOTTOM CENTER: Token usage stats: Avg prompt {shown(token_usage, "avg_prompt_tokens")} tokens, Avg completion {shown(token_usage, "avg_completion_tokens")} tokens, Total {shown(token_usage, "total_all_tokens")} tokens across all runs.
BOTTOM RIGHT: Error breakdown: {error_text}.
All text crisp and legible, professional data dashboard style, glowing neon data points, subtle encryption circuit patterns in background. Model name '{model}' prominently displayed."""

    print("\n🎨 Generating 4K infographic...")
    img_output = f"{output_dir}/benchmark_infographic"

    # subprocess does not go through a shell, so "~" must be expanded here.
    script = os.path.expanduser("~/.jae/agent/skills/venice-image-gen/scripts/generate_image.py")
    if not os.path.isfile(script):
        print(f"❌ Infographic generation failed (script not found: {script})")
        return False

    cmd = [
        # Reuse the running interpreter; a bare "python" may not be on PATH.
        sys.executable or "python", script,
        prompt,
        "--resolution", "4K",
        "--aspect_ratio", "16:9",
        "--format", "png",
        "--output", img_output,
    ]
    env = os.environ.copy()
    env["VENICE_API_KEY"] = api_key
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, env=env, timeout=120)
    except subprocess.TimeoutExpired:
        print("❌ Infographic generation failed (timed out after 120s)")
        return False
    except OSError as e:
        print(f"❌ Infographic generation failed (could not start: {e})")
        return False

    print(result.stdout)
    if result.stderr:
        print(result.stderr)
    if result.returncode == 0:
        print(f"✅ Infographic saved to: {img_output}.png")
    else:
        print(f"❌ Infographic generation failed (exit code {result.returncode})")
    return result.returncode == 0
def main(argv=None):
    """Command-line entry point: parse options, run the benchmark, optionally render the infographic.

    Args:
        argv: Argument list to parse; None (the default) makes argparse read
            sys.argv[1:], which is what the script entry guard relies on.

    Exits with status 2 on invalid options (argparse), and with status 1 when
    VENICE_API_KEY is not set or the requested infographic could not be
    generated.
    """
    parser = argparse.ArgumentParser(description="Venice Chat Model Benchmark")
    parser.add_argument("--model", default="minimax-m27", help="Model ID to test")
    parser.add_argument("--runs", type=int, default=50, help="Number of runs")
    parser.add_argument("--timeout", type=int, default=120, help="Request timeout in seconds")
    parser.add_argument("--output", default="~/chat_benchmark", help="Output directory")
    parser.add_argument("--infographic", action="store_true", help="Generate 4K infographic")
    args = parser.parse_args(argv)

    # Reject values that would make the run meaningless (and would divide by
    # zero when the success rate is computed).
    if args.runs < 1:
        parser.error("--runs must be at least 1")
    if args.timeout < 1:
        parser.error("--timeout must be at least 1")

    api_key = os.environ.get("VENICE_API_KEY", "")
    if not api_key:
        print("ERROR: VENICE_API_KEY environment variable not set", file=sys.stderr)
        sys.exit(1)

    # The default contains "~"; without expansion a directory literally named
    # "~" would be created in the current working directory.
    output_dir = os.path.expanduser(args.output)

    run_benchmark(api_key, args.model, args.runs, output_dir, args.timeout)
    if args.infographic and not generate_infographic(output_dir, api_key):
        sys.exit(1)


if __name__ == "__main__":
    main()