Skills included:
- venice-chat: Chat with Venice LLM models, vision, reasoning
- venice-chat-benchmark: Benchmark chat models with infographics
- venice-image-gen: Generate images via Venice API
- venice-list-image-models: List available image models
- venice-list-text-models: List available text models
- venice-list-video-models: List available video models
- venice-tts: Text-to-speech via Venice API
- venice-video-generate: Generate videos from text/images
- venice-video-queue: Queue video generation jobs
- venice-video-quote: Get video generation cost quotes
- venice-video-retrieve: Retrieve completed videos

All rebranded from Agent Zero paths to Agent JAE (~/.jae/agent/skills/). Requires the VENICE_API_KEY environment variable.
618 lines
25 KiB
Python
618 lines
25 KiB
Python
#!/usr/bin/env python3
|
|
"""Venice Chat Model Benchmark - Tests chat completions with tool_choice.
|
|
|
|
Usage:
|
|
python benchmark.py --model minimax-m27 --runs 50 --output /path/to/output_dir
|
|
python benchmark.py --model minimax-m27 --runs 50 --output /path/to/output_dir --infographic
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
import statistics
|
|
from datetime import datetime
|
|
|
|
import requests
|
|
|
|
# Chat completions endpoint that every benchmark request is POSTed to.
API_URL = "https://api.venice.ai/api/v1/chat/completions"

# === COMPLEX TOOL_CHOICE PAYLOAD (Travel Planning) ===

# System message sent with every request. It instructs the model to answer
# with exactly one tool call and never with plain text, and lists the tools
# by name. Requests are sent with tool_choice "auto", so whether the model
# actually follows this instruction is what the benchmark records.
SYSTEM_PROMPT = """You are an expert travel planning assistant. You MUST call exactly ONE tool on every response. Never respond with plain text. Your response IS the tool call.

Available tools:
- set_travel_dates: Record travel dates
- set_secondary_destinations: Record destinations
- set_traveler_info: Record traveler details
- set_travel_priorities: Record priorities
- set_budget: Record budget
- present_choices: Show clickable choices
- suggest_primary_destinations: Show destination cards

Collect dates first, then travelers, then destinations. Pre-fill from conversation context.

Current itinerary context:
No itinerary data yet."""

# The single user turn sent with every request. It mentions dates, travelers,
# interests and a budget, so several of the tools are plausible choices.
USER_MESSAGE = "My wife and I want to plan a 2-week trip to Japan this October. We love food, temples, and hiking. Mid-range budget around $6000."
|
|
|
|
# Tool definitions sent as the "tools" field of every request. Each entry has
# the shape {"type": "function", "function": {name, description, parameters}},
# where "parameters" describes the arguments as an object schema with
# "properties" and "required" keys. The tool names match the list given to
# the model in SYSTEM_PROMPT.
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "set_travel_dates",
            "description": "Set the travel dates for the trip. Opens an interactive date picker.",
            "parameters": {
                "type": "object",
                "properties": {
                    "start_date": {"type": "string", "description": "Trip start date YYYY-MM-DD"},
                    "end_date": {"type": "string", "description": "Trip end date YYYY-MM-DD"},
                    "flexible": {"type": "boolean", "description": "Whether dates are flexible"}
                },
                "required": ["start_date", "end_date"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "set_secondary_destinations",
            "description": "Set trip destinations with secondary options.",
            "parameters": {
                "type": "object",
                "properties": {
                    "description": {"type": "string", "description": "Overview of why these destinations fit"},
                    "primary": {"type": "string", "description": "Primary destination"},
                    # Nested array of objects: each secondary stop needs a name and a transit note.
                    "secondary": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "name": {"type": "string"},
                                "transit": {"type": "string"}
                            },
                            "required": ["name", "transit"]
                        },
                        "description": "4-5 nearby destinations"
                    }
                },
                "required": ["description", "primary", "secondary"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "set_traveler_info",
            "description": "Capture traveler information.",
            "parameters": {
                "type": "object",
                "properties": {
                    "description": {"type": "string", "description": "Trip vibe and goals"},
                    "count": {"type": "integer", "description": "Number of travelers"},
                    "interests": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "Interest IDs: adventure, hiking, culture, food, street_food, fine_dining, nature, romantic, etc."
                    }
                },
                # Only the traveler count is mandatory.
                "required": ["count"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "set_travel_priorities",
            "description": "Set what matters most for this trip.",
            "parameters": {
                "type": "object",
                "properties": {
                    "ranked": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "Priorities in order: comfort, budget, adventure, culture, food, nature, romantic"
                    }
                },
                "required": ["ranked"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "set_budget",
            "description": "Set the trip budget.",
            "parameters": {
                "type": "object",
                "properties": {
                    "total": {"type": "number", "description": "Total budget"},
                    "currency": {"type": "string", "description": "Currency code"}
                },
                "required": ["total", "currency"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "present_choices",
            "description": "Present clickable choices to the user.",
            "parameters": {
                "type": "object",
                "properties": {
                    "message": {"type": "string", "description": "Question to display"},
                    # Each choice needs a label; its description is optional.
                    "choices": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "label": {"type": "string"},
                                "description": {"type": "string"}
                            },
                            "required": ["label"]
                        }
                    }
                },
                "required": ["message", "choices"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "suggest_primary_destinations",
            "description": "Present rich destination suggestions.",
            "parameters": {
                "type": "object",
                "properties": {
                    "message": {"type": "string", "description": "Heading above cards"},
                    "destinations": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "name": {"type": "string"},
                                "tagline": {"type": "string"}
                            },
                            "required": ["name", "tagline"]
                        }
                    }
                },
                "required": ["message", "destinations"]
            }
        }
    }
]
|
|
|
|
|
|
def make_request(api_key, model, timeout=120):
    """Send one non-streaming chat completion request that offers every tool.

    Args:
        api_key: Key placed in the ``Authorization: Bearer`` header.
        model: Model ID to put in the request payload.
        timeout: Timeout in seconds handed to ``requests.post``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: via ``raise_for_status()`` when the
            response carries an error status.
    """
    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_MESSAGE},
    ]
    body = dict(
        model=model,
        messages=conversation,
        tools=TOOLS,
        tool_choice="auto",
        temperature=0.7,
        stream=False,
    )

    response = requests.post(
        API_URL,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        json=body,
        timeout=timeout,
    )
    # Error statuses become exceptions so the caller can record them per run.
    response.raise_for_status()
    return response.json()
|
|
|
|
|
|
def parse_response(data):
    """Extract the fields the benchmark records from a chat completion response.

    Only the first entry of ``choices`` is inspected. A missing, ``null`` or
    empty ``choices`` list, a ``null`` ``message`` and a ``null`` ``usage``
    are all treated as empty values instead of raising.

    Args:
        data: Decoded JSON response body (a dict).

    Returns:
        A dict with the keys:
            finish_reason: The choice's finish reason, or "unknown" if absent.
            has_tool_calls: True when the message carries at least one tool call.
            tool_calls: One dict per tool call with ``id``, ``name``,
                ``arguments_raw``, ``arguments_parsed`` (None when the
                arguments are not valid JSON), ``args_valid_json`` and, on
                failure, ``json_error``.
            content: The message content (may be None).
            usage: The response's ``usage`` dict (empty if absent).

    Raises:
        KeyError: If a tool call lacks ``function``, ``name`` or ``arguments``.
    """
    # `or` (rather than a .get default) also covers explicit nulls and an
    # empty list, which would otherwise fail on the [0] lookup below.
    choices = data.get("choices") or [{}]
    choice = choices[0] or {}
    msg = choice.get("message") or {}
    raw_calls = msg.get("tool_calls") or []

    result = {
        "finish_reason": choice.get("finish_reason") or "unknown",
        "has_tool_calls": bool(raw_calls),
        "tool_calls": [],
        "content": msg.get("content"),
        "usage": data.get("usage") or {},
    }

    for tc in raw_calls:
        function = tc["function"]
        raw_args = function["arguments"]
        tool_info = {
            "id": tc.get("id", ""),
            "name": function["name"],
            "arguments_raw": raw_args,
        }
        try:
            tool_info["arguments_parsed"] = json.loads(raw_args)
            tool_info["args_valid_json"] = True
        except (json.JSONDecodeError, TypeError) as e:
            # TypeError covers arguments that are not a string at all.
            tool_info["arguments_parsed"] = None
            tool_info["args_valid_json"] = False
            tool_info["json_error"] = str(e)
        result["tool_calls"].append(tool_info)

    return result
|
|
|
|
|
|
def run_benchmark(api_key, model, num_runs, output_dir, timeout=120):
    """Run the benchmark: issue ``num_runs`` requests and record each outcome.

    Every run sends the same request via ``make_request`` and is recorded as
    either a success (with timing, finish reason, tool calls and token usage)
    or a failure (with an error type and message). The results file
    ``{output_dir}/benchmark_results.json`` is rewritten after every run, so
    partial results remain on disk if the process is interrupted, and once
    more at the end with the aggregated statistics.

    Args:
        api_key: API key passed through to ``make_request``.
        model: Model ID to benchmark.
        num_runs: Number of requests to issue.
        output_dir: Directory for the results file; created if missing.
        timeout: Per-request timeout in seconds.

    Returns:
        The results dict with ``metadata``, ``runs`` and ``stats`` keys.
    """
    os.makedirs(output_dir, exist_ok=True)
    results_path = f"{output_dir}/benchmark_results.json"

    banner = "=" * 70
    print(banner)
    print("VENICE CHAT BENCHMARK — Tool Choice Stress Test")
    print(banner)
    print(f"Model: {model}")
    print(f"Runs: {num_runs}")
    print(f"Timeout: {timeout}s per request")
    print(f"Tools: {len(TOOLS)} tools defined")
    print("Tool choice: auto")
    print(f"Started: {datetime.now().isoformat()}")
    print(f"{banner}\n")

    results = {
        "metadata": {
            "model": model,
            "num_runs": num_runs,
            "timeout": timeout,
            "num_tools": len(TOOLS),
            "tool_names": [t["function"]["name"] for t in TOOLS],
            "tool_choice": "auto",
            "system_prompt": SYSTEM_PROMPT,
            "user_message": USER_MESSAGE,
            "start_time": datetime.now().isoformat(),
        },
        "runs": [],
        "stats": {},
    }

    successful_times = []
    tool_call_counts = {}  # tool name -> number of times it was called
    finish_reasons = {}  # finish reason -> number of successful runs
    errors_list = []

    for run_num in range(1, num_runs + 1):
        run_data = {
            "run": run_num,
            "start_time": datetime.now().isoformat(),
            "success": False,
            "duration_seconds": None,
            "error": None,
            "error_type": None,
            "http_status": None,
            "finish_reason": None,
            "has_tool_calls": False,
            "tool_calls": [],
            "content": None,
            "usage": {},
            "args_valid_json": True,
        }
        prefix = f" ❌ Run {run_num:3d}/{num_runs}:"

        # Taken before the try block so every handler can compute `elapsed`.
        start = time.time()
        try:
            raw_response = make_request(api_key, model, timeout=timeout)
            elapsed = time.time() - start

            parsed = parse_response(raw_response)
            fr = parsed["finish_reason"] or "none"

            run_data["success"] = True
            run_data["duration_seconds"] = round(elapsed, 3)
            run_data["http_status"] = 200
            run_data["finish_reason"] = fr
            run_data["has_tool_calls"] = parsed["has_tool_calls"]
            run_data["tool_calls"] = parsed["tool_calls"]
            run_data["content"] = parsed["content"]
            run_data["usage"] = parsed["usage"]

            # True when there are no tool calls (all() of an empty sequence).
            all_valid = all(tc.get("args_valid_json", False) for tc in parsed["tool_calls"])
            run_data["args_valid_json"] = all_valid

            successful_times.append(elapsed)
            finish_reasons[fr] = finish_reasons.get(fr, 0) + 1
            for tc in parsed["tool_calls"]:
                tn = tc["name"]
                tool_call_counts[tn] = tool_call_counts.get(tn, 0) + 1

            tool_names = ", ".join(tc["name"] for tc in parsed["tool_calls"]) if parsed["tool_calls"] else "NONE"
            json_ok = "✓" if all_valid else "✗ BAD JSON"
            content_flag = " +content" if parsed["content"] else ""
            print(f" ✅ Run {run_num:3d}/{num_runs}: {elapsed:6.2f}s | {str(fr):<12} | tools: {tool_names} | json: {json_ok}{content_flag}")

        except requests.exceptions.HTTPError as e:
            elapsed = time.time() - start
            run_data["duration_seconds"] = round(elapsed, 3)
            run_data["error"] = str(e)[:500]
            run_data["error_type"] = "http_error"
            response = e.response
            status = response.status_code if response is not None else None
            run_data["http_status"] = status
            try:
                err_body = response.json() if response is not None else {}
                run_data["error_body"] = err_body
                run_data["error"] = json.dumps(err_body)[:500]
            except ValueError:
                # Error body is not JSON; keep the exception text as the error.
                run_data["error_body"] = {}
            errors_list.append({"run": run_num, "type": "http_error", "status": status, "error": run_data["error"][:200]})
            print(f"{prefix} {elapsed:6.2f}s | HTTP {status or '???'} - {run_data['error'][:100]}")

        except requests.exceptions.Timeout:
            elapsed = time.time() - start
            run_data["duration_seconds"] = round(elapsed, 3)
            run_data["error"] = f"Request timed out after {timeout}s"
            run_data["error_type"] = "timeout"
            errors_list.append({"run": run_num, "type": "timeout", "error": run_data["error"]})
            print(f"{prefix} {elapsed:6.2f}s | TIMEOUT")

        except requests.exceptions.ConnectionError as e:
            elapsed = time.time() - start
            run_data["duration_seconds"] = round(elapsed, 3)
            run_data["error"] = str(e)[:500]
            run_data["error_type"] = "connection_error"
            errors_list.append({"run": run_num, "type": "connection_error", "error": str(e)[:200]})
            print(f"{prefix} {elapsed:6.2f}s | CONNECTION ERROR - {str(e)[:80]}")

        except json.JSONDecodeError as e:
            elapsed = time.time() - start
            run_data["duration_seconds"] = round(elapsed, 3)
            run_data["error"] = f"Invalid JSON response: {str(e)[:200]}"
            run_data["error_type"] = "json_decode_error"
            errors_list.append({"run": run_num, "type": "json_decode_error", "error": str(e)[:200]})
            print(f"{prefix} {elapsed:6.2f}s | JSON DECODE ERROR")

        except Exception as e:
            # Deliberately broad: one unexpected failure is recorded as a
            # failed run instead of aborting the remaining runs.
            elapsed = time.time() - start
            run_data["duration_seconds"] = round(elapsed, 3)
            run_data["error"] = str(e)[:500]
            run_data["error_type"] = type(e).__name__
            errors_list.append({"run": run_num, "type": type(e).__name__, "error": str(e)[:200]})
            print(f"{prefix} {elapsed:6.2f}s | {type(e).__name__}: {str(e)[:80]}")

        run_data["end_time"] = datetime.now().isoformat()
        results["runs"].append(run_data)

        # Save intermediate results after every run.
        with open(results_path, "w") as f:
            json.dump(results, f, indent=2)

    stats = _compute_benchmark_stats(
        results["runs"], num_runs, successful_times,
        tool_call_counts, finish_reasons, errors_list,
    )
    results["stats"] = stats
    results["metadata"]["end_time"] = datetime.now().isoformat()

    # Save final results, now including the aggregated stats.
    with open(results_path, "w") as f:
        json.dump(results, f, indent=2)

    _print_benchmark_summary(model, num_runs, stats, output_dir)
    return results


def _compute_benchmark_stats(runs, num_runs, successful_times, tool_call_counts, finish_reasons, errors_list):
    """Aggregate the per-run records into the dict stored under ``results["stats"]``."""
    successful_runs = [r for r in runs if r["success"]]
    failed_runs = [r for r in runs if not r["success"]]
    tool_call_runs = [r for r in successful_runs if r["has_tool_calls"]]
    no_tool_runs = [r for r in successful_runs if not r["has_tool_calls"]]
    bad_json_runs = [r for r in successful_runs if not r["args_valid_json"]]

    stats = {
        "total_runs": num_runs,
        "successful_runs": len(successful_runs),
        "failed_runs": len(failed_runs),
        # Guarded so a zero-run invocation reports 0 instead of dividing by zero.
        "success_rate": round(len(successful_runs) / num_runs * 100, 1) if num_runs > 0 else 0,
        "tool_call_runs": len(tool_call_runs),
        "tool_call_rate": round(len(tool_call_runs) / len(successful_runs) * 100, 1) if successful_runs else 0,
        "no_tool_runs": len(no_tool_runs),
        "bad_json_runs": len(bad_json_runs),
        "json_validity_rate": round((len(tool_call_runs) - len(bad_json_runs)) / len(tool_call_runs) * 100, 1) if tool_call_runs else 0,
        "content_with_tool_calls": len([r for r in tool_call_runs if r["content"]]),
        "tool_call_distribution": tool_call_counts,
        "finish_reasons": finish_reasons,
        "errors": errors_list,
    }

    if successful_times:
        ordered = sorted(successful_times)
        n = len(ordered)
        # Percentiles are only reported once there are enough samples for
        # the index to be meaningful; otherwise they are None.
        stats["timing"] = {
            "avg": round(statistics.mean(successful_times), 3),
            "median": round(statistics.median(successful_times), 3),
            "min": round(ordered[0], 3),
            "max": round(ordered[-1], 3),
            "stdev": round(statistics.stdev(successful_times), 3) if n > 1 else 0,
            "p90": round(ordered[int(n * 0.9)], 3) if n >= 10 else None,
            "p95": round(ordered[int(n * 0.95)], 3) if n >= 20 else None,
            "p99": round(ordered[int(n * 0.99)], 3) if n >= 100 else None,
        }

    # Token usage, over successful runs that reported a non-empty usage dict.
    usages = [r["usage"] for r in successful_runs if r["usage"]]
    if usages:
        # `or 0` also covers a field that is present but null.
        prompt_tokens = [u.get("prompt_tokens") or 0 for u in usages]
        completion_tokens = [u.get("completion_tokens") or 0 for u in usages]
        total_tokens = [u.get("total_tokens") or 0 for u in usages]
        stats["token_usage"] = {
            "avg_prompt_tokens": round(statistics.mean(prompt_tokens)),
            "avg_completion_tokens": round(statistics.mean(completion_tokens)),
            "avg_total_tokens": round(statistics.mean(total_tokens)),
            "total_prompt_tokens": sum(prompt_tokens),
            "total_completion_tokens": sum(completion_tokens),
            "total_all_tokens": sum(total_tokens),
        }

    return stats


def _print_benchmark_summary(model, num_runs, stats, output_dir):
    """Print the human-readable end-of-benchmark summary to stdout."""
    banner = "=" * 70
    print(f"\n{banner}")
    print(f"BENCHMARK COMPLETE — {model}")
    print(banner)
    print("\n📊 Results Summary:")
    print(f" Total runs: {num_runs}")
    print(f" Successful: {stats['successful_runs']} ({stats['success_rate']}%)")
    print(f" Failed: {stats['failed_runs']}")
    print(f" Tool call rate: {stats['tool_call_rate']}% of successful runs")
    print(f" JSON validity: {stats['json_validity_rate']}% of tool calls")
    print(f" Bad JSON args: {stats['bad_json_runs']}")
    print(f" Content + tool call: {stats['content_with_tool_calls']} (ideally 0)")

    if "timing" in stats:
        t = stats["timing"]
        print("\n⏱️ Timing:")
        print(f" Average: {t['avg']}s")
        print(f" Median: {t['median']}s")
        print(f" Min: {t['min']}s")
        print(f" Max: {t['max']}s")
        print(f" Std Dev: {t['stdev']}s")
        if t.get("p90"):
            print(f" P90: {t['p90']}s")
        if t.get("p95"):
            print(f" P95: {t['p95']}s")

    tool_call_counts = stats["tool_call_distribution"]
    if tool_call_counts:
        print("\n🔧 Tool Call Distribution:")
        total_calls = sum(tool_call_counts.values())
        for tn, count in sorted(tool_call_counts.items(), key=lambda x: x[1], reverse=True):
            pct = round(count / total_calls * 100, 1)
            bar = "█" * int(pct / 2)  # one bar character per two percent
            print(f" {tn:<35} {count:3d} ({pct:5.1f}%) {bar}")

    finish_reasons = stats["finish_reasons"]
    if finish_reasons:
        print("\n🏁 Finish Reasons:")
        for fr, count in sorted(finish_reasons.items(), key=lambda x: x[1], reverse=True):
            print(f" {str(fr or 'none'):<20} {count:3d}")

    errors_list = stats["errors"]
    if errors_list:
        print(f"\n⚠️ Errors ({len(errors_list)}):")
        # Group by type, most frequent first.
        error_types = {}
        for err in errors_list:
            error_types[err["type"]] = error_types.get(err["type"], 0) + 1
        for et, count in sorted(error_types.items(), key=lambda x: x[1], reverse=True):
            print(f" {et}: {count}")
        # Show the first 5 distinct errors, keyed on their first 100 characters.
        seen = set()
        for err in errors_list:
            key = err["error"][:100]
            if key in seen:
                continue
            seen.add(key)
            print(f" Run {err['run']}: [{err['type']}] {err['error'][:150]}")
            if len(seen) >= 5:
                break

    if "token_usage" in stats:
        tu = stats["token_usage"]
        print("\n🪙 Token Usage:")
        print(f" Avg prompt: {tu['avg_prompt_tokens']}")
        print(f" Avg completion: {tu['avg_completion_tokens']}")
        print(f" Avg total: {tu['avg_total_tokens']}")
        print(f" Grand total: {tu['total_all_tokens']}")

    print(f"\n📁 Results: {output_dir}/benchmark_results.json")
|
|
|
|
|
|
def generate_infographic(output_dir, api_key):
    """Generate a 4K infographic from the saved benchmark results.

    Reads ``{output_dir}/benchmark_results.json``, turns its stats into an
    image prompt, and runs the ``venice-image-gen`` skill's
    ``generate_image.py`` script in a subprocess with ``VENICE_API_KEY`` set
    in its environment.

    Args:
        output_dir: Directory holding ``benchmark_results.json``; the image is
            requested at ``{output_dir}/benchmark_infographic``.
        api_key: API key exported to the subprocess.

    Returns:
        True if the generator script exited with status 0. False if it exited
        non-zero, timed out, or could not be started.
    """
    with open(f"{output_dir}/benchmark_results.json") as f:
        data = json.load(f)

    stats = data["stats"]
    meta = data["metadata"]
    timing = stats.get("timing", {})
    tool_dist = stats.get("tool_call_distribution", {})
    token_usage = stats.get("token_usage", {})
    errors = stats.get("errors", [])
    finish_reasons = stats.get("finish_reasons", {})

    def shown(mapping, key):
        # Percentiles are stored as null when there were too few samples;
        # render those (and missing keys) as "N/A" rather than "None".
        # An `is None` test keeps legitimate zeros such as a stdev of 0.
        value = mapping.get(key)
        return "N/A" if value is None else value

    # Tool distribution text, most-called tool first.
    tool_lines = []
    if tool_dist:
        total_calls = sum(tool_dist.values())
        for tn, count in sorted(tool_dist.items(), key=lambda x: x[1], reverse=True):
            pct = round(count / total_calls * 100, 1)
            tool_lines.append(f"{tn}: {count} calls ({pct}%)")
    tool_text = ", ".join(tool_lines) if tool_lines else "No tool calls"

    # Finish reasons text, most frequent first.
    fr_text = ", ".join(f"{k}: {v}" for k, v in sorted(finish_reasons.items(), key=lambda x: x[1], reverse=True))

    # Error summary: count of errors per type.
    error_types = {}
    for err in errors:
        error_types[err["type"]] = error_types.get(err["type"], 0) + 1
    error_text = ", ".join(f"{k}: {v}" for k, v in error_types.items()) if error_types else "No errors"

    prompt = f"""Premium dark-themed data infographic titled 'VENICE AI CHAT BENCHMARK' with subtitle 'Tool Choice Stress Test — {meta['model']} — {stats['total_runs']} Runs — {meta.get('start_time', '')[:10]}'. Sleek modern design with dark navy-black background, neon green and electric cyan accent colors, glowing AI circuit patterns.

Layout: TOP SECTION: Large glowing title banner with AI brain icon. Key stats row: '{stats['total_runs']} Total Runs' '{stats['success_rate']}% Success Rate' '{stats['tool_call_rate']}% Tool Call Rate' '{stats['json_validity_rate']}% JSON Valid' '{len(meta.get('tool_names', []))} Tools Defined'.

MIDDLE LEFT: Performance gauge showing Average Response Time {shown(timing, 'avg')}s, Median {shown(timing, 'median')}s, Min {shown(timing, 'min')}s, Max {shown(timing, 'max')}s, StdDev {shown(timing, 'stdev')}s, P90 {shown(timing, 'p90')}s.

MIDDLE RIGHT: Horizontal bar chart of Tool Call Distribution: {tool_text}. Bars in gradient neon colors.

BOTTOM LEFT: Reliability metrics: {stats['successful_runs']} successful, {stats['failed_runs']} failed, {stats['bad_json_runs']} bad JSON responses, {stats['content_with_tool_calls']} responses had content alongside tool calls. Finish reasons: {fr_text}.

BOTTOM CENTER: Token usage stats: Avg prompt {shown(token_usage, 'avg_prompt_tokens')} tokens, Avg completion {shown(token_usage, 'avg_completion_tokens')} tokens, Total {shown(token_usage, 'total_all_tokens')} tokens across all runs.

BOTTOM RIGHT: Error breakdown: {error_text}.

All text crisp and legible, professional data dashboard style, glowing neon data points, subtle encryption circuit patterns in background. Model name '{meta['model']}' prominently displayed."""

    print("\n🎨 Generating 4K infographic...")
    img_output = f"{output_dir}/benchmark_infographic"

    # No shell is involved, so "~" must be expanded here; the interpreter
    # running this script is reused instead of whatever "python" is on PATH.
    script_path = os.path.expanduser("~/.jae/agent/skills/venice-image-gen/scripts/generate_image.py")
    cmd = [
        sys.executable, script_path,
        prompt,
        "--resolution", "4K",
        "--aspect_ratio", "16:9",
        "--format", "png",
        "--output", img_output,
    ]

    env = os.environ.copy()
    env["VENICE_API_KEY"] = api_key

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, env=env, timeout=120)
    except subprocess.TimeoutExpired:
        print("❌ Infographic generation failed (timed out after 120s)")
        return False
    except OSError as exc:
        # The interpreter could not be started at all.
        print(f"❌ Infographic generation failed ({exc})")
        return False

    print(result.stdout)
    if result.stderr:
        print(result.stderr)

    if result.returncode == 0:
        print(f"✅ Infographic saved to: {img_output}.png")
    else:
        print(f"❌ Infographic generation failed (exit code {result.returncode})")

    return result.returncode == 0
|
|
|
|
|
|
def main():
    """Command-line entry point: parse arguments and run the benchmark.

    Reads the API key from the ``VENICE_API_KEY`` environment variable and
    exits with status 1 if it is not set. ``--runs`` values below 1 are
    rejected through ``parser.error``. When ``--infographic`` is given, the
    infographic is generated after the benchmark finishes.
    """
    parser = argparse.ArgumentParser(description="Venice Chat Model Benchmark")
    parser.add_argument("--model", default="minimax-m27", help="Model ID to test")
    parser.add_argument("--runs", type=int, default=50, help="Number of runs")
    parser.add_argument("--timeout", type=int, default=120, help="Request timeout in seconds")
    parser.add_argument("--output", default="~/chat_benchmark", help="Output directory")
    parser.add_argument("--infographic", action="store_true", help="Generate 4K infographic")
    args = parser.parse_args()

    if args.runs < 1:
        parser.error("--runs must be at least 1")

    api_key = os.environ.get("VENICE_API_KEY", "")
    if not api_key:
        print("ERROR: VENICE_API_KEY environment variable not set")
        sys.exit(1)

    # The default output path starts with "~", which only a shell would
    # expand; without this a literal "~" directory is created in the
    # current working directory.
    output_dir = os.path.expanduser(args.output)

    run_benchmark(api_key, args.model, args.runs, output_dir, args.timeout)

    if args.infographic:
        generate_infographic(output_dir, api_key)
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|