Agent-JAE/default-skills/venice-tts/scripts/text_to_speech.py
jae 7fe886fea5
Some checks are pending
CI / build-check-test (push) Waiting to run
feat: add 11 Venice AI skills as bundled defaults
Skills included:
- venice-chat: Chat with Venice LLM models, vision, reasoning
- venice-chat-benchmark: Benchmark chat models with infographics
- venice-image-gen: Generate images via Venice API
- venice-list-image-models: List available image models
- venice-list-text-models: List available text models
- venice-list-video-models: List available video models
- venice-tts: Text-to-speech via Venice API
- venice-video-generate: Generate videos from text/images
- venice-video-queue: Queue video generation jobs
- venice-video-quote: Get video generation cost quotes
- venice-video-retrieve: Retrieve completed videos

All rebranded from Agent Zero paths to Agent JAE (~/.jae/agent/skills/).
Requires VENICE_API_KEY environment variable.
2026-03-23 18:47:33 +01:00

176 lines
5.1 KiB
Python

"""Venice.ai Text-to-Speech Instrument.

Convert text to speech using the Venice.ai TTS API.

Usage: text_to_speech(text, voice="af_sky", format="mp3", speed=1.0, output_path=None)

NOTE: Input is limited to 4096 characters.
"""
import os
import sys
import argparse
import requests
from pathlib import Path
from datetime import datetime
# API Configuration
# Endpoint that text_to_speech() POSTs its JSON payload to.
VENICE_API_URL = "https://api.venice.ai/api/v1/audio/speech"
# Read once at import time; None when the environment variable is unset.
VENICE_API_KEY = os.getenv("VENICE_API_KEY")
# Defaults
DEFAULT_MODEL = "tts-kokoro" # Only option currently
DEFAULT_VOICE = "af_sky" # American female
DEFAULT_FORMAT = "mp3"
DEFAULT_SPEED = 1.0
# All available voices.
# text_to_speech() validates its `voice` argument against this list and falls
# back to DEFAULT_VOICE for anything not listed here.
# IDs appear to follow a "<language><gender>_<name>" pattern (e.g. "af" =
# American female, "bm" = British male), as the group labels below suggest.
VOICES = [
    # American Female
    "af_alloy", "af_aoede", "af_bella", "af_heart", "af_jadzia", "af_jessica",
    "af_kore", "af_nicole", "af_nova", "af_river", "af_sarah", "af_sky",
    # American Male
    "am_adam", "am_echo", "am_eric", "am_fenrir", "am_liam", "am_michael",
    "am_onyx", "am_puck", "am_santa",
    # British Female
    "bf_alice", "bf_emma", "bf_lily",
    # British Male
    "bm_daniel", "bm_fable", "bm_george", "bm_lewis",
    # Chinese Female
    "zf_xiaobei", "zf_xiaoni", "zf_xiaoxiao", "zf_xiaoyi",
    # Chinese Male
    "zm_yunjian", "zm_yunxi", "zm_yunxia", "zm_yunyang",
    # French Female
    "ff_siwis",
    # Hindi
    "hf_alpha", "hf_beta", "hm_omega", "hm_psi",
    # Italian
    "if_sara", "im_nicola",
    # Japanese
    "jf_alpha", "jf_gongitsune", "jf_nezumi", "jf_tebukuro", "jm_kumo",
    # Portuguese
    "pf_dora", "pm_alex", "pm_santa",
    # Spanish
    # NOTE(review): this group was previously labelled "English (generic)";
    # in the Kokoro voice naming scheme the "e" prefix denotes Spanish.
    # Confirm against the Venice voice list.
    "ef_dora", "em_alex", "em_santa",
]
# Audio container/codec names accepted for `response_format`; also used as the
# argparse `choices` for --format in main().
FORMATS = ["mp3", "opus", "aac", "flac", "wav", "pcm"]
# (connect, read) timeouts in seconds for the Venice API call. Without a
# timeout, requests waits forever on a stalled connection. The read timeout is
# deliberately generous because generation time for long inputs is not known.
_REQUEST_TIMEOUT = (10, 120)


def text_to_speech(
    text: str,
    voice: str = DEFAULT_VOICE,
    format: str = DEFAULT_FORMAT,
    speed: float = DEFAULT_SPEED,
    output_path: str = None,
) -> dict:
    """
    Convert text to speech using Venice.ai TTS and save the audio to disk.

    Args:
        text: Text to convert (non-empty, max 4096 characters).
        voice: Voice ID from VOICES (default: af_sky). Unknown IDs fall back
            to the default with a printed warning.
        format: One of mp3, opus, aac, flac, wav, pcm (default: mp3). Unknown
            values fall back to the default with a printed warning.
        speed: Playback speed, 0.25-4.0 (default: 1.0). Out-of-range values
            are clamped with a printed warning.
        output_path: Where to save the audio. If it has no file extension,
            ".<format>" is appended. When omitted, a timestamped
            "speech_YYYYmmdd_HHMMSS.<format>" file is written to the current
            working directory.

    Returns:
        dict with keys "success", "voice", "format", "speed", "text_length",
        "audio_size" (bytes) and "output" (absolute path of the saved file).

    Raises:
        ValueError: If the API key is missing, or the text is empty or longer
            than 4096 characters.
        requests.RequestException: On network failure, timeout, or a non-2xx
            response from the API.
        OSError: If the output file cannot be written.
    """
    # Prefer the key captured at import time, but fall back to the live
    # environment so a key exported after import is still honoured.
    api_key = VENICE_API_KEY or os.getenv("VENICE_API_KEY")
    if not api_key:
        raise ValueError("VENICE_API_KEY environment variable not set")

    # Reject empty input locally instead of spending an API round trip on it.
    if not text or not text.strip():
        raise ValueError("Text is empty")
    if len(text) > 4096:
        raise ValueError(f"Text too long: {len(text)} chars (max 4096)")

    # Soft validation: recoverable problems are corrected with a warning.
    if voice not in VOICES:
        print(f"Warning: Unknown voice '{voice}', using {DEFAULT_VOICE}")
        voice = DEFAULT_VOICE
    if format not in FORMATS:
        print(f"Warning: Unknown format '{format}', using {DEFAULT_FORMAT}")
        format = DEFAULT_FORMAT
    if not (0.25 <= speed <= 4.0):
        print(f"Warning: Speed {speed} out of range, clamping to [0.25, 4.0]")
        speed = max(0.25, min(4.0, speed))

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "input": text,
        "model": DEFAULT_MODEL,
        "voice": voice,
        "response_format": format,
        "speed": speed,
        "streaming": False,
    }

    print(f"Generating speech with voice '{voice}'...")
    print(f"Text: {text[:80]}{'...' if len(text) > 80 else ''}")

    response = requests.post(
        VENICE_API_URL,
        headers=headers,
        json=payload,
        timeout=_REQUEST_TIMEOUT,
    )
    response.raise_for_status()

    # Response body is the binary audio data.
    audio_data = response.content

    # Determine output path.
    if output_path:
        filepath = Path(output_path)
        if not filepath.suffix:
            filepath = Path(f"{output_path}.{format}")
    else:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filepath = Path(f"speech_{timestamp}.{format}")

    # Create missing parent directories so a nested output path does not fail
    # after the audio has already been generated.
    filepath.parent.mkdir(parents=True, exist_ok=True)
    filepath.write_bytes(audio_data)
    print(f"Saved: {filepath.absolute()}")

    return {
        "success": True,
        "voice": voice,
        "format": format,
        "speed": speed,
        "text_length": len(text),
        "audio_size": len(audio_data),
        "output": str(filepath.absolute()),
    }
def main():
    """
    Command-line entry point for the Venice.ai text-to-speech script.

    Parses the arguments, then either lists the known voices (--list-voices)
    or converts the given text to speech and reports the size of the result.

    Exits with status 2 when no text is supplied (standard argparse usage
    error) and with status 1 when the conversion fails.
    """
    parser = argparse.ArgumentParser(description="Venice.ai Text-to-Speech")
    # `text` is optional at the parser level so that --list-voices can be used
    # on its own; its presence is enforced below for the conversion path.
    parser.add_argument("text", nargs="?", help="Text to convert (max 4096 chars)")
    parser.add_argument("--voice", "-v", default=DEFAULT_VOICE, help=f"Voice (default: {DEFAULT_VOICE})")
    parser.add_argument("--format", "-f", default=DEFAULT_FORMAT, choices=FORMATS, help="Audio format")
    parser.add_argument("--speed", "-s", type=float, default=DEFAULT_SPEED, help="Speed 0.25-4.0")
    parser.add_argument("--output", "-o", help="Output path")
    parser.add_argument("--list-voices", action="store_true", help="List all voices")
    args = parser.parse_args()

    if args.list_voices:
        print("Available voices:")
        for v in VOICES:
            print(f" {v}")
        return

    if args.text is None:
        # Same message and exit status argparse uses for a missing positional.
        parser.error("the following arguments are required: text")

    # text_to_speech() signals failure by raising, so report those errors
    # cleanly instead of letting a traceback reach the user.
    try:
        result = text_to_speech(
            text=args.text,
            voice=args.voice,
            format=args.format,
            speed=args.speed,
            output_path=args.output,
        )
    except (ValueError, OSError, requests.RequestException) as exc:
        print(f"\nError: {exc}", file=sys.stderr)
        sys.exit(1)

    print(f"\nGenerated {result['audio_size']} bytes of audio")


if __name__ == "__main__":
    main()