Agent-JAE/default-skills/venice-tts/scripts/text_to_speech.py

"""# Venice.ai Text-to-Speech Instrument
Convert text to speech using Venice.ai TTS API.
Usage: text_to_speech(text, voice="af_sky", format="mp3", speed=1.0, output_path=None)

NOTE: Max input 4096 characters.
"""

import os
import sys
import argparse
import requests
from pathlib import Path
from datetime import datetime

# API Configuration
# The key is read from the environment once, when this module is imported;
# it is None when VENICE_API_KEY is unset.
VENICE_API_URL = "https://api.venice.ai/api/v1/audio/speech"
VENICE_API_KEY = os.getenv("VENICE_API_KEY")

# Defaults
DEFAULT_MODEL = "tts-kokoro"  # Only option currently
DEFAULT_VOICE = "af_sky"  # American female
DEFAULT_FORMAT = "mp3"
DEFAULT_SPEED = 1.0

# All available voices
# Voice IDs follow the Kokoro convention "<language><gender>_<name>": the first
# letter is the language/accent and the second is f (female) or m (male).
VOICES = [
    # American Female
    "af_alloy", "af_aoede", "af_bella", "af_heart", "af_jadzia", "af_jessica",
    "af_kore", "af_nicole", "af_nova", "af_river", "af_sarah", "af_sky",
    # American Male
    "am_adam", "am_echo", "am_eric", "am_fenrir", "am_liam", "am_michael",
    "am_onyx", "am_puck", "am_santa",
    # British Female
    "bf_alice", "bf_emma", "bf_lily",
    # British Male
    "bm_daniel", "bm_fable", "bm_george", "bm_lewis",
    # Chinese Female
    "zf_xiaobei", "zf_xiaoni", "zf_xiaoxiao", "zf_xiaoyi",
    # Chinese Male
    "zm_yunjian", "zm_yunxi", "zm_yunxia", "zm_yunyang",
    # French Female
    "ff_siwis",
    # Hindi (female, then male)
    "hf_alpha", "hf_beta", "hm_omega", "hm_psi",
    # Italian (female, then male)
    "if_sara", "im_nicola",
    # Japanese (female, then male)
    "jf_alpha", "jf_gongitsune", "jf_nezumi", "jf_tebukuro", "jm_kumo",
    # Brazilian Portuguese (female, then male)
    "pf_dora", "pm_alex", "pm_santa",
    # Spanish (female, then male) -- in Kokoro's naming the "e" prefix is
    # Spanish, not English. NOTE(review): confirm against the Venice voice list.
    "ef_dora", "em_alex", "em_santa",
]

# Values accepted for the request's "response_format" field; also used as the
# file extension when the output path has none.
FORMATS = ["mp3", "opus", "aac", "flac", "wav", "pcm"]


def text_to_speech(
    text: str,
    voice: str = DEFAULT_VOICE,
    format: str = DEFAULT_FORMAT,
    speed: float = DEFAULT_SPEED,
    output_path: str = None,
    timeout: float = 60.0,
) -> dict:
    """
    Convert text to speech using Venice.ai TTS and save the audio to disk.

    Args:
        text: Text to convert (non-empty, max 4096 characters)
        voice: Voice ID (default: af_sky); unknown IDs fall back to the
            default with a printed warning
        format: mp3, opus, aac, flac, wav, pcm (default: mp3); unknown values
            fall back to the default with a printed warning
        speed: 0.25-4.0 (default: 1.0); out-of-range values are clamped with
            a printed warning
        output_path: Save path (auto-generated in the current directory if
            not provided). When the path has no suffix, ".<format>" is
            appended. Missing parent directories are created.
        timeout: Seconds to wait for the HTTP request before giving up
            (default: 60.0)

    Returns:
        dict with keys "success" (always True), "voice", "format", "speed",
        "text_length", "audio_size" (bytes) and "output" (absolute path of
        the saved file)

    Raises:
        ValueError: the API key is not configured, or the text is empty or
            longer than 4096 characters
        requests.RequestException: the request failed, timed out, or the API
            answered with a non-2xx status
        OSError: the output file or its directory could not be written
    """
    # Fall back to a fresh environment lookup so that a key exported after
    # this module was imported is still honoured.
    api_key = VENICE_API_KEY or os.getenv("VENICE_API_KEY")
    if not api_key:
        raise ValueError("VENICE_API_KEY environment variable not set")

    # Reject empty input locally instead of spending a request on it.
    if not text or not text.strip():
        raise ValueError("Text is empty")

    if len(text) > 4096:
        raise ValueError(f"Text too long: {len(text)} chars (max 4096)")

    if voice not in VOICES:
        print(f"Warning: Unknown voice '{voice}', using {DEFAULT_VOICE}")
        voice = DEFAULT_VOICE

    if format not in FORMATS:
        print(f"Warning: Unknown format '{format}', using {DEFAULT_FORMAT}")
        format = DEFAULT_FORMAT

    if not (0.25 <= speed <= 4.0):
        print(f"Warning: Speed {speed} out of range, clamping to [0.25, 4.0]")
        speed = max(0.25, min(4.0, speed))

    # Resolve the destination and create its directory BEFORE calling the
    # API, so an unwritable location fails fast rather than after the
    # request has already been made.
    if output_path:
        filepath = Path(output_path)
        if not filepath.suffix:
            filepath = Path(f"{output_path}.{format}")
    else:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filepath = Path(f"speech_{timestamp}.{format}")
    filepath.parent.mkdir(parents=True, exist_ok=True)

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    payload = {
        "input": text,
        "model": DEFAULT_MODEL,
        "voice": voice,
        "response_format": format,
        "speed": speed,
        "streaming": False,
    }

    print(f"Generating speech with voice '{voice}'...")
    print(f"Text: {text[:80]}{'...' if len(text) > 80 else ''}")

    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection.
    response = requests.post(
        VENICE_API_URL, headers=headers, json=payload, timeout=timeout
    )
    response.raise_for_status()

    # Response is binary audio data
    audio_data = response.content

    # Save audio
    filepath.write_bytes(audio_data)
    print(f"Saved: {filepath.absolute()}")

    return {
        "success": True,
        "voice": voice,
        "format": format,
        "speed": speed,
        "text_length": len(text),
        "audio_size": len(audio_data),
        "output": str(filepath.absolute()),
    }


def main():
    """
    Command-line entry point: parse arguments and synthesize speech.

    With --list-voices, prints every known voice ID and returns without
    needing a text argument. Otherwise the positional text is required and is
    passed to text_to_speech() together with the selected options.

    Exits with status 1 (message on stderr) when synthesis fails, and with
    argparse's usage error (status 2) when no text is given.
    """
    parser = argparse.ArgumentParser(description="Venice.ai Text-to-Speech")
    # Optional at the parser level so that --list-voices works on its own;
    # its presence is enforced below for the synthesis path.
    parser.add_argument("text", nargs="?", help="Text to convert (max 4096 chars)")
    parser.add_argument("--voice", "-v", default=DEFAULT_VOICE, help=f"Voice (default: {DEFAULT_VOICE})")
    parser.add_argument("--format", "-f", default=DEFAULT_FORMAT, choices=FORMATS, help="Audio format")
    parser.add_argument("--speed", "-s", type=float, default=DEFAULT_SPEED, help="Speed 0.25-4.0")
    parser.add_argument("--output", "-o", help="Output path")
    parser.add_argument("--list-voices", action="store_true", help="List all voices")

    args = parser.parse_args()

    if args.list_voices:
        print("Available voices:")
        for v in VOICES:
            print(f"  {v}")
        return

    if args.text is None:
        parser.error("the following arguments are required: text")

    # text_to_speech() reports failure by raising, never by returning
    # success=False, so errors are handled here rather than by inspecting
    # the result. requests' exceptions derive from OSError; they are named
    # explicitly for clarity.
    try:
        result = text_to_speech(
            text=args.text,
            voice=args.voice,
            format=args.format,
            speed=args.speed,
            output_path=args.output,
        )
    except (ValueError, requests.RequestException, OSError) as exc:
        print(f"\nError: {exc}", file=sys.stderr)
        sys.exit(1)

    print(f"\nGenerated {result['audio_size']} bytes of audio")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()