FocusFlowAI / voice.py
avaliev's picture
Upload 9 files
58ca1d0 verified
"""
ElevenLabs Voice Integration for FocusFlow.
Provides optional voice feedback for focus agent and Pomodoro timer.
Gracefully falls back to text-only mode if API key is missing or quota exceeded.
"""
import os
import tempfile
from typing import Optional, Dict
from pathlib import Path
class VoiceGenerator:
    """
    Handles text-to-speech generation using the ElevenLabs API.

    Designed for graceful degradation: a missing API key, a missing
    ``elevenlabs`` package, an API error or a file-system error is reported
    with ``print`` and surfaces to callers as ``None`` (or
    ``available == False``) instead of as an exception.

    Attributes (all assigned by :meth:`initialize`):
        client: ElevenLabs client instance, or None when voice is unavailable.
        available: True only after the client was constructed successfully.
        voice_id: Voice identifier sent with every request.
        model_id: Model identifier sent with every request.
    """

    def __init__(self):
        """Initialize ElevenLabs client if API key available."""
        self.initialize()

    def initialize(self):
        """
        Initialize or re-initialize the client.

        Resets the instance to the "unavailable" state first, so a failed
        re-initialization never leaves a stale client behind. The API key is
        taken from ``DEMO_ELEVEN_API_KEY`` and, if that is unset or empty,
        from ``ELEVEN_API_KEY``.
        """
        self.client = None
        self.available = False
        self.voice_id = "JBFqnCBsd6RMkjVDRZzb"  # George - friendly, clear voice
        self.model_id = "eleven_turbo_v2_5"  # Fast, low-latency model
        try:
            # Check for API key (demo key first, then user key). The demo key
            # is read once so the key that is used and the key type that is
            # reported cannot disagree.
            demo_key = os.getenv("DEMO_ELEVEN_API_KEY")
            api_key = demo_key or os.getenv("ELEVEN_API_KEY")
            if not api_key:
                print("ℹ️ ElevenLabs: No API key found. Voice feedback disabled (text-only mode).")
                return
            # Imported lazily so the module still loads when the optional
            # package is not installed.
            from elevenlabs.client import ElevenLabs
            self.client = ElevenLabs(api_key=api_key)
            self.available = True
            key_type = "demo" if demo_key else "user"
            print(f"✅ ElevenLabs voice initialized ({key_type} key)")
        except ImportError:
            print("⚠️ ElevenLabs: Package not installed. Run: pip install elevenlabs")
        except Exception as e:
            print(f"⚠️ ElevenLabs: Initialization failed: {e}")

    @staticmethod
    def _write_temp_audio(audio_bytes: bytes) -> str:
        """Write audio bytes to a new temporary ``.mp3`` file and return its path."""
        temp_file = tempfile.NamedTemporaryFile(
            delete=False,
            suffix=".mp3",
            prefix="focusflow_voice_",
        )
        try:
            # The context manager closes the handle even if the write fails.
            with temp_file:
                temp_file.write(audio_bytes)
        except Exception:
            # delete=False means nobody else will clean up a partial file.
            try:
                os.remove(temp_file.name)
            except OSError:
                pass
            raise
        return temp_file.name

    def text_to_speech(self, text: str, emotion: str = "neutral") -> Optional[str]:
        """
        Convert text to speech and return path to temporary audio file.

        The file is created with ``delete=False``; it is not removed by this
        class, so the caller owns it once the path is returned.

        Args:
            text: Text to convert to speech. Empty or whitespace-only text
                yields None without contacting the API.
            emotion: Emotion hint (not used in current implementation)

        Returns:
            Path to temporary MP3 file, or None if voice is disabled via
            ``VOICE_ENABLED=false``, unavailable, there is nothing to say,
            or generation failed.
        """
        # Check if voice is enabled globally
        if os.getenv("VOICE_ENABLED", "true").lower() == "false":
            return None
        if not self.available or not self.client:
            return None
        # Nothing to speak: skip the request rather than spend quota on a
        # call that cannot produce audio.
        if not text or not text.strip():
            return None
        try:
            # Generate audio using ElevenLabs API
            audio = self.client.text_to_speech.convert(
                text=text,
                voice_id=self.voice_id,
                model_id=self.model_id,
                output_format="mp3_44100_128",
            )
            # Convert generator/stream to bytes; tolerate a client that
            # already returns the whole payload as a bytes object.
            if isinstance(audio, (bytes, bytearray)):
                audio_bytes = bytes(audio)
            else:
                audio_bytes = b"".join(audio)
            if not audio_bytes:
                # An empty file is not playable; fall back to text-only.
                print("⚠️ ElevenLabs: TTS returned no audio data")
                return None
            # Save to temporary file (Gradio expects file path, not data URL)
            return self._write_temp_audio(audio_bytes)
        except Exception as e:
            # Graceful degradation - log error but don't crash
            print(f"⚠️ ElevenLabs: TTS failed: {e}")
            return None

    def get_focus_message_audio(self, verdict: str, message: str) -> Optional[str]:
        """
        Generate voice feedback for focus check results.

        Args:
            verdict: "On Track", "Distracted", or "Idle"; any other value is
                treated as "neutral".
            message: Text message to speak

        Returns:
            Path to temporary audio file or None
        """
        if not self.available:
            return None
        # Add emotion/tone based on verdict (for future voice modulation)
        emotion_map = {
            "On Track": "cheerful",
            "Distracted": "concerned",
            "Idle": "motivating",
        }
        emotion = emotion_map.get(verdict, "neutral")
        return self.text_to_speech(message, emotion=emotion)

    def get_pomodoro_audio(self, event_type: str) -> Optional[str]:
        """
        Generate voice alerts for Pomodoro timer events.

        Args:
            event_type: "work_complete" or "break_complete"; any other value
                speaks the generic "Timer complete!" message.

        Returns:
            Path to temporary audio file or None
        """
        if not self.available:
            return None
        messages = {
            "work_complete": "Great work! Time for a 5-minute break. You've earned it!",
            "break_complete": "Break's over! Let's get back to work and stay focused!",
        }
        message = messages.get(event_type, "Timer complete!")
        return self.text_to_speech(message, emotion="cheerful")

    def test_voice(self) -> Dict[str, Optional[str]]:
        """
        Test voice generation (for setup/debugging).

        Returns:
            Dict with keys "status" ("unavailable", "success" or "error"),
            "message" (human-readable description) and "audio" (path to the
            generated file, or None when no audio was produced).
        """
        if not self.available:
            return {
                "status": "unavailable",
                "message": "Voice not available (no API key or initialization failed)",
                "audio": None,
            }
        try:
            test_message = "Hello! FocusFlow voice is working perfectly!"
            audio = self.text_to_speech(test_message)
            if audio:
                return {
                    "status": "success",
                    "message": "Voice test successful!",
                    "audio": audio,
                }
            return {
                "status": "error",
                "message": "Voice generation failed",
                "audio": None,
            }
        except Exception as e:
            return {
                "status": "error",
                "message": f"Voice test failed: {str(e)}",
                "audio": None,
            }
# Global voice generator instance, created once when this module is imported.
# Constructing it runs VoiceGenerator.initialize(), which reads the API-key
# environment variables and prints a one-line status message.
voice_generator = VoiceGenerator()
def get_voice_status() -> str:
    """
    Describe, for display in the UI, whether voice feedback is active.

    Returns:
        "✅ ElevenLabs Voice Enabled" when the module-level generator reports
        itself available, otherwise "ℹ️ Voice Disabled (text-only mode)".
    """
    enabled_label = "✅ ElevenLabs Voice Enabled"
    disabled_label = "ℹ️ Voice Disabled (text-only mode)"
    return enabled_label if voice_generator.available else disabled_label