"""Main Gradio application for DeepCritical research agent.
This module provides the Gradio interface with:
- OAuth authentication via HuggingFace
- Multimodal input support (text, images, audio)
- Research agent orchestration
- Real-time event streaming
- MCP server integration
"""
import os
from collections.abc import AsyncGenerator
from typing import Any
import gradio as gr
import numpy as np
import structlog
from src.agent_factory.judges import HFInferenceJudgeHandler, JudgeHandler, MockJudgeHandler
from src.orchestrator_factory import create_orchestrator
from src.services.multimodal_processing import get_multimodal_service
from src.utils.config import settings
from src.utils.models import AgentEvent, OrchestratorConfig
# Import pydantic-ai message types with fallback for older versions.
# ModelMessage is a union alias (ModelRequest | ModelResponse), so the concrete
# request/response types are imported as well for building message history.
try:
    from pydantic_ai.messages import ModelMessage, ModelRequest, ModelResponse, TextPart, UserPromptPart

    _PYDANTIC_AI_MESSAGES_AVAILABLE = True
except ImportError:
    _PYDANTIC_AI_MESSAGES_AVAILABLE = False
    ModelMessage = Any  # type: ignore[assignment, misc]
# Type alias for Gradio multimodal input
MultimodalPostprocess = dict[str, Any] | str
# Import HuggingFace components with graceful fallback
try:
from pydantic_ai.models.huggingface import HuggingFaceModel
from pydantic_ai.providers.huggingface import HuggingFaceProvider
_HUGGINGFACE_AVAILABLE = True
except ImportError:
_HUGGINGFACE_AVAILABLE = False
HuggingFaceModel = None # type: ignore[assignment, misc]
HuggingFaceProvider = None # type: ignore[assignment, misc]
try:
from huggingface_hub import AsyncInferenceClient
_ASYNC_INFERENCE_AVAILABLE = True
except ImportError:
_ASYNC_INFERENCE_AVAILABLE = False
AsyncInferenceClient = None # type: ignore[assignment, misc]
logger = structlog.get_logger()
def configure_orchestrator(
use_mock: bool = False,
mode: str = "simple",
oauth_token: str | None = None,
hf_model: str | None = None,
hf_provider: str | None = None,
graph_mode: str | None = None,
use_graph: bool = True,
web_search_provider: str | None = None,
) -> tuple[Any, str]:
"""
Configure and create the research orchestrator.
Args:
use_mock: Force mock judge handler (for testing)
mode: Orchestrator mode ("simple", "iterative", "deep", "auto", "advanced")
oauth_token: Optional OAuth token from HuggingFace login (takes priority over env vars)
hf_model: Optional HuggingFace model ID (overrides settings)
hf_provider: Optional inference provider (currently not used by HuggingFaceProvider)
graph_mode: Optional graph execution mode
use_graph: Whether to use graph execution
web_search_provider: Optional web search provider ("auto", "serper", "duckduckgo")
Returns:
Tuple of (orchestrator, backend_info_string)
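
    Example (illustrative sketch; assumes an HF token is available via env or OAuth):
        orchestrator, backend = configure_orchestrator(mode="iterative")
        # Events are then streamed via: async for event in orchestrator.run(query)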
"""
from src.tools.search_handler import SearchHandler
from src.tools.web_search_factory import create_web_search_tool
# Create search handler with tools
tools = []
# Add web search tool
web_search_tool = create_web_search_tool(provider=web_search_provider or "auto")
if web_search_tool:
tools.append(web_search_tool)
logger.info("Web search tool added to search handler", provider=web_search_tool.name)
    # Create a default orchestrator config (search timeout, budgets, etc.)
config = OrchestratorConfig()
search_handler = SearchHandler(
tools=tools,
timeout=config.search_timeout,
include_rag=True,
auto_ingest_to_rag=True,
oauth_token=oauth_token,
)
# Create judge (mock, real, or free tier)
judge_handler: JudgeHandler | MockJudgeHandler | HFInferenceJudgeHandler
backend_info = "Unknown"
# 1. Forced Mock (Unit Testing)
if use_mock:
judge_handler = MockJudgeHandler()
backend_info = "Mock (Testing)"
# 2. API Key (OAuth or Env) - HuggingFace only (OAuth provides HF token)
# Priority: oauth_token > env vars
# On HuggingFace Spaces, OAuth token is available via request.oauth_token
#
# OAuth Scope Requirements:
# - 'inference-api': Required for HuggingFace Inference API access
# This scope grants access to:
# * HuggingFace's own Inference API
# * All third-party inference providers (nebius, together, scaleway, hyperbolic, novita, nscale, sambanova, ovh, fireworks, etc.)
# * All models available through the Inference Providers API
# See: https://huggingface.co/docs/hub/oauth#currently-supported-scopes
#
# Note: The hf_provider parameter is accepted but not used here because HuggingFaceProvider
# from pydantic-ai doesn't support provider selection. Provider selection happens at the
# InferenceClient level (used in HuggingFaceChatClient for advanced mode).
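    # Client-level provider selection would look roughly like this sketch (assumes
    # a recent huggingface_hub): AsyncInferenceClient(provider="nebius", api_key=...)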
effective_api_key = oauth_token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")
# Log which authentication source is being used
if effective_api_key:
auth_source = (
"OAuth token"
if oauth_token
else ("HF_TOKEN env var" if os.getenv("HF_TOKEN") else "HUGGINGFACE_API_KEY env var")
)
logger.info(
"Using HuggingFace authentication",
source=auth_source,
has_token=bool(effective_api_key),
)
if effective_api_key:
# We have an API key (OAuth or env) - use pydantic-ai with JudgeHandler
# This uses HuggingFace Inference API, which includes access to all third-party providers
# via the Inference Providers API (router.huggingface.co)
model: Any | None = None
# Use selected model or fall back to env var/settings
model_name = (
hf_model
or os.getenv("HF_MODEL")
or settings.huggingface_model
or "Qwen/Qwen3-Next-80B-A3B-Thinking"
)
if not _HUGGINGFACE_AVAILABLE:
raise ImportError(
"HuggingFace models are not available in this version of pydantic-ai. "
"Please install with: uv add 'pydantic-ai[huggingface]' to use HuggingFace inference providers."
)
# Inference API - uses HuggingFace Inference API
# Per https://ai.pydantic.dev/models/huggingface/#configure-the-provider
# HuggingFaceProvider accepts api_key parameter directly
# This is consistent with usage in src/utils/llm_factory.py and src/agent_factory/judges.py
# The OAuth token with 'inference-api' scope provides access to all inference providers
provider = HuggingFaceProvider(api_key=effective_api_key) # type: ignore[misc]
model = HuggingFaceModel(model_name, provider=provider) # type: ignore[misc]
backend_info = "API (HuggingFace OAuth)" if oauth_token else "API (Env Config)"
judge_handler = JudgeHandler(model=model)
# 3. Free Tier (HuggingFace Inference) - NO API KEY AVAILABLE
else:
        # No API key anywhere: OAuth and env vars were both checked above, so fall
        # back to HFInferenceJudgeHandler with public models only.
        judge_handler = HFInferenceJudgeHandler(
            model_id=hf_model if hf_model else None,
            api_key=None,  # No token available at this point; public models only
        )
model_display = hf_model.split("/")[-1] if hf_model else "Default (Public Models)"
backend_info = f"Free Tier ({model_display} - Public Models Only)"
# Determine effective mode
# If mode is already iterative/deep/auto, use it directly
# If mode is "graph" or "simple", use graph_mode if provided
effective_mode = mode
if mode in ("graph", "simple") and graph_mode:
effective_mode = graph_mode
elif mode == "graph" and not graph_mode:
effective_mode = "auto" # Default to auto if graph mode but no graph_mode specified
orchestrator = create_orchestrator(
search_handler=search_handler,
judge_handler=judge_handler,
config=config,
mode=effective_mode, # type: ignore
oauth_token=oauth_token,
)
return orchestrator, backend_info
def _is_file_path(text: str) -> bool:
"""Check if text appears to be a file path.
Args:
text: Text to check
Returns:
True if text looks like a file path
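
    Examples (behavior of the heuristic as implemented):
        >>> _is_file_path("uploads/chart.png")
        True
        >>> _is_file_path("what is CRISPR?")
        False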
"""
return ("/" in text or "\\" in text) and (
"." in text.split("/")[-1] or "." in text.split("\\")[-1]
)
def event_to_chat_message(event: AgentEvent) -> dict[str, Any]:
"""Convert AgentEvent to Gradio chat message format.
Args:
event: AgentEvent to convert
Returns:
Dictionary with 'role' and 'content' keys for Gradio Chatbot
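
    Example (illustrative; the markdown body depends on the event type):
        >>> msg = event_to_chat_message(event)  # event: AgentEvent
        >>> msg["role"]
        'assistant'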
"""
result: dict[str, Any] = {
"role": "assistant",
"content": event.to_markdown(),
}
# Add metadata if available
if event.data:
metadata: dict[str, Any] = {}
# Extract file path if present
if isinstance(event.data, dict):
file_path = event.data.get("file_path")
if file_path:
metadata["file_path"] = file_path
if metadata:
result["metadata"] = metadata
return result
def extract_oauth_info(request: gr.Request | None) -> tuple[str | None, str | None]:
"""
Extract OAuth token and username from Gradio request.
Args:
request: Gradio request object containing OAuth information
Returns:
Tuple of (oauth_token, oauth_username)
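
    Example (illustrative; request comes from a Gradio event handler):
        token, username = extract_oauth_info(request)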
"""
oauth_token: str | None = None
oauth_username: str | None = None
if request is None:
return oauth_token, oauth_username
# Try multiple ways to access OAuth token (Gradio API may vary)
# Pattern 1: request.oauth_token.token
if hasattr(request, "oauth_token") and request.oauth_token is not None:
if hasattr(request.oauth_token, "token"):
oauth_token = request.oauth_token.token
elif isinstance(request.oauth_token, str):
oauth_token = request.oauth_token
# Pattern 2: request.headers (fallback)
elif hasattr(request, "headers"):
# OAuth token might be in headers
auth_header = request.headers.get("authorization") or request.headers.get("Authorization")
if auth_header and auth_header.startswith("Bearer "):
            oauth_token = auth_header.removeprefix("Bearer ")
# Access username from request
if hasattr(request, "username") and request.username:
oauth_username = request.username
# Also try accessing via oauth_profile if available
elif hasattr(request, "oauth_profile") and request.oauth_profile is not None:
if hasattr(request.oauth_profile, "username") and request.oauth_profile.username:
oauth_username = request.oauth_profile.username
elif hasattr(request.oauth_profile, "name") and request.oauth_profile.name:
oauth_username = request.oauth_profile.name
return oauth_token, oauth_username
async def yield_auth_messages(
oauth_username: str | None,
oauth_token: str | None,
has_huggingface: bool,
mode: str,
) -> AsyncGenerator[dict[str, Any], None]:
"""
Yield authentication status messages.
Args:
oauth_username: OAuth username if available
oauth_token: OAuth token if available
has_huggingface: Whether HuggingFace authentication is available
mode: Research mode
Yields:
Chat message dictionaries
"""
if oauth_username:
yield {
"role": "assistant",
"content": f"πŸ‘‹ **Welcome, {oauth_username}!**\n\nAuthenticated via HuggingFace OAuth.",
}
if oauth_token:
yield {
"role": "assistant",
"content": (
"πŸ” **Authentication Status**: βœ… Authenticated\n\n"
"Your OAuth token has been validated. You can now use all AI models and research tools."
),
}
elif has_huggingface:
yield {
"role": "assistant",
"content": (
"πŸ” **Authentication Status**: βœ… Using environment token\n\n"
"Using HF_TOKEN from environment variables."
),
}
else:
yield {
"role": "assistant",
"content": (
"⚠️ **Authentication Status**: ❌ No authentication\n\n"
"Please sign in with HuggingFace or set HF_TOKEN environment variable."
),
}
yield {
"role": "assistant",
"content": f"πŸš€ **Mode**: {mode.upper()}\n\nStarting research agent...",
}
def _extract_oauth_token(oauth_token: gr.OAuthToken | None) -> str | None:
"""Extract token value from OAuth token object."""
if oauth_token is None:
return None
if hasattr(oauth_token, "token"):
token_value: str | None = getattr(oauth_token, "token", None) # type: ignore[assignment]
if token_value is None:
return None
logger.debug("OAuth token extracted from oauth_token.token attribute")
# Validate token format
from src.utils.hf_error_handler import log_token_info, validate_hf_token
log_token_info(token_value, context="research_agent")
is_valid, error_msg = validate_hf_token(token_value)
if not is_valid:
logger.warning(
"OAuth token validation failed",
error=error_msg,
oauth_token_type=type(oauth_token).__name__,
)
return token_value
    if isinstance(oauth_token, str):
        logger.debug("OAuth token extracted as string")
        # Validate token format (mirrors the attribute branch above)
        from src.utils.hf_error_handler import log_token_info, validate_hf_token

        log_token_info(oauth_token, context="research_agent")
        is_valid, error_msg = validate_hf_token(oauth_token)
        if not is_valid:
            logger.warning("OAuth token validation failed", error=error_msg)
        return oauth_token
logger.warning(
"OAuth token object present but token extraction failed",
oauth_token_type=type(oauth_token).__name__,
)
return None
def _extract_username(oauth_profile: gr.OAuthProfile | None) -> str | None:
"""Extract username from OAuth profile."""
if oauth_profile is None:
return None
username: str | None = None
if hasattr(oauth_profile, "username") and oauth_profile.username:
username = str(oauth_profile.username)
elif hasattr(oauth_profile, "name") and oauth_profile.name:
username = str(oauth_profile.name)
if username:
logger.info("OAuth user authenticated", username=username)
return username
async def _process_multimodal_input(
message: str | MultimodalPostprocess,
enable_image_input: bool,
enable_audio_input: bool,
token_value: str | None,
) -> tuple[str, tuple[int, np.ndarray[Any, Any]] | None]: # type: ignore[type-arg]
"""Process multimodal input and return processed text and audio data."""
processed_text = ""
audio_input_data: tuple[int, np.ndarray[Any, Any]] | None = None # type: ignore[type-arg]
if isinstance(message, dict):
processed_text = message.get("text", "") or ""
files = message.get("files", []) or []
audio_input_data = message.get("audio") or None
if (files and enable_image_input) or (audio_input_data is not None and enable_audio_input):
try:
multimodal_service = get_multimodal_service()
processed_text = await multimodal_service.process_multimodal_input(
processed_text,
files=files if enable_image_input else [],
audio_input=audio_input_data if enable_audio_input else None,
hf_token=token_value,
prepend_multimodal=True,
)
except Exception as e:
logger.warning("multimodal_processing_failed", error=str(e))
else:
processed_text = str(message) if message else ""
return processed_text, audio_input_data
async def research_agent(
message: str | MultimodalPostprocess,
history: list[dict[str, Any]],
mode: str = "simple",
hf_model: str | None = None,
hf_provider: str | None = None,
graph_mode: str = "auto",
use_graph: bool = True,
enable_image_input: bool = True,
enable_audio_input: bool = True,
web_search_provider: str = "auto",
oauth_token: gr.OAuthToken | None = None,
oauth_profile: gr.OAuthProfile | None = None,
) -> AsyncGenerator[dict[str, Any], None]:
"""
Main research agent function that processes queries and streams results.
Args:
message: User message (text, image, or audio)
history: Conversation history
mode: Orchestrator mode
hf_model: Optional HuggingFace model ID
hf_provider: Optional inference provider
graph_mode: Graph execution mode
use_graph: Whether to use graph execution
enable_image_input: Whether to process image inputs
enable_audio_input: Whether to process audio inputs
web_search_provider: Web search provider selection
oauth_token: Gradio OAuth token (None if user not logged in)
oauth_profile: Gradio OAuth profile (None if user not logged in)
Yields:
Chat message dictionaries
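
    Example (illustrative; must run in an async context with authentication set up):
        async for msg in research_agent("What is CRISPR?", history=[]):
            print(msg["content"])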
"""
# Extract OAuth token and username
token_value = _extract_oauth_token(oauth_token)
username = _extract_username(oauth_profile)
# Check if user is logged in (OAuth token or env var)
# Fallback to env vars for local development or Spaces with HF_TOKEN secret
has_authentication = bool(
token_value or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")
)
if not has_authentication:
yield {
"role": "assistant",
"content": (
"πŸ” **Authentication Required**\n\n"
"Please **sign in with HuggingFace** using the login button at the top of the page "
"before using this application.\n\n"
"The login button is required to access the AI models and research tools."
),
}
return
# Process multimodal input
processed_text, audio_input_data = await _process_multimodal_input(
message, enable_image_input, enable_audio_input, token_value
)
if not processed_text.strip():
yield {
"role": "assistant",
"content": "Please enter a research question or provide an image/audio input.",
}
return
# Check available keys (use token_value instead of oauth_token)
has_huggingface = bool(os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY") or token_value)
    # Adjust mode if needed: "advanced" (multi-agent) currently falls back to the
    # simple orchestration path in this entry point.
    effective_mode = mode
    if mode == "advanced":
        effective_mode = "simple"
# Yield authentication and mode status messages
async for msg in yield_auth_messages(username, token_value, has_huggingface, mode):
yield msg
# Run the agent and stream events
try:
# use_mock=False - let configure_orchestrator decide based on available keys
# It will use: OAuth token > Env vars > HF Inference (free tier)
# Convert empty strings from Textbox to None for defaults
model_id = hf_model if hf_model and hf_model.strip() else None
provider_name = hf_provider if hf_provider and hf_provider.strip() else None
# Log authentication source for debugging
auth_source = (
"OAuth"
if token_value
else (
"Env (HF_TOKEN)"
if os.getenv("HF_TOKEN")
else ("Env (HUGGINGFACE_API_KEY)" if os.getenv("HUGGINGFACE_API_KEY") else "None")
)
)
logger.info(
"Configuring orchestrator",
mode=effective_mode,
auth_source=auth_source,
has_oauth_token=bool(token_value),
model=model_id or "default",
provider=provider_name or "auto",
)
# Convert empty string to None for web_search_provider
web_search_provider_value = (
web_search_provider if web_search_provider and web_search_provider.strip() else None
)
orchestrator, backend_name = configure_orchestrator(
use_mock=False, # Never use mock in production - HF Inference is the free fallback
mode=effective_mode,
oauth_token=token_value, # Use extracted token value - passed to all agents and services
hf_model=model_id, # None will use defaults in configure_orchestrator
hf_provider=provider_name, # None will use defaults in configure_orchestrator
graph_mode=graph_mode if graph_mode else None,
use_graph=use_graph,
web_search_provider=web_search_provider_value, # None will use settings default
)
yield {
"role": "assistant",
"content": f"πŸ”§ **Backend**: {backend_name}\n\nProcessing your query...",
}
        # Convert history to pydantic-ai message objects. ModelMessage is a union
        # alias (ModelRequest | ModelResponse) and cannot be instantiated directly,
        # so build the concrete request/response types per role.
        message_history: list[ModelMessage] = []
        if history and _PYDANTIC_AI_MESSAGES_AVAILABLE:
            for msg in history:
                role = msg.get("role", "user")
                content = msg.get("content", "")
                if isinstance(content, str) and content.strip():
                    if role == "assistant":
                        message_history.append(ModelResponse(parts=[TextPart(content=content)]))
                    else:
                        message_history.append(ModelRequest(parts=[UserPromptPart(content=content)]))
# Run orchestrator and stream events
async for event in orchestrator.run(
processed_text, message_history=message_history if message_history else None
):
chat_msg = event_to_chat_message(event)
yield chat_msg
# Note: Audio output is now handled via on-demand TTS button
# Users click "Generate Audio" button to create TTS for the last response
except Exception as e:
# Return error message without metadata to avoid issues during example caching
# Metadata can cause validation errors when Gradio caches examples
# Gradio Chatbot requires plain text - remove all markdown and special characters
error_msg = str(e).replace("**", "").replace("*", "").replace("`", "")
# Ensure content is a simple string without any special formatting
yield {
"role": "assistant",
"content": f"Error: {error_msg}. Please check your configuration and try again.",
}
async def update_model_provider_dropdowns(
oauth_token: gr.OAuthToken | None = None,
oauth_profile: gr.OAuthProfile | None = None,
) -> tuple[dict[str, Any], dict[str, Any], str]:
"""Update model and provider dropdowns based on OAuth token.
This function is called when OAuth token/profile changes (user logs in/out).
It queries HuggingFace API to get available models and providers.
Args:
oauth_token: Gradio OAuth token
oauth_profile: Gradio OAuth profile
Returns:
Tuple of (model_dropdown_update, provider_dropdown_update, status_message)
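
    Example (illustrative; token is a gr.OAuthToken or a raw token string):
        model_upd, provider_upd, status = await update_model_provider_dropdowns(token)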
"""
from src.utils.hf_model_validator import (
get_available_models,
get_available_providers,
validate_oauth_token,
)
# Extract token value
token_value: str | None = None
if oauth_token is not None:
if hasattr(oauth_token, "token"):
token_value = oauth_token.token
elif isinstance(oauth_token, str):
token_value = oauth_token
# Default values (empty = use default)
default_models = [""]
default_providers = [""]
status_msg = "⚠️ Not authenticated - using default models"
if not token_value:
# No token - return defaults
return (
gr.update(choices=default_models, value=""),
gr.update(choices=default_providers, value=""),
status_msg,
)
try:
# Validate token and get available resources
validation_result = await validate_oauth_token(token_value)
if not validation_result["is_valid"]:
status_msg = (
f"❌ Token validation failed: {validation_result.get('error', 'Unknown error')}"
)
return (
gr.update(choices=default_models, value=""),
gr.update(choices=default_providers, value=""),
status_msg,
)
# Get available models and providers
models = await get_available_models(token=token_value, limit=50)
providers = await get_available_providers(token=token_value)
# Combine with defaults
model_choices = ["", *models[:49]] # Keep first 49 + empty option
provider_choices = providers # Already includes "auto"
username = validation_result.get("username", "User")
# Build status message with warning if scope is missing
scope_warning = ""
if not validation_result["has_inference_api_scope"]:
scope_warning = (
"⚠️ Token may not have 'inference-api' scope - some models may not work\n\n"
)
status_msg = (
f"{scope_warning}βœ… Authenticated as {username}\n\n"
f"πŸ“Š Found {len(models)} available models\n"
f"πŸ”§ Found {len(providers)} available providers"
)
logger.info(
"Updated model/provider dropdowns",
model_count=len(model_choices),
provider_count=len(provider_choices),
username=username,
)
return (
gr.update(choices=model_choices, value=""),
gr.update(choices=provider_choices, value=""),
status_msg,
)
except Exception as e:
logger.error("Failed to update dropdowns", error=str(e))
status_msg = f"⚠️ Failed to load models: {e!s}"
return (
gr.update(choices=default_models, value=""),
gr.update(choices=default_providers, value=""),
status_msg,
)
def create_demo() -> gr.Blocks:
"""
Create the Gradio demo interface with MCP support and OAuth login.
Returns:
Configured Gradio Blocks interface with MCP server and OAuth enabled
"""
with gr.Blocks(title="πŸ”¬ The DETERMINATOR", fill_height=True) as demo:
# Add sidebar with login button and information
# Reference: Working implementation pattern from Gradio docs
with gr.Sidebar():
gr.Markdown("# πŸ” Authentication")
gr.Markdown(
"**Sign in with Hugging Face** to access AI models and research tools.\n\n"
"This application requires authentication to use the inference API."
)
gr.LoginButton("Sign in with Hugging Face")
gr.Markdown("---")
# About Section - Collapsible with details
with gr.Accordion("ℹ️ About", open=False):
gr.Markdown(
"**The DETERMINATOR** - Generalist Deep Research Agent\n\n"
"Stops at nothing until finding precise answers to complex questions.\n\n"
"**How It Works**:\n"
"- πŸ” Multi-source search (Web, PubMed, ClinicalTrials.gov, Europe PMC, RAG)\n"
"- 🧠 Automatic medical knowledge detection\n"
"- πŸ”„ Iterative refinement with search-judge loops\n"
"- ⏹️ Continues until budget/time/iteration limits\n"
"- πŸ“Š Evidence synthesis with citations\n\n"
"**Multimodal Input**:\n"
"- πŸ“· **Images**: Click image icon in textbox (OCR)\n"
"- 🎀 **Audio**: Click microphone icon (speech-to-text)\n"
"- πŸ“„ **Files**: Drag & drop or click to upload\n\n"
"**MCP Server**: Connect Claude Desktop to `/gradio_api/mcp/`\n\n"
"⚠️ **Research tool only** - Synthesizes evidence but cannot provide medical advice."
)
gr.Markdown("---")
# Settings Section - Organized in Accordions
gr.Markdown("## βš™οΈ Settings")
# Research Configuration Accordion
with gr.Accordion("πŸ”¬ Research Configuration", open=True):
mode_radio = gr.Radio(
choices=["simple", "advanced", "iterative", "deep", "auto"],
value="simple",
label="Orchestrator Mode",
info=(
"Simple: Linear search-judge loop | "
"Advanced: Multi-agent (OpenAI) | "
"Iterative: Knowledge-gap driven | "
"Deep: Parallel sections | "
"Auto: Smart routing"
),
)
graph_mode_radio = gr.Radio(
choices=["iterative", "deep", "auto"],
value="auto",
label="Graph Research Mode",
info="Iterative: Single loop | Deep: Parallel sections | Auto: Detect from query",
)
use_graph_checkbox = gr.Checkbox(
value=True,
label="Use Graph Execution",
info="Enable graph-based workflow execution",
)
# Model and Provider selection
gr.Markdown("### πŸ€– Model & Provider")
# Status message for model/provider loading
model_provider_status = gr.Markdown(
value="⚠️ Sign in to see available models and providers",
visible=True,
)
# Popular models list (will be updated by validator)
popular_models = [
"", # Empty = use default
"Qwen/Qwen3-Next-80B-A3B-Thinking",
"Qwen/Qwen3-235B-A22B-Instruct-2507",
"zai-org/GLM-4.5-Air",
"meta-llama/Llama-3.1-8B-Instruct",
"meta-llama/Llama-3.1-70B-Instruct",
"mistralai/Mistral-7B-Instruct-v0.2",
"google/gemma-2-9b-it",
]
hf_model_dropdown = gr.Dropdown(
choices=popular_models,
value="", # Empty string - will be converted to None in research_agent
label="Reasoning Model",
info="Select a HuggingFace model (leave empty for default). Sign in to see all available models.",
allow_custom_value=True, # Allow users to type custom model IDs
)
# Provider list from README (will be updated by validator)
providers = [
"", # Empty string = auto-select
"nebius",
"together",
"scaleway",
"hyperbolic",
"novita",
"nscale",
"sambanova",
"ovh",
"fireworks",
]
hf_provider_dropdown = gr.Dropdown(
choices=providers,
value="", # Empty string - will be converted to None in research_agent
label="Inference Provider",
info="Select inference provider (leave empty for auto-select). Sign in to see all available providers.",
)
# Refresh button for updating models/providers after login
                async def refresh_models_and_providers(
                    request: gr.Request,
                ) -> tuple[dict[str, Any], dict[str, Any], str]:
                    """Handle refresh button click and update dropdowns.

                    Gradio supports async event handlers, so the async helper is
                    awaited directly instead of driving a private event loop.
                    """
                    # OAuth token/profile attributes may be absent depending on the
                    # Gradio version, hence the guarded lookups.
                    oauth_token: gr.OAuthToken | None = getattr(request, "oauth_token", None)
                    oauth_profile: gr.OAuthProfile | None = getattr(request, "oauth_profile", None)
                    return await update_model_provider_dropdowns(oauth_token, oauth_profile)
refresh_models_btn = gr.Button(
value="πŸ”„ Refresh Available Models",
visible=True,
size="sm",
)
# Pass request to get OAuth token from Gradio context
refresh_models_btn.click(
fn=refresh_models_and_providers,
inputs=[], # Request is automatically available in Gradio context
outputs=[hf_model_dropdown, hf_provider_dropdown, model_provider_status],
)
# Web Search Provider selection
gr.Markdown("### πŸ” Web Search Provider")
# Available providers with labels indicating availability
# Format: (display_label, value) - Gradio Dropdown supports tuples
web_search_provider_options = [
("Auto-detect (Recommended)", "auto"),
("Serper (Google Search + Full Content)", "serper"),
("DuckDuckGo (Free, Snippets Only)", "duckduckgo"),
("SearchXNG (Self-hosted) - Coming Soon", "searchxng"), # Not fully implemented
("Brave - Coming Soon", "brave"), # Not implemented
("Tavily - Coming Soon", "tavily"), # Not implemented
]
# Create Dropdown with label-value pairs
# Gradio will display labels but return values
# Disabled options are marked with "Coming Soon" in the label
# The factory will handle "not implemented" cases gracefully
web_search_provider_dropdown = gr.Dropdown(
choices=web_search_provider_options,
value="auto",
label="Web Search Provider",
info="Select web search provider. 'Auto' detects best available.",
)
# Multimodal Input Configuration
gr.Markdown("### πŸ“·πŸŽ€ Multimodal Input")
enable_image_input_checkbox = gr.Checkbox(
value=settings.enable_image_input,
label="Enable Image Input (OCR)",
info="Process uploaded images with OCR",
)
enable_audio_input_checkbox = gr.Checkbox(
value=settings.enable_audio_input,
label="Enable Audio Input (STT)",
info="Process uploaded/recorded audio with speech-to-text",
)
# Audio Output Configuration - Collapsible
with gr.Accordion("πŸ”Š Audio Output (TTS)", open=False):
gr.Markdown(
"**Generate audio for research responses on-demand.**\n\n"
"Enter Modal keys below or set `MODAL_TOKEN_ID`/`MODAL_TOKEN_SECRET` in `.env` for local development."
)
with gr.Accordion("πŸ”‘ Modal Credentials (Optional)", open=False):
modal_token_id_input = gr.Textbox(
label="Modal Token ID",
placeholder="ak-... (leave empty to use .env)",
type="password",
value="",
)
modal_token_secret_input = gr.Textbox(
label="Modal Token Secret",
placeholder="as-... (leave empty to use .env)",
type="password",
value="",
)
with gr.Accordion("🎚️ Voice & Quality Settings", open=False):
tts_voice_dropdown = gr.Dropdown(
choices=[
"af_heart",
"af_bella",
"af_sarah",
"af_sky",
"af_nova",
"af_shimmer",
"af_echo",
"af_fable",
"af_onyx",
"af_angel",
"af_asteria",
"af_jessica",
"af_elli",
"af_domi",
"af_gigi",
"af_freya",
"af_glinda",
"af_cora",
"af_serena",
"af_liv",
"af_naomi",
"af_rachel",
"af_antoni",
"af_thomas",
"af_charlie",
"af_emily",
"af_george",
"af_arnold",
"af_adam",
"af_sam",
"af_paul",
"af_josh",
"af_daniel",
"af_liam",
"af_dave",
"af_fin",
"af_sarah",
"af_glinda",
"af_grace",
"af_dorothy",
"af_michael",
"af_james",
"af_joseph",
"af_jeremy",
"af_ryan",
"af_oliver",
"af_harry",
"af_kyle",
"af_leo",
"af_otto",
"af_owen",
"af_pepper",
"af_phil",
"af_raven",
"af_rocky",
"af_rusty",
"af_serena",
"af_sky",
"af_spark",
"af_stella",
"af_storm",
"af_taylor",
"af_vera",
"af_will",
"af_aria",
"af_ash",
"af_ballad",
"af_bella",
"af_breeze",
"af_cove",
"af_dusk",
"af_ember",
"af_flash",
"af_flow",
"af_glow",
"af_harmony",
"af_journey",
"af_lullaby",
"af_lyra",
"af_melody",
"af_midnight",
"af_moon",
"af_muse",
"af_music",
"af_narrator",
"af_nightingale",
"af_poet",
"af_rain",
"af_redwood",
"af_rewind",
"af_river",
"af_sage",
"af_seashore",
"af_shadow",
"af_silver",
"af_song",
"af_starshine",
"af_story",
"af_summer",
"af_sun",
"af_thunder",
"af_tide",
"af_time",
"af_valentino",
"af_verdant",
"af_verse",
"af_vibrant",
"af_vivid",
"af_warmth",
"af_whisper",
"af_wilderness",
"af_willow",
"af_winter",
"af_wit",
"af_witness",
"af_wren",
"af_writer",
"af_zara",
"af_zeus",
"af_ziggy",
"af_zoom",
"af_river",
"am_michael",
"am_fenrir",
"am_puck",
"am_echo",
"am_eric",
"am_liam",
"am_onyx",
"am_santa",
"am_adam",
],
value=settings.tts_voice,
label="TTS Voice",
info="Select TTS voice (American English voices: af_*, am_*)",
)
tts_speed_slider = gr.Slider(
minimum=0.5,
maximum=2.0,
value=settings.tts_speed,
step=0.1,
label="TTS Speech Speed",
info="Adjust TTS speech speed (0.5x to 2.0x)",
)
gr.Dropdown(
choices=["T4", "A10", "A100", "L4", "L40S"],
value=settings.tts_gpu or "T4",
label="TTS GPU Type",
info="Modal GPU type for TTS (T4 is cheapest, A100 is fastest). Note: GPU changes require app restart.",
visible=settings.modal_available,
interactive=False, # GPU type set at function definition time, requires restart
)
tts_use_llm_polish_checkbox = gr.Checkbox(
value=settings.tts_use_llm_polish,
label="Use LLM Polish for Audio",
info="Apply LLM-based final polish to remove remaining formatting artifacts (costs API calls)",
)
tts_generate_button = gr.Button(
"🎡 Generate Audio for Last Response",
variant="primary",
size="lg",
)
tts_status_text = gr.Markdown(
"Click the button above to generate audio for the last research response.",
elem_classes="tts-status",
)
# Audio output component (for TTS response)
audio_output = gr.Audio(
label="πŸ”Š Audio Output",
visible=True,
)
# TTS on-demand generation handler
async def handle_tts_generation(
history: list[dict[str, Any]],
modal_token_id: str,
modal_token_secret: str,
voice: str,
speed: float,
use_llm_polish: bool,
) -> tuple[Any | None, str]:
"""Generate audio on-demand for the last response.
Args:
history: Chat history
modal_token_id: Modal token ID from UI
modal_token_secret: Modal token secret from UI
voice: TTS voice selection
speed: TTS speed
use_llm_polish: Enable LLM polish
Returns:
Tuple of (audio_output, status_message)
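
                Example (illustrative; normally awaited by the Gradio click handler):
                    audio, status = await handle_tts_generation(history, "", "", "af_heart", 1.0, False)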
"""
from src.services.tts_modal import generate_audio_on_demand
                # Get the last assistant message from history. Depending on the
                # Chatbot type, history is either a list of (user, assistant)
                # tuples or a list of {"role": ..., "content": ...} dicts; both
                # formats are handled below.
if not history:
logger.warning("tts_no_history", history=history)
return None, "❌ No messages in history to generate audio for"
# Debug: Log history format
logger.info(
"tts_history_debug",
history_type=type(history).__name__,
history_length=len(history) if isinstance(history, list) else 0,
first_entry_type=type(history[0]).__name__
if isinstance(history, list) and len(history) > 0
else None,
first_entry_sample=str(history[0])[:200]
if isinstance(history, list) and len(history) > 0
else None,
)
# Get the last assistant message (second element of last tuple)
last_message = None
if isinstance(history, list) and len(history) > 0:
last_entry = history[-1]
# ChatInterface format: (user_message, assistant_message)
if isinstance(last_entry, (tuple, list)) and len(last_entry) >= 2:
last_message = last_entry[1]
logger.info(
"tts_extracted_from_tuple", message_type=type(last_message).__name__
)
# Dict format: {"role": "assistant", "content": "..."}
elif isinstance(last_entry, dict):
if last_entry.get("role") == "assistant":
content = last_entry.get("content", "")
# Content might be a list (multimodal) or string
if isinstance(content, list):
# Extract text from multimodal content list
last_message = " ".join(str(item) for item in content if item)
else:
last_message = content
logger.info(
"tts_extracted_from_dict",
message_type=type(content).__name__,
message_length=len(last_message)
if isinstance(last_message, str)
else 0,
)
else:
logger.warning(
"tts_unknown_format",
entry_type=type(last_entry).__name__,
entry=str(last_entry)[:200],
)
# Also handle if last_message itself is a list
if isinstance(last_message, list):
last_message = " ".join(str(item) for item in last_message if item)
if not last_message or not isinstance(last_message, str) or not last_message.strip():
logger.error(
"tts_no_message_found",
last_message_type=type(last_message).__name__ if last_message else None,
last_message_value=str(last_message)[:100] if last_message else None,
)
return None, "❌ No assistant response found in history"
# Generate audio
audio_output, status_message = await generate_audio_on_demand(
text=last_message,
modal_token_id=modal_token_id,
modal_token_secret=modal_token_secret,
voice=voice,
speed=speed,
use_llm_polish=use_llm_polish,
)
return audio_output, status_message
# Chat interface with multimodal support
# Examples are provided but will NOT run at startup (cache_examples=False)
# Users must log in first before using examples or submitting queries
chat_interface = gr.ChatInterface(
fn=research_agent,
multimodal=True, # Enable multimodal input (text + images + audio)
title="πŸ”¬ The DETERMINATOR",
description=(
"*Generalist Deep Research Agent β€” stops at nothing until finding precise answers*\n\n"
"πŸ’‘ **Quick Start**: Type your research question below. Use πŸ“· for images, 🎀 for audio.\n\n"
"⚠️ **Sign in with HuggingFace** (sidebar) before starting."
),
examples=[
            # When additional_inputs are provided, examples must be lists of lists.
            # Each inner list maps onto [message, *additional_inputs] in order:
            # [message, mode, hf_model, hf_provider, graph_mode, use_graph];
            # inputs beyond those listed keep their component defaults.
            # Using actual model IDs and provider names from inference_models.py
            # Note: provider is optional - if empty, HF will auto-select
            # These examples will NOT run at startup - users must click them after logging in
            # All examples require deep iterative search and retrieval across multiple sources
[
# Medical research example (only one medical example)
"Create a comprehensive report on Long COVID treatments including clinical trials, mechanisms, and safety.",
"deep",
"zai-org/GLM-4.5-Air",
"nebius",
"deep",
True,
],
[
# Technical/Engineering example requiring deep research
"Analyze the current state of quantum computing architectures: compare different qubit technologies, error correction methods, and scalability challenges across major platforms including IBM, Google, and IonQ.",
"deep",
"Qwen/Qwen3-Next-80B-A3B-Thinking",
"nebius",
"deep",
True,
],
[
# Historical/Social Science example
"Research and synthesize information about the economic impact of the Industrial Revolution on European social structures, including changes in class dynamics, urbanization patterns, and labor movements from 1750-1900.",
"deep",
"meta-llama/Llama-3.1-70B-Instruct",
"together",
"deep",
True,
],
[
# Scientific/Physics example
"Investigate the latest developments in fusion energy research: compare ITER, SPARC, and other major projects, analyze recent breakthroughs in plasma confinement, and assess the timeline to commercial fusion power.",
"deep",
"Qwen/Qwen3-235B-A22B-Instruct-2507",
"hyperbolic",
"deep",
True,
],
[
# Technology/Business example
"Research the competitive landscape of AI chip manufacturers: analyze NVIDIA, AMD, Intel, and emerging players, compare architectures (GPU vs. TPU vs. NPU), and assess market positioning and future trends.",
"deep",
"zai-org/GLM-4.5-Air",
"fireworks",
"deep",
True,
],
],
additional_inputs=[
mode_radio,
hf_model_dropdown,
hf_provider_dropdown,
graph_mode_radio,
use_graph_checkbox,
enable_image_input_checkbox,
enable_audio_input_checkbox,
web_search_provider_dropdown,
# Note: gr.OAuthToken and gr.OAuthProfile are automatically passed as function parameters
],
cache_examples=False, # Don't cache examples - requires authentication
)
# Wire up TTS generation button
tts_generate_button.click(
fn=handle_tts_generation,
inputs=[
chat_interface.chatbot, # Get chat history from ChatInterface
modal_token_id_input,
modal_token_secret_input,
tts_voice_dropdown,
tts_speed_slider,
tts_use_llm_polish_checkbox,
],
outputs=[audio_output, tts_status_text],
)
return demo # type: ignore[no-any-return]
if __name__ == "__main__":
demo = create_demo()
    # mcp_server=True exposes the Gradio tools at /gradio_api/mcp/ (see sidebar note)
    demo.launch(server_name="0.0.0.0", server_port=7860, mcp_server=True)