""" Stable Audio Open Gradio Inference App for HuggingFace Spaces This app provides a simple interface for generating high-quality instrumental music using Stable Audio Open with the SAO-Instrumental-Finetune model. Designed to be used as a remote computation tool for WeaveMuse. Architecture: - Stable Audio model is loaded OUTSIDE the GPU-decorated function - Only the inference itself runs on GPU (cost-efficient for HF Spaces Zero GPU) - Model initialization happens once at startup """ import torch import torchaudio from einops import rearrange import gradio as gr import spaces import os import uuid import numpy as np # Importing the model-related functions from stable_audio_tools.inference.generation import generate_diffusion_cond import json from stable_audio_tools.models.factory import create_model_from_config from stable_audio_tools.models.utils import load_ckpt_state_dict from huggingface_hub import hf_hub_download def get_pretrained_model(name="santifiorino/SAO-Instrumental-Finetune"): model_config_path = hf_hub_download(name, filename="model_config.json", repo_type='model') with open(model_config_path) as f: model_config = json.load(f) model = create_model_from_config(model_config) # Try to download the model.safetensors file first, if it doesn't exist, download the model.ckpt file try: model_ckpt_path = hf_hub_download(name, filename="model.safetensors", repo_type='model') except Exception as e: model_ckpt_path = hf_hub_download(name, filename="SAO_Instrumental_Finetune.ckpt", repo_type='model') model.load_state_dict(load_ckpt_state_dict(model_ckpt_path)) return model, model_config # Load the model outside of the GPU-decorated function def load_model(): """ Load the Stable Audio model outside GPU function. This is called once at startup to download and cache the model. """ print("Loading model...") model, model_config = get_pretrained_model("santifiorino/SAO-Instrumental-Finetune") print("Model loaded successfully.") return model, model_config import numpy as np import gradio as gr import spaces import torch from einops import rearrange # --- load once, keep global (don’t reload inside GPU fn) --- model, model_config = load_model() model = model.to("cuda") SAMPLE_RATE = model_config["sample_rate"] SAMPLE_SIZE = model_config["sample_size"] @spaces.GPU() def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7): """ Returns (sample_rate, waveform) so the API returns raw audio, not a file. """ conditioning = [{ "prompt": prompt, "seconds_start": 0, "seconds_total": seconds_total }] audio = generate_diffusion_cond( model, steps=steps, cfg_scale=cfg_scale, conditioning=conditioning, sample_size=SAMPLE_SIZE, sigma_min=0.3, sigma_max=500, sampler_type="dpmpp-3m-sde", device="cuda", ) # [B, C, N] -> [C, B*N] -> [N, C] for Gradio audio = rearrange(audio, "b c n -> c (b n)") # (C, T) audio = audio.to(torch.float32) audio = audio / (audio.abs().max() + 1e-12) # peak normalize audio = (audio.clamp(-1, 1) * 32767).to(torch.int16) # int16 audio_np = audio.cpu().numpy().T # (T, C) return SAMPLE_RATE, audio_np # You no longer need save_audio_to_file() or inf_function() # Wire the function directly and keep output as numpy (no filepaths!) 
interface = gr.Interface(
    fn=generate_audio,
    inputs=[
        gr.Textbox(
            label="Prompt",
            placeholder="Describe the instrumental music...",
            value="Upbeat rock guitar with drums and bass",
        ),
        gr.Slider(0, 47, value=30, label="Duration in Seconds"),
        gr.Slider(10, 150, value=100, step=10, label="Number of Diffusion Steps"),
        gr.Slider(1, 15, value=7, step=0.1, label="CFG Scale"),
    ],
    outputs=gr.Audio(type="numpy", format="wav", label="Generated Music"),
    api_name="generate",  # the API endpoint becomes /generate (default is /predict)
    title="🎸 Stable Audio Instrumental Generator",
    description="""
Generate high-quality instrumental music at 44.1kHz from text prompts using the SAO-Instrumental-Finetune model.

**Features:**
- 🎹 Piano, guitar, drums, bass, and orchestral instruments
- 🎵 Various musical genres and styles
- ⚡ High-quality stereo audio
- 🎼 Perfect for music composition and production

**Tips:**
- Be specific about instruments, tempo, and mood
- Higher steps = better quality (recommended: 100-120)
- CFG Scale 7-10 works well for most prompts
""",
    examples=[
        ["Energetic rock guitar riff with powerful drums and bass", 30, 100, 7],
        ["Smooth jazz piano trio with upright bass and brushed drums", 35, 110, 8],
        ["Epic orchestral strings and brass with cinematic percussion", 45, 120, 10],
        ["Funky electric bass groove with rhythm guitar and tight drums", 30, 100, 7],
        ["Acoustic guitar fingerpicking with soft percussion", 40, 110, 6],
        ["Electronic synthesizer pads with ambient textures and subtle beats", 35, 100, 7.5],
        ["Classical piano solo with expressive dynamics and sustain pedal", 30, 110, 8],
        ["Blues guitar solo with bending notes over a shuffle rhythm section", 30, 100, 7],
        ["Latin percussion ensemble with congas, bongos, and timbales", 30, 100, 7],
        ["Rock beat played in a treated studio, session drumming on an acoustic kit", 30, 100, 7],
    ],
    article="""
---
### About SAO-Instrumental-Finetune

This model is a fine-tuned version of **Stable Audio Open 1.0** specifically trained for instrumental music generation.

**Capabilities:**
- 🎸 **Guitar**: Acoustic, electric, classical, jazz, rock
- 🥁 **Drums**: Rock, jazz, electronic, orchestral percussion
- 🎹 **Piano**: Classical, jazz, modern, ambient
- 🎻 **Orchestral**: Strings, brass, woodwinds
- 🎛️ **Other**: Bass, synthesizers, ethnic instruments

**Technical Details:**
- Model: SAO-Instrumental-Finetune (based on Stable Audio Open 1.0)
- Sample Rate: 44.1kHz (CD quality)
- Max Duration: 47 seconds
- Architecture: Latent diffusion model with text conditioning

**Integration:**
This Space is designed to work with **WeaveMuse** for AI-assisted music composition.
Use the API endpoint for programmatic access in your music production workflows.

---
*Powered by [Stability AI](https://stability.ai/) and [WeaveMuse](https://github.com/manoskary/weavemuse)*
""",
)

# Launch the interface
if __name__ == "__main__":
    interface.launch()
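
# --- Example remote call (sketch for API consumers such as WeaveMuse) ---------
# The interface registers `api_name="generate"`, so external tools can hit the
# `/generate` endpoint with `gradio_client`. The Space id below is a placeholder,
# and for audio outputs gradio_client typically returns a path to a locally
# downloaded WAV file rather than a raw array.
#
#   from gradio_client import Client
#
#   client = Client("your-username/stable-audio-instrumental")  # placeholder Space id
#   wav_path = client.predict(
#       "Smooth jazz piano trio with upright bass and brushed drums",  # prompt
#       30,    # duration in seconds
#       100,   # diffusion steps
#       7,     # CFG scale
#       api_name="/generate",
#   )
#   print("Generated audio saved at:", wav_path)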