"""
Stable Audio Open Gradio Inference App for HuggingFace Spaces
This app provides a simple interface for generating high-quality instrumental music
using Stable Audio Open with the SAO-Instrumental-Finetune model.
Designed to be used as a remote computation tool for WeaveMuse.
Architecture:
- Stable Audio model is loaded OUTSIDE the GPU-decorated function
- Only the inference itself runs on GPU (cost-efficient for HF Spaces Zero GPU)
- Model initialization happens once at startup
"""
import json
import os
import uuid

import gradio as gr
import numpy as np
import spaces
import torch
import torchaudio
from einops import rearrange
from huggingface_hub import hf_hub_download

# Model-related imports from stable-audio-tools
from stable_audio_tools.inference.generation import generate_diffusion_cond
from stable_audio_tools.models.factory import create_model_from_config
from stable_audio_tools.models.utils import load_ckpt_state_dict
def get_pretrained_model(name="santifiorino/SAO-Instrumental-Finetune"):
    """Download the model config and weights from the Hugging Face Hub and build the model."""
    model_config_path = hf_hub_download(name, filename="model_config.json", repo_type="model")
    with open(model_config_path) as f:
        model_config = json.load(f)
    model = create_model_from_config(model_config)
    # Prefer the safetensors weights; fall back to the fine-tune's .ckpt file if they are unavailable
    try:
        model_ckpt_path = hf_hub_download(name, filename="model.safetensors", repo_type="model")
    except Exception:
        model_ckpt_path = hf_hub_download(name, filename="SAO_Instrumental_Finetune.ckpt", repo_type="model")
    model.load_state_dict(load_ckpt_state_dict(model_ckpt_path))
    return model, model_config
# Load the model outside of the GPU-decorated function
def load_model():
    """
    Load the Stable Audio model outside the GPU function.
    This is called once at startup to download and cache the model.
    """
    print("Loading model...")
    model, model_config = get_pretrained_model("santifiorino/SAO-Instrumental-Finetune")
    print("Model loaded successfully.")
    return model, model_config
# --- load once, keep global (don’t reload inside GPU fn) ---
model, model_config = load_model()
model = model.to("cuda")
SAMPLE_RATE = model_config["sample_rate"]
SAMPLE_SIZE = model_config["sample_size"]
@spaces.GPU()
def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7):
    """
    Generate audio for a text prompt and return (sample_rate, waveform),
    so the API returns raw audio rather than a file path.
    """
    conditioning = [{
        "prompt": prompt,
        "seconds_start": 0,
        "seconds_total": seconds_total,
    }]
    audio = generate_diffusion_cond(
        model,
        steps=steps,
        cfg_scale=cfg_scale,
        conditioning=conditioning,
        sample_size=SAMPLE_SIZE,
        sigma_min=0.3,
        sigma_max=500,
        sampler_type="dpmpp-3m-sde",
        device="cuda",
    )
    # [B, C, N] -> [C, B*N]: concatenate the batch along time, keeping channels
    audio = rearrange(audio, "b c n -> c (b n)")
    audio = audio.to(torch.float32)
    audio = audio / (audio.abs().max() + 1e-12)            # peak normalize to [-1, 1]
    audio = (audio.clamp(-1, 1) * 32767).to(torch.int16)   # convert to 16-bit PCM
    audio_np = audio.cpu().numpy().T                       # (T, C) as expected by gr.Audio
    return SAMPLE_RATE, audio_np
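
# Illustrative sketch (not used by the Space): how a local caller might handle the
# (sample_rate, np.int16 array of shape (T, C)) tuple returned by generate_audio().
# Assumes a CUDA device is available locally and reuses the torchaudio import above.
#
#     sr, audio_np = generate_audio("Acoustic guitar fingerpicking", seconds_total=10)
#     waveform = torch.from_numpy(audio_np.T).to(torch.float32) / 32767.0  # (C, T) in [-1, 1]
#     torchaudio.save("preview.wav", waveform, sr)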
# Wire generate_audio directly into the interface; the output stays raw numpy audio (no file paths).
interface = gr.Interface(
    fn=generate_audio,
    inputs=[
        gr.Textbox(
            label="Prompt",
            placeholder="Describe the instrumental music...",
            value="Upbeat rock guitar with drums and bass",
        ),
        gr.Slider(0, 47, value=30, label="Duration in Seconds"),
        gr.Slider(10, 150, value=100, step=10, label="Number of Diffusion Steps"),
        gr.Slider(1, 15, value=7, step=0.1, label="CFG Scale"),
    ],
    outputs=gr.Audio(type="numpy", format="wav", label="Generated Music"),
    api_name="generate",  # the API endpoint will be /generate (the default is /predict)
    title="🎸 Stable Audio Instrumental Generator",
description="""
Generate high-quality instrumental music at 44.1kHz from text prompts using the SAO-Instrumental-Finetune model.
**Features:**
- 🎹 Piano, guitar, drums, bass, and orchestral instruments
- 🎵 Various musical genres and styles
- ⚡ High-quality stereo audio
- 🎼 Perfect for music composition and production
**Tips:**
- Be specific about instruments, tempo, and mood
- Higher steps = better quality (recommended: 100-120)
- CFG Scale 7-10 works well for most prompts
""",
    examples=[
        ["Energetic rock guitar riff with powerful drums and bass", 30, 100, 7],
        ["Smooth jazz piano trio with upright bass and brushed drums", 35, 110, 8],
        ["Epic orchestral strings and brass with cinematic percussion", 45, 120, 10],
        ["Funky electric bass groove with rhythm guitar and tight drums", 30, 100, 7],
        ["Acoustic guitar fingerpicking with soft percussion", 40, 110, 6],
        ["Electronic synthesizer pads with ambient textures and subtle beats", 35, 100, 7.5],
        ["Classical piano solo with expressive dynamics and sustain pedal", 30, 110, 8],
        ["Blues guitar solo with bending notes over a shuffle rhythm section", 30, 100, 7],
        ["Latin percussion ensemble with congas, bongos, and timbales", 30, 100, 7],
        ["Rock beat played in a treated studio, session drumming on an acoustic kit", 30, 100, 7],
    ],
article="""
---
### About SAO-Instrumental-Finetune
This model is a fine-tuned version of **Stable Audio Open 1.0** specifically trained for instrumental music generation.
**Capabilities:**
- 🎸 **Guitar**: Acoustic, electric, classical, jazz, rock
- 🥁 **Drums**: Rock, jazz, electronic, orchestral percussion
- 🎹 **Piano**: Classical, jazz, modern, ambient
- � **Orchestral**: Strings, brass, woodwinds
- � **Other**: Bass, synthesizers, ethnic instruments
**Technical Details:**
- Model: SAO-Instrumental-Finetune (based on Stable Audio Open 1.0)
- Sample Rate: 44.1kHz (CD quality)
- Max Duration: 47 seconds
- Architecture: Latent diffusion model with conditioning
**Integration:**
This space is designed to work with **WeaveMuse** for AI-assisted music composition.
Use the API endpoint for programmatic access in your music production workflows.
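For example, the endpoint could be called with `gradio_client` (illustrative sketch; the Space id below is a placeholder, replace it with this Space's actual `user/space` name):

```python
from gradio_client import Client

client = Client("user/space-name")  # placeholder Space id
result = client.predict(
    "Upbeat rock guitar with drums and bass",  # prompt
    30,    # duration in seconds
    100,   # diffusion steps
    7,     # CFG scale
    api_name="/generate",
)
# `result` holds the audio returned by the /generate endpoint.
```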

---
*Powered by [Stability AI](https://stability.ai/) and [WeaveMuse](https://github.com/manoskary/weavemuse)*
""",
)
# Launch the Interface
if __name__ == "__main__":
    interface.launch()