Finnish-NLP committed on
Commit
d15775e
·
1 Parent(s): 7f575b6

Improve setup for multi-GPU support and fix inference docs

Browse files

- install_dependencies.sh: auto-detect GPU compute capability and
install PyTorch 2.10.0+cu128 for Blackwell (sm_120+) or
2.5.1+cu124 for older GPUs; add cuDNN library path fix
- requirements.txt: remove torch version pins (managed by install
script), add missing huggingface_hub/requests deps, clarify that
chatterbox TTS is vendored in src/
- devcontainer.json: fix name, fix wrong repo path, add git-lfs pull,
auto-clone from HF if starting from blank container, run full setup
in postCreateCommand
- README.md: add Quick Start section with GPU compat notes; fix
inference example (was using nonexistent ChatterboxMultilingualTTS,
now uses correct ChatterboxTTS from src.chatterbox_.tts)

.devcontainer/devcontainer.json CHANGED
@@ -1,21 +1,26 @@
1
  {
2
- "name": "Chatterbox A100 Optimized",
 
 
 
3
  "image": "unsloth/unsloth:2025.10.1-pt2.8.0-cu12.8-llamacpp-integration",
4
-
5
  "forwardPorts": [8888],
6
-
7
  "containerEnv": {
8
  "JUPYTER_PASSWORD": "MASKED_PASSWORD",
9
- "USER_PASSWORD": "unsloth"
 
 
10
  },
11
-
12
  "runArgs": [
13
  "--gpus=all",
14
  "--shm-size=64gb"
15
  ],
16
-
17
  "remoteUser": "root",
18
-
19
  "customizations": {
20
  "vscode": {
21
  "extensions": [
@@ -26,5 +31,9 @@
26
  }
27
  },
28
 
29
- "postCreateCommand": "apt-get update && apt-get install -y git ffmpeg libsndfile1 && chmod -R 777 /workspaces && cd /workspaces/work/chatterbox-finetuning"
 
 
 
 
30
  }
 
1
  {
2
+ "name": "Chatterbox Finnish TTS",
3
+
4
+ // Unsloth image with CUDA 12.8 — supports Blackwell (sm_120+) and older GPUs.
5
+ // install_dependencies.sh selects the right PyTorch build automatically.
6
  "image": "unsloth/unsloth:2025.10.1-pt2.8.0-cu12.8-llamacpp-integration",
7
+
8
  "forwardPorts": [8888],
9
+
10
  "containerEnv": {
11
  "JUPYTER_PASSWORD": "MASKED_PASSWORD",
12
+ "USER_PASSWORD": "unsloth",
13
+ // Optional: set your HuggingFace token here if the repo is private
14
+ "HF_TOKEN": ""
15
  },
16
+
17
  "runArgs": [
18
  "--gpus=all",
19
  "--shm-size=64gb"
20
  ],
21
+
22
  "remoteUser": "root",
23
+
24
  "customizations": {
25
  "vscode": {
26
  "extensions": [
 
31
  }
32
  },
33
 
34
+ // postCreateCommand handles two cases:
35
+ // A) Standard VS Code / Codespace flow: repo is already cloned, just pull LFS weights
36
+ // B) Blank container (e.g. docker run): clones the full repo from HuggingFace first
37
+ // Then in both cases: install dependencies and download pretrained base models.
38
+ "postCreateCommand": "apt-get update -qq && apt-get install -y git-lfs ffmpeg libsndfile1 && git lfs install && if [ ! -f inference_example.py ]; then git clone https://huggingface.co/Finnish-NLP/Chatterbox-Finnish /workspace/Chatterbox-Finnish && cd /workspace/Chatterbox-Finnish; fi && git lfs pull && bash install_dependencies.sh && python setup.py"
39
  }
README.md CHANGED
@@ -101,27 +101,69 @@ We used `sweep_params.py` to identify the "Golden Settings" for the most natural
101
 
102
  ---
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  ## 🏃 Running Inference
105
 
106
  ```python
107
- from src.chatterbox_.mtl_tts import ChatterboxMultilingualTTS
 
 
 
 
 
108
 
109
- # 1. Load the engine
110
- engine = ChatterboxMultilingualTTS.from_local("./pretrained_models", device="cuda")
111
 
112
- # 2. Inject weights (e.g., best_finnish_multilingual_cp986.safetensors)
113
- # engine.t3.load_state_dict(...)
 
 
114
 
115
  # 3. Generate with Finnish-optimized parameters
116
  wav = engine.generate(
117
- text="Suomen kieli on poikkeuksellisen kaunista kuunneltavaa.",
118
- language_id="fi",
119
- audio_prompt_path="path/to/reference.wav",
120
- repetition_penalty=1.5,
121
  temperature=0.8,
122
- exaggeration=0.5,
123
- cfg_weight=0.3
124
  )
 
 
 
 
 
 
 
 
125
  ```
126
 
127
  ---
 
101
 
102
  ---
103
 
104
+ ## 🚀 Quick Start
105
+
106
+ ### Option A — Dev Container (recommended)
107
+
108
+ Open this repo in VS Code with the [Dev Containers](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) extension. Everything — dependencies, base model weights, GPU detection — is handled automatically by `postCreateCommand`.
109
+
110
+ ### Option B — Manual Setup
111
+
112
+ ```bash
113
+ # 1. Clone (with LFS for model weights)
114
+ git clone https://huggingface.co/Finnish-NLP/Chatterbox-Finnish
115
+ cd Chatterbox-Finnish
116
+
117
+ # 2. Install dependencies (auto-detects your GPU architecture)
118
+ bash install_dependencies.sh
119
+
120
+ # 3. Download pretrained base models from ResembleAI
121
+ python setup.py
122
+
123
+ # 4. Run inference
124
+ python inference_example.py
125
+ ```
126
+
127
+ > **GPU compatibility:** The install script detects your GPU and picks the right PyTorch build automatically:
128
+ > - **Blackwell (sm_120+)** e.g. RTX PRO 6000 → PyTorch 2.10.0 + CUDA 12.8
129
+ > - **Older GPUs (A100, RTX 30/40xx, etc.)** → PyTorch 2.5.1 + CUDA 12.4
130
+
131
+ ---
132
+
133
  ## 🏃 Running Inference
134
 
135
  ```python
136
+ import torch
137
+ import soundfile as sf
138
+ from src.chatterbox_.tts import ChatterboxTTS
139
+ from safetensors.torch import load_file
140
+
141
+ device = "cuda" if torch.cuda.is_available() else "cpu"
142
 
143
+ # 1. Load the base engine
144
+ engine = ChatterboxTTS.from_local("./pretrained_models", device=device)
145
 
146
+ # 2. Inject Finnish fine-tuned weights
147
+ checkpoint = load_file("./models/best_finnish_multilingual_cp986.safetensors")
148
+ t3_state = {k[3:] if k.startswith("t3.") else k: v for k, v in checkpoint.items()}
149
+ engine.t3.load_state_dict(t3_state, strict=False)
150
 
151
  # 3. Generate with Finnish-optimized parameters
152
  wav = engine.generate(
153
+ text="Tervetuloa kokeilemaan hienoviritettyä suomenkielistä Chatterbox-puhesynteesiä.",
154
+ audio_prompt_path="./samples/reference_finnish.wav",
155
+ repetition_penalty=1.2,
 
156
  temperature=0.8,
157
+ exaggeration=0.6,
 
158
  )
159
+
160
+ sf.write("output.wav", wav.squeeze().cpu().numpy(), engine.sr)
161
+ ```
162
+
163
+ Or just run the included example script directly:
164
+
165
+ ```bash
166
+ python inference_example.py # outputs output_finnish.wav
167
  ```
168
 
169
  ---
install_dependencies.sh CHANGED
@@ -1,33 +1,69 @@
1
  #!/bin/bash
2
- # Chatterbox Finetuning - Dependency Installation Script
3
- # This script ensures correct PyTorch and dependency versions are installed
4
 
5
  set -e # Exit on error
6
 
7
  echo "===================================="
8
- echo "Chatterbox Finetuning Setup"
9
  echo "===================================="
10
 
11
  # Check Python version
12
  PYTHON_VERSION=$(python --version 2>&1 | grep -oP '(?<=Python )\d+\.\d+')
13
  echo "Python version: $PYTHON_VERSION"
14
 
15
- # Uninstall conflicting packages if they exist
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  echo ""
17
  echo "Step 1: Removing conflicting packages..."
18
  pip uninstall -y torch torchvision torchaudio xformers flash-attn 2>/dev/null || true
19
 
20
- # Install correct PyTorch version
21
  echo ""
22
- echo "Step 2: Installing PyTorch 2.5.1 with CUDA 12.4..."
23
- pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124
 
 
 
 
24
 
25
  # Install xformers
26
  echo ""
27
- echo "Step 3: Installing xformers..."
28
- pip install xformers==0.0.28.post3 --index-url https://download.pytorch.org/whl/cu124
29
 
30
- # Install torchao (compatible version)
31
  echo ""
32
  echo "Step 4: Installing torchao..."
33
  pip install torchao==0.6.1
@@ -37,6 +73,31 @@ echo ""
37
  echo "Step 5: Installing remaining dependencies..."
38
  pip install -r requirements.txt
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  # Verify installation
41
  echo ""
42
  echo "===================================="
@@ -46,12 +107,14 @@ python -c "
46
  import torch
47
  import xformers
48
  import transformers
49
- print(f'PyTorch: {torch.__version__}')
50
- print(f'xformers: {xformers.__version__}')
51
- print(f'Transformers: {transformers.__version__}')
52
- print(f'CUDA available: {torch.cuda.is_available()}')
53
  if torch.cuda.is_available():
54
- print(f'CUDA version: {torch.version.cuda}')
 
 
55
  "
56
 
57
  echo ""
@@ -60,6 +123,7 @@ echo "Installation complete!"
60
  echo "===================================="
61
  echo ""
62
  echo "Next steps:"
63
- echo "1. Run: python setup.py (to download pretrained models)"
64
- echo "2. Run: python train.py (to start training)"
65
- echo ""
 
 
1
  #!/bin/bash
2
+ # Chatterbox Finnish TTS - Dependency Installation Script
3
+ # Automatically selects the correct PyTorch/CUDA version for your GPU.
4
 
5
  set -e # Exit on error
6
 
7
  echo "===================================="
8
+ echo "Chatterbox Finnish TTS Setup"
9
  echo "===================================="
10
 
11
  # Check Python version
12
  PYTHON_VERSION=$(python --version 2>&1 | grep -oP '(?<=Python )\d+\.\d+')
13
  echo "Python version: $PYTHON_VERSION"
14
 
15
+ # Detect GPU compute capability and select appropriate PyTorch build
16
+ echo ""
17
+ echo "Detecting GPU architecture..."
18
+ IS_BLACKWELL=$(python -c "
19
+ import subprocess
20
+ try:
21
+ r = subprocess.run(
22
+ ['nvidia-smi', '--query-gpu=compute_cap', '--format=csv,noheader'],
23
+ capture_output=True, text=True
24
+ )
25
+ caps = [float(c.strip()) for c in r.stdout.strip().splitlines() if c.strip()]
26
+ print('1' if caps and max(caps) >= 12.0 else '0')
27
+ except Exception:
28
+ print('0')
29
+ " 2>/dev/null || echo "0")
30
+
31
+ if [ "$IS_BLACKWELL" = "1" ]; then
32
+ echo "Blackwell GPU detected (sm_120+) — using PyTorch 2.10.0 + CUDA 12.8"
33
+ TORCH_VERSION="2.10.0"
34
+ TORCHVISION_VERSION="0.25.0"
35
+ TORCHAUDIO_VERSION="2.10.0"
36
+ CUDA_TAG="cu128"
37
+ XFORMERS_VERSION="0.0.35"
38
+ else
39
+ echo "Pre-Blackwell GPU detected — using PyTorch 2.5.1 + CUDA 12.4"
40
+ TORCH_VERSION="2.5.1"
41
+ TORCHVISION_VERSION="0.20.1"
42
+ TORCHAUDIO_VERSION="2.5.1"
43
+ CUDA_TAG="cu124"
44
+ XFORMERS_VERSION="0.0.28.post3"
45
+ fi
46
+
47
+ # Uninstall conflicting packages
48
  echo ""
49
  echo "Step 1: Removing conflicting packages..."
50
  pip uninstall -y torch torchvision torchaudio xformers flash-attn 2>/dev/null || true
51
 
52
+ # Install PyTorch
53
  echo ""
54
+ echo "Step 2: Installing PyTorch ${TORCH_VERSION} with CUDA ${CUDA_TAG}..."
55
+ pip install \
56
+ torch==${TORCH_VERSION} \
57
+ torchvision==${TORCHVISION_VERSION} \
58
+ torchaudio==${TORCHAUDIO_VERSION} \
59
+ --index-url https://download.pytorch.org/whl/${CUDA_TAG}
60
 
61
  # Install xformers
62
  echo ""
63
+ echo "Step 3: Installing xformers ${XFORMERS_VERSION}..."
64
+ pip install xformers==${XFORMERS_VERSION} --index-url https://download.pytorch.org/whl/${CUDA_TAG}
65
 
66
+ # Install torchao (compatible with both PyTorch versions)
67
  echo ""
68
  echo "Step 4: Installing torchao..."
69
  pip install torchao==0.6.1
 
73
  echo "Step 5: Installing remaining dependencies..."
74
  pip install -r requirements.txt
75
 
76
+ # Fix potential cuDNN conflict: ensure PyTorch's bundled cuDNN takes priority
77
+ echo ""
78
+ echo "Step 6: Configuring cuDNN library path..."
79
+ CUDNN_PATH=$(python -c "
80
+ import os
81
+ try:
82
+ import nvidia.cudnn
83
+ print(os.path.join(os.path.dirname(nvidia.cudnn.__file__), 'lib'))
84
+ except Exception:
85
+ print('')
86
+ " 2>/dev/null)
87
+
88
+ if [ -n "$CUDNN_PATH" ] && [ -d "$CUDNN_PATH" ]; then
89
+ PROFILE_LINE="export LD_LIBRARY_PATH=${CUDNN_PATH}:\$LD_LIBRARY_PATH"
90
+ # Add to ~/.bashrc if not already present
91
+ if ! grep -qF "$CUDNN_PATH" ~/.bashrc 2>/dev/null; then
92
+ echo "$PROFILE_LINE" >> ~/.bashrc
93
+ fi
94
+ # Apply for the current session
95
+ export LD_LIBRARY_PATH="${CUDNN_PATH}:${LD_LIBRARY_PATH}"
96
+ echo "cuDNN path set to: $CUDNN_PATH"
97
+ else
98
+ echo "No bundled cuDNN found — skipping."
99
+ fi
100
+
101
  # Verify installation
102
  echo ""
103
  echo "===================================="
 
107
  import torch
108
  import xformers
109
  import transformers
110
+ print(f' PyTorch: {torch.__version__}')
111
+ print(f' xformers: {xformers.__version__}')
112
+ print(f' Transformers: {transformers.__version__}')
113
+ print(f' CUDA available: {torch.cuda.is_available()}')
114
  if torch.cuda.is_available():
115
+ print(f' CUDA version: {torch.version.cuda}')
116
+ props = torch.cuda.get_device_properties(0)
117
+ print(f' GPU: {props.name} (sm_{props.major}{props.minor})')
118
  "
119
 
120
  echo ""
 
123
  echo "===================================="
124
  echo ""
125
  echo "Next steps:"
126
+ echo "1. Run: python setup.py (download pretrained base models)"
127
+ echo "2. Run: python inference_example.py (run Finnish TTS inference)"
128
+ echo "3. Run: python train.py (optional: start fine-tuning)"
129
+ echo ""
requirements.txt CHANGED
@@ -1,22 +1,23 @@
1
- # Core PyTorch - Using 2.5.1 for stable xformers/flash-attn support
2
- --extra-index-url https://download.pytorch.org/whl/cu124
3
- torch==2.5.1
4
- torchaudio==2.5.1
5
- torchvision==0.20.1
6
 
7
- # Core dependencies with pinned versions for stability
8
  transformers==4.46.3
9
- xformers==0.0.28.post3
10
- torchao==0.6.1
11
  diffusers==0.29.0
12
  peft==0.17.1
13
 
14
- # Chatterbox TTS dependencies
15
- # Note: chatterbox-tts itself is installed via install_dependencies.sh --no-deps
16
- # to avoid strict torch==2.6.0 conflict
 
 
 
 
17
  resemble-perth==1.0.1
18
  conformer==0.3.2
19
  s3tokenizer==0.3.0
 
20
 
21
  # Audio processing
22
  silero-vad==6.2.0
@@ -34,3 +35,4 @@ tensorboard
34
  omegaconf
35
  hf_transfer
36
  gdown
 
 
1
+ # PyTorch is installed separately by install_dependencies.sh,
2
+ # which auto-detects your GPU and picks the right CUDA build.
3
+ # Do not pin torch/torchaudio/torchvision here.
 
 
4
 
5
+ # Transformers stack
6
  transformers==4.46.3
 
 
7
  diffusers==0.29.0
8
  peft==0.17.1
9
 
10
+ # xformers is also installed by install_dependencies.sh (version depends on GPU)
11
+
12
+ # torchao
13
+ torchao==0.6.1
14
+
15
+ # Chatterbox TTS source is bundled under src/chatterbox_/.
16
+ # These are the runtime deps it needs:
17
  resemble-perth==1.0.1
18
  conformer==0.3.2
19
  s3tokenizer==0.3.0
20
+ huggingface_hub
21
 
22
  # Audio processing
23
  silero-vad==6.2.0
 
35
  omegaconf
36
  hf_transfer
37
  gdown
38
+ requests