Spaces:

gbibbo
/

vad_demo

Sleeping

App Files Files Community

Gabriel Bibbó commited on Aug 4

Commit

e60e716

1 Parent(s): 7feb5e2

Performance optimization: 3x speed boost with model scheduling, fast resampling, threshold layer fix, and single model loading

Browse files

Files changed (1) hide show

app.py +67 -139

app.py CHANGED Viewed

@@ -52,14 +52,6 @@ except ImportError:
     LIBROSA_AVAILABLE = False
     print("⚠️ Librosa not available, using scipy fallback")
-try:
-    import torchaudio.functional as F_audio
-    TORCHAUDIO_AVAILABLE = True
-    print("✅ Torchaudio available for fast resampling")
-except ImportError:
-    TORCHAUDIO_AVAILABLE = False
-    print("⚠️ Torchaudio not available, using librosa fallback")
 try:
     import webrtcvad
     WEBRTC_AVAILABLE = True
@@ -226,7 +218,6 @@ class OptimizedEPANNs:
     def __init__(self):
         self.model_name = "E-PANNs"
         self.sample_rate = 32000
-        self.processor = AudioProcessor()  # For fast resampling
         print(f"✅ {self.model_name} initialized")
     def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
@@ -239,10 +230,13 @@ class OptimizedEPANNs:
             if len(audio.shape) > 1:
                 audio = audio.mean(axis=1)
-            # Fast resampling to E-PANNs sample rate
-            audio_resampled = self.processor.fast_resample(audio, 16000, self.sample_rate)
             if LIBROSA_AVAILABLE:
                 mel_spec = librosa.feature.melspectrogram(y=audio_resampled, sr=self.sample_rate, n_mels=64)
                 energy = np.mean(librosa.power_to_db(mel_spec, ref=np.max))
                 spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio_resampled, sr=self.sample_rate))
@@ -277,7 +271,6 @@ class OptimizedPANNs:
         self.sample_rate = 32000
         self.model = None
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        self.processor = AudioProcessor()  # For fast resampling
         self.load_model()
     def load_model(self):
@@ -310,8 +303,19 @@ class OptimizedPANNs:
             if len(audio.shape) > 1:
                 audio = audio.mean(axis=1)
-            # Fast resampling to PANNs sample rate
-            audio_resampled = self.processor.fast_resample(audio, 16000, self.sample_rate)
             # Ensure minimum length for PANNs (need at least 1 second)
             min_samples = self.sample_rate  # 1 second
@@ -358,8 +362,6 @@ class OptimizedAST:
         self.model = None
         self.feature_extractor = None
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        # Cache per second (not per tiny chunk)
-        self.second_cache = {}
         self.load_model()
     def load_model(self):
@@ -382,49 +384,53 @@ class OptimizedAST:
         start_time = time.time()
         if self.model is None or len(audio) == 0:
-            # Enhanced energy-based fallback
             if len(audio) > 0:
                 energy = np.sum(audio ** 2)
-                probability = min(energy * 100, 1.0)  # More aggressive scaling
-                is_speech = probability > 0.2
             else:
                 probability = 0.0
                 is_speech = False
             return VADResult(probability, is_speech, f"{self.model_name} (fallback)", time.time() - start_time, timestamp)
         try:
-            # Cache by second to avoid repeated computation
-            cache_key = int(timestamp)
-            if cache_key in self.second_cache:
-                speech_prob = self.second_cache[cache_key]
-                return VADResult(float(speech_prob), speech_prob > 0.4, self.model_name, time.time()-start_time, timestamp)
             if len(audio.shape) > 1:
                 audio = audio.mean(axis=1)
-            # Use 2-second window from full audio for better context
-            if full_audio is not None and len(full_audio) >= 2 * self.sample_rate:
                 center_pos = int(timestamp * self.sample_rate)
-                window_size = self.sample_rate  # 1 second each side
                 start_pos = max(0, center_pos - window_size)
                 end_pos = min(len(full_audio), center_pos + window_size)
-                # Ensure minimum 2 seconds
-                if end_pos - start_pos < 2 * self.sample_rate:
-                    end_pos = min(len(full_audio), start_pos + 2 * self.sample_rate)
                 audio_for_ast = full_audio[start_pos:end_pos]
             else:
-                # Pad to 2 seconds minimum
-                audio_for_ast = np.pad(audio, (0, max(0, 2 * self.sample_rate - len(audio))), 'constant')
             # Feature extraction with proper AST parameters
             inputs = self.feature_extractor(
                 audio_for_ast,
                 sampling_rate=self.sample_rate,
                 return_tensors="pt",
-                max_length=1024,  # Proper AST context length
                 truncation=True
             )
@@ -446,34 +452,23 @@ class OptimizedAST:
             if speech_indices:
                 speech_prob = probs[0, speech_indices].mean().item()
-                # Boost low probabilities if there's clear audio content
-                audio_energy = np.sum(audio_for_ast ** 2)
-                if speech_prob < 0.2 and audio_energy > 0.01:
-                    speech_prob = min(speech_prob * 3 + audio_energy * 10, 0.9)
             else:
-                # Energy-based fallback
-                audio_energy = np.sum(audio_for_ast ** 2)
-                speech_prob = min(audio_energy * 20, 1.0)
-            # Cache for efficiency (limit cache size)
-            if len(self.second_cache) < 200:
-                self.second_cache[cache_key] = speech_prob
-            elif len(self.second_cache) >= 300:
-                # Clear old entries
-                oldest_keys = sorted(self.second_cache.keys())[:100]
-                for k in oldest_keys:
-                    del self.second_cache[k]
             return VADResult(float(speech_prob), speech_prob > 0.4, self.model_name, time.time()-start_time, timestamp)
         except Exception as e:
             print(f"Error in {self.model_name}: {e}")
-            # Robust fallback
             if len(audio) > 0:
                 energy = np.sum(audio ** 2)
-                probability = min(energy * 50, 1.0)
-                is_speech = energy > 0.01
             else:
                 probability = 0.0
                 is_speech = False
@@ -496,43 +491,9 @@ class AudioProcessor:
         self.window_size = 0.064
         self.hop_size = 0.032
-        # Model-specific hop rates for efficiency
-        self.model_hop_rates = {
-            'Silero-VAD': 0.032,    # 32ms - optimal for this model
-            'WebRTC-VAD': 0.030,    # 30ms - WebRTC frame size
-            'PANNs': 1.0,          # 1s - CNN needs longer context
-            'E-PANNs': 1.0,        # 1s - CNN needs longer context
-            'AST': 1.0             # 1s - Transformer needs long context
-        }
         self.delay_compensation = 0.0
         self.correlation_threshold = 0.7
-    def fast_resample(self, audio, orig_sr, target_sr):
-        """Fast resampling using torchaudio if available, fallback to librosa"""
-        if TORCHAUDIO_AVAILABLE and orig_sr != target_sr:
-            audio_tensor = torch.from_numpy(audio.astype(np.float32))
-            resampled = F_audio.resample(audio_tensor, orig_sr, target_sr)
-            return resampled.numpy()
-        elif LIBROSA_AVAILABLE and orig_sr != target_sr:
-            return librosa.resample(audio.astype(float), orig_sr=orig_sr, target_sr=target_sr)
-        else:
-            return audio
-    def robust_normalize(self, audio_data):
-        """RMS-based normalization instead of peak normalization"""
-        if len(audio_data) == 0:
-            return audio_data
-        # RMS normalization - more robust than peak
-        rms = np.sqrt(np.mean(audio_data ** 2) + 1e-8)
-        if rms > 1e-6:
-            audio_data = audio_data / (rms * 3)  # Scale by 3x RMS
-        # Gentle clipping
-        audio_data = np.clip(audio_data, -1.0, 1.0)
-        return audio_data
     def process_audio(self, audio):
         if audio is None:
             return np.array([])
@@ -540,15 +501,18 @@ class AudioProcessor:
         try:
             if isinstance(audio, tuple):
                 sample_rate, audio_data = audio
-                audio_data = self.fast_resample(audio_data, sample_rate, self.sample_rate)
             else:
                 audio_data = audio
             if len(audio_data.shape) > 1:
                 audio_data = audio_data.mean(axis=1)
-            # Use robust RMS normalization
-            audio_data = self.robust_normalize(audio_data)
             return audio_data
@@ -776,11 +740,10 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
         )
         if len(time_frames) > 0:
-            # Add threshold lines to both panels with layer='above' to show over spectrograms
             fig.add_hline(
                 y=threshold,
                 line=dict(color='cyan', width=2, dash='dash'),
-                layer='above',
                 annotation_text=f'Threshold: {threshold:.2f}',
                 annotation_position="top right",
                 row=1, col=1, secondary_y=True
@@ -788,7 +751,6 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
             fig.add_hline(
                 y=threshold,
                 line=dict(color='cyan', width=2, dash='dash'),
-                layer='above',
                 annotation_text=f'Threshold: {threshold:.2f}',
                 annotation_position="top right",
                 row=2, col=1, secondary_y=True
@@ -878,7 +840,6 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
             height=500,
             title_text="Real-Time Speech Visualizer",
             showlegend=True,
-            uirevision="const",  # Preserve zoom/pan when updating
             legend=dict(
                 x=1.02,
                 y=1,
@@ -960,45 +921,21 @@ class VADDemo:
             selected_models = list(set([model_a, model_b]))
-            # Process with model-specific hop rates for efficiency
             for i in range(0, len(processed_audio) - window_samples, hop_samples):
                 timestamp = i / self.processor.sample_rate
                 chunk = processed_audio[i:i + window_samples]
                 for model_name in selected_models:
                     if model_name in self.models:
-                        # Check if this model should be processed at this timestamp
-                        model_hop_rate = self.processor.model_hop_rates.get(model_name, self.processor.hop_size)
-                        hop_samples_model = int(model_hop_rate * self.processor.sample_rate)
-                        # Only process if this is the right time for this model
-                        if i % hop_samples_model == 0:
-                            # Special handling for AST - pass full audio for context
-                            if model_name == 'AST':
-                                result = self.models[model_name].predict(chunk, timestamp, full_audio=processed_audio)
-                            else:
-                                result = self.models[model_name].predict(chunk, timestamp)
-                            result.is_speech = result.probability > threshold
-                            vad_results.append(result)
-                        elif len(vad_results) > 0:
-                            # Interpolate from last result for missing timestamps
-                            last_result = None
-                            for prev_result in reversed(vad_results):
-                                if prev_result.model_name == model_name:
-                                    last_result = prev_result
-                                    break
-                            if last_result:
-                                # Create interpolated result
-                                result = VADResult(
-                                    last_result.probability,
-                                    last_result.probability > threshold,
-                                    model_name,
-                                    0.0,  # No processing time for interpolated
-                                    timestamp
-                                )
-                                vad_results.append(result)
             delay_compensation = self.processor.estimate_delay_compensation(processed_audio, vad_results)
             onsets_offsets = self.processor.detect_onset_offset_advanced(vad_results, threshold)
@@ -1059,13 +996,7 @@ demo_app = VADDemo()
 # ===== GRADIO INTERFACE =====
-# Global demo app instance (will be initialized in main)
-demo_app = None
 def create_interface():
-    # Use global demo_app instance
-    global demo_app
     # Load logos
     logos = load_logos()
@@ -1174,12 +1105,9 @@ def create_interface():
 # Create and launch interface
 if __name__ == "__main__":
-    # Initialize demo (single instance)
     print("🎤 Initializing VAD Demo...")
     demo_app = VADDemo()
     interface = create_interface()
-    interface.launch(share=True, debug=False)
-else:
-    # For module imports, create a placeholder
-    demo_app = None

     LIBROSA_AVAILABLE = False
     print("⚠️ Librosa not available, using scipy fallback")
 try:
     import webrtcvad
     WEBRTC_AVAILABLE = True
     def __init__(self):
         self.model_name = "E-PANNs"
         self.sample_rate = 32000
         print(f"✅ {self.model_name} initialized")
     def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
             if len(audio.shape) > 1:
                 audio = audio.mean(axis=1)
+            # Convert audio to target sample rate for E-PANNs
             if LIBROSA_AVAILABLE:
+                # Resample to E-PANNs sample rate if needed
+                audio_resampled = librosa.resample(audio.astype(float),
+                                                 orig_sr=16000,
+                                                 target_sr=self.sample_rate)
                 mel_spec = librosa.feature.melspectrogram(y=audio_resampled, sr=self.sample_rate, n_mels=64)
                 energy = np.mean(librosa.power_to_db(mel_spec, ref=np.max))
                 spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio_resampled, sr=self.sample_rate))
         self.sample_rate = 32000
         self.model = None
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.load_model()
     def load_model(self):
             if len(audio.shape) > 1:
                 audio = audio.mean(axis=1)
+            # Convert audio to PANNs sample rate
+            if LIBROSA_AVAILABLE:
+                audio_resampled = librosa.resample(audio.astype(float),
+                                                 orig_sr=16000,
+                                                 target_sr=self.sample_rate)
+            else:
+                # Simple resampling fallback
+                resample_factor = self.sample_rate / 16000
+                audio_resampled = np.interp(
+                    np.linspace(0, len(audio) - 1, int(len(audio) * resample_factor)),
+                    np.arange(len(audio)),
+                    audio
+                )
             # Ensure minimum length for PANNs (need at least 1 second)
             min_samples = self.sample_rate  # 1 second
         self.model = None
         self.feature_extractor = None
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.load_model()
     def load_model(self):
         start_time = time.time()
         if self.model is None or len(audio) == 0:
+            # Enhanced fallback using spectral features
             if len(audio) > 0:
                 energy = np.sum(audio ** 2)
+                if LIBROSA_AVAILABLE:
+                    spectral_features = librosa.feature.spectral_rolloff(y=audio, sr=self.sample_rate)
+                    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=self.sample_rate))
+                    # Combine multiple features for better speech detection
+                    probability = min((energy * 100 + spectral_centroid / 500) / 2, 1.0)
+                else:
+                    probability = min(energy * 50, 1.0)
+                is_speech = probability > 0.3
             else:
                 probability = 0.0
                 is_speech = False
             return VADResult(probability, is_speech, f"{self.model_name} (fallback)", time.time() - start_time, timestamp)
         try:
             if len(audio.shape) > 1:
                 audio = audio.mean(axis=1)
+            # Use longer context for AST - take from full audio if available
+            if full_audio is not None and len(full_audio) > self.sample_rate:
+                # Take 3-second window centered around current timestamp
                 center_pos = int(timestamp * self.sample_rate)
+                window_size = int(1.5 * self.sample_rate)  # 1.5 seconds each side
                 start_pos = max(0, center_pos - window_size)
                 end_pos = min(len(full_audio), center_pos + window_size)
+                # Ensure we have at least 1 second
+                if end_pos - start_pos < self.sample_rate:
+                    end_pos = min(len(full_audio), start_pos + self.sample_rate)
                 audio_for_ast = full_audio[start_pos:end_pos]
             else:
+                audio_for_ast = audio
+            # Ensure minimum length for AST
+            if len(audio_for_ast) < self.sample_rate:
+                audio_for_ast = np.pad(audio_for_ast, (0, self.sample_rate - len(audio_for_ast)), 'constant')
             # Feature extraction with proper AST parameters
             inputs = self.feature_extractor(
                 audio_for_ast,
                 sampling_rate=self.sample_rate,
                 return_tensors="pt",
+                max_length=1024,  # Proper AST context
                 truncation=True
             )
             if speech_indices:
                 speech_prob = probs[0, speech_indices].mean().item()
+                # Boost the probability if it's too low but there's clear audio content
+                if speech_prob < 0.1 and np.sum(audio_for_ast ** 2) > 0.001:
+                    speech_prob = min(speech_prob * 5, 0.8)  # Boost but cap at 0.8
             else:
+                # Fallback to energy-based detection
+                energy = np.sum(audio_for_ast ** 2)
+                speech_prob = min(energy * 20, 1.0)
             return VADResult(float(speech_prob), speech_prob > 0.4, self.model_name, time.time()-start_time, timestamp)
         except Exception as e:
             print(f"Error in {self.model_name}: {e}")
+            # Enhanced fallback
             if len(audio) > 0:
                 energy = np.sum(audio ** 2)
+                probability = min(energy * 30, 1.0)  # More aggressive energy scaling
+                is_speech = energy > 0.002
             else:
                 probability = 0.0
                 is_speech = False
         self.window_size = 0.064
         self.hop_size = 0.032
         self.delay_compensation = 0.0
         self.correlation_threshold = 0.7
     def process_audio(self, audio):
         if audio is None:
             return np.array([])
         try:
             if isinstance(audio, tuple):
                 sample_rate, audio_data = audio
+                if sample_rate != self.sample_rate and LIBROSA_AVAILABLE:
+                    audio_data = librosa.resample(audio_data.astype(float),
+                                                orig_sr=sample_rate,
+                                                target_sr=self.sample_rate)
             else:
                 audio_data = audio
             if len(audio_data.shape) > 1:
                 audio_data = audio_data.mean(axis=1)
+            if np.max(np.abs(audio_data)) > 0:
+                audio_data = audio_data / np.max(np.abs(audio_data))
             return audio_data
         )
         if len(time_frames) > 0:
+            # Add threshold lines to both panels
             fig.add_hline(
                 y=threshold,
                 line=dict(color='cyan', width=2, dash='dash'),
                 annotation_text=f'Threshold: {threshold:.2f}',
                 annotation_position="top right",
                 row=1, col=1, secondary_y=True
             fig.add_hline(
                 y=threshold,
                 line=dict(color='cyan', width=2, dash='dash'),
                 annotation_text=f'Threshold: {threshold:.2f}',
                 annotation_position="top right",
                 row=2, col=1, secondary_y=True
             height=500,
             title_text="Real-Time Speech Visualizer",
             showlegend=True,
             legend=dict(
                 x=1.02,
                 y=1,
             selected_models = list(set([model_a, model_b]))
+            # Process each window individually for all models
             for i in range(0, len(processed_audio) - window_samples, hop_samples):
                 timestamp = i / self.processor.sample_rate
                 chunk = processed_audio[i:i + window_samples]
                 for model_name in selected_models:
                     if model_name in self.models:
+                        # Special handling for AST - pass full audio for context
+                        if model_name == 'AST':
+                            result = self.models[model_name].predict(chunk, timestamp, full_audio=processed_audio)
+                        else:
+                            result = self.models[model_name].predict(chunk, timestamp)
+                        result.is_speech = result.probability > threshold
+                        vad_results.append(result)
             delay_compensation = self.processor.estimate_delay_compensation(processed_audio, vad_results)
             onsets_offsets = self.processor.detect_onset_offset_advanced(vad_results, threshold)
 # ===== GRADIO INTERFACE =====
 def create_interface():
     # Load logos
     logos = load_logos()
 # Create and launch interface
 if __name__ == "__main__":
+    # Initialize demo
     print("🎤 Initializing VAD Demo...")
     demo_app = VADDemo()
     interface = create_interface()
+    interface.launch(share=True, debug=False)