Spaces:

gbibbo
/

vad_demo

Sleeping

App Files Files Community

Gabriel Bibbó committed on Aug 5, 2025

Commit

5bbaead

1 Parent(s): bcae560

Hotfix: Restore basic functionality - fix AST saturation and PANNs execution

Browse files

Files changed (1) hide show

app.py +79 -37

app.py CHANGED Viewed

@@ -362,6 +362,8 @@ class OptimizedAST:
         self.model = None
         self.feature_extractor = None
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.load_model()
     def load_model(self):
@@ -401,29 +403,52 @@ class OptimizedAST:
             return VADResult(probability, is_speech, f"{self.model_name} (fallback)", time.time() - start_time, timestamp)
         try:
             if len(audio.shape) > 1:
                 audio = audio.mean(axis=1)
-            # Use longer context for AST - take from full audio if available
-            if full_audio is not None and len(full_audio) > self.sample_rate:
-                # Take 3-second window centered around current timestamp
                 center_pos = int(timestamp * self.sample_rate)
-                window_size = int(1.5 * self.sample_rate)  # 1.5 seconds each side
                 start_pos = max(0, center_pos - window_size)
                 end_pos = min(len(full_audio), center_pos + window_size)
-                # Ensure we have at least 1 second
-                if end_pos - start_pos < self.sample_rate:
-                    end_pos = min(len(full_audio), start_pos + self.sample_rate)
                 audio_for_ast = full_audio[start_pos:end_pos]
             else:
                 audio_for_ast = audio
-            # Ensure minimum length for AST
-            if len(audio_for_ast) < self.sample_rate:
-                audio_for_ast = np.pad(audio_for_ast, (0, self.sample_rate - len(audio_for_ast)), 'constant')
             # Feature extraction with proper AST parameters
             inputs = self.feature_extractor(
@@ -452,23 +477,33 @@ class OptimizedAST:
             if speech_indices:
                 speech_prob = probs[0, speech_indices].mean().item()
-                # Boost the probability if it's too low but there's clear audio content
                 if speech_prob < 0.1 and np.sum(audio_for_ast ** 2) > 0.001:
-                    speech_prob = min(speech_prob * 5, 0.8)  # Boost but cap at 0.8
             else:
-                # Fallback to energy-based detection
-                energy = np.sum(audio_for_ast ** 2)
-                speech_prob = min(energy * 20, 1.0)
-            return VADResult(float(speech_prob), speech_prob > 0.4, self.model_name, time.time()-start_time, timestamp)
         except Exception as e:
             print(f"Error in {self.model_name}: {e}")
             # Enhanced fallback
             if len(audio) > 0:
-                energy = np.sum(audio ** 2)
-                probability = min(energy * 30, 1.0)  # More aggressive energy scaling
-                is_speech = energy > 0.002
             else:
                 probability = 0.0
                 is_speech = False
@@ -491,6 +526,15 @@ class AudioProcessor:
         self.window_size = 0.064
         self.hop_size = 0.032
         self.delay_compensation = 0.0
         self.correlation_threshold = 0.7
@@ -921,21 +965,24 @@ class VADDemo:
             selected_models = list(set([model_a, model_b]))
-            # Process each window individually for all models
             for i in range(0, len(processed_audio) - window_samples, hop_samples):
                 timestamp = i / self.processor.sample_rate
                 chunk = processed_audio[i:i + window_samples]
                 for model_name in selected_models:
                     if model_name in self.models:
-                        # Special handling for AST - pass full audio for context
-                        if model_name == 'AST':
-                            result = self.models[model_name].predict(chunk, timestamp, full_audio=processed_audio)
-                        else:
-                            result = self.models[model_name].predict(chunk, timestamp)
-                        result.is_speech = result.probability > threshold
-                        vad_results.append(result)
             delay_compensation = self.processor.estimate_delay_compensation(processed_audio, vad_results)
             onsets_offsets = self.processor.detect_onset_offset_advanced(vad_results, threshold)
@@ -990,10 +1037,6 @@ class VADDemo:
             traceback.print_exc()
             return None, f"❌ Error: {str(e)}", f"Error details: {traceback.format_exc()}"
-# Initialize demo
-print("🎤 Initializing VAD Demo...")
-demo_app = VADDemo()
 # ===== GRADIO INTERFACE =====
 def create_interface():
@@ -1053,7 +1096,7 @@ def create_interface():
                 model_b = gr.Dropdown(
                     choices=["Silero-VAD", "WebRTC-VAD", "E-PANNs", "PANNs", "AST"],
-                    value="PANNs",
                     label="Model B (Bottom Panel)"
                 )
@@ -1103,11 +1146,10 @@ def create_interface():
     return interface
 # Create and launch interface
 if __name__ == "__main__":
-    # Initialize demo
-    print("🎤 Initializing VAD Demo...")
-    demo_app = VADDemo()
     interface = create_interface()
     interface.launch(share=True, debug=False)

         self.model = None
         self.feature_extractor = None
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.prediction_cache = {}  # Cache to avoid recomputation
+        self.cache_window = 1.0  # Cache results per one-second window
         self.load_model()
     def load_model(self):
             return VADResult(probability, is_speech, f"{self.model_name} (fallback)", time.time() - start_time, timestamp)
         try:
+            # Cache key based on timestamp rounded to cache window
+            cache_key = int(timestamp / self.cache_window)
+            # Check cache first
+            if cache_key in self.prediction_cache:
+                cached_result = self.prediction_cache[cache_key]
+                # Return cached result with updated timestamp
+                return VADResult(
+                    cached_result.probability,
+                    cached_result.is_speech,
+                    cached_result.model_name + " (cached)",
+                    time.time() - start_time,
+                    timestamp
+                )
             if len(audio.shape) > 1:
                 audio = audio.mean(axis=1)
+            # Use longer context for AST - preferably 2 seconds
+            if full_audio is not None and len(full_audio) >= 2 * self.sample_rate:
+                # Take 2-second window centered around current timestamp
                 center_pos = int(timestamp * self.sample_rate)
+                window_size = self.sample_rate  # 1 second each side
                 start_pos = max(0, center_pos - window_size)
                 end_pos = min(len(full_audio), center_pos + window_size)
+                # Ensure we have at least 2 seconds
+                if end_pos - start_pos < 2 * self.sample_rate:
+                    end_pos = min(len(full_audio), start_pos + 2 * self.sample_rate)
+                    if end_pos - start_pos < 2 * self.sample_rate:
+                        start_pos = max(0, end_pos - 2 * self.sample_rate)
                 audio_for_ast = full_audio[start_pos:end_pos]
             else:
                 audio_for_ast = audio
+            # Ensure minimum length for AST (zero-pad anything shorter than 2 seconds)
+            min_samples = 2 * self.sample_rate  # 2 seconds
+            if len(audio_for_ast) < min_samples:
+                audio_for_ast = np.pad(audio_for_ast, (0, min_samples - len(audio_for_ast)), 'constant')
+            # Truncate if too long (AST can handle up to ~10s, but we'll use 3s max for efficiency)
+            max_samples = 3 * self.sample_rate
+            if len(audio_for_ast) > max_samples:
+                audio_for_ast = audio_for_ast[:max_samples]
             # Feature extraction with proper AST parameters
             inputs = self.feature_extractor(
             if speech_indices:
                 speech_prob = probs[0, speech_indices].mean().item()
+                # Apply more reasonable thresholding for AST
                 if speech_prob < 0.1 and np.sum(audio_for_ast ** 2) > 0.001:
+                    speech_prob = min(speech_prob * 3, 0.7)  # Moderate boost, cap at 0.7
             else:
+                # Fallback to energy-based detection with higher threshold
+                energy = np.sum(audio_for_ast ** 2) / len(audio_for_ast)  # Normalize by length
+                speech_prob = min(energy * 50, 1.0)
+            result = VADResult(float(speech_prob), speech_prob > 0.4, self.model_name, time.time()-start_time, timestamp)
+            # Cache the result
+            self.prediction_cache[cache_key] = result
+            # Clean old cache entries (keep only last 10 seconds)
+            cache_keys_to_remove = [k for k in self.prediction_cache.keys() if k < cache_key - 10]
+            for k in cache_keys_to_remove:
+                del self.prediction_cache[k]
+            return result
         except Exception as e:
             print(f"Error in {self.model_name}: {e}")
             # Enhanced fallback
             if len(audio) > 0:
+                energy = np.sum(audio ** 2) / len(audio)  # Normalize by length
+                probability = min(energy * 100, 1.0)  # More conservative scaling
+                is_speech = energy > 0.001  # Lower threshold for fallback
             else:
                 probability = 0.0
                 is_speech = False
         self.window_size = 0.064
         self.hop_size = 0.032
+        # Model-specific hop sizes for efficiency
+        self.model_hop_sizes = {
+            "Silero-VAD": 0.032,
+            "WebRTC-VAD": 0.03,
+            "E-PANNs": 1.0,
+            "PANNs": 1.0,
+            "AST": 1.0  # Process AST only once per second
+        }
         self.delay_compensation = 0.0
         self.correlation_threshold = 0.7
             selected_models = list(set([model_a, model_b]))
+            # Process each window with model-specific hop sizes for efficiency
             for i in range(0, len(processed_audio) - window_samples, hop_samples):
                 timestamp = i / self.processor.sample_rate
                 chunk = processed_audio[i:i + window_samples]
                 for model_name in selected_models:
                     if model_name in self.models:
+                        # Check if we should process this model at this timestamp
+                        model_hop = self.processor.model_hop_sizes.get(model_name, self.processor.hop_size)
+                        if i % int(model_hop * self.processor.sample_rate) == 0:
+                            # Special handling for AST - pass full audio for context
+                            if model_name == 'AST':
+                                result = self.models[model_name].predict(chunk, timestamp, full_audio=processed_audio)
+                            else:
+                                result = self.models[model_name].predict(chunk, timestamp)
+                            result.is_speech = result.probability > threshold
+                            vad_results.append(result)
             delay_compensation = self.processor.estimate_delay_compensation(processed_audio, vad_results)
             onsets_offsets = self.processor.detect_onset_offset_advanced(vad_results, threshold)
             traceback.print_exc()
             return None, f"❌ Error: {str(e)}", f"Error details: {traceback.format_exc()}"
 # ===== GRADIO INTERFACE =====
 def create_interface():
                 model_b = gr.Dropdown(
                     choices=["Silero-VAD", "WebRTC-VAD", "E-PANNs", "PANNs", "AST"],
+                    value="AST",
                     label="Model B (Bottom Panel)"
                 )
     return interface
+# Initialize demo only once
+demo_app = VADDemo()
 # Create and launch interface
 if __name__ == "__main__":
     interface = create_interface()
     interface.launch(share=True, debug=False)