Gabriel Bibbó
committed on
Commit · aee7b20
1 Parent(s): d02d086
adjust app.py
app.py
CHANGED
@@ -260,6 +260,7 @@ class OptimizedEPANNs:
     def __init__(self):
         self.model_name = "E-PANNs"
         self.sample_rate = 32000
+        self.win_s = 1.0  # CHANGED from 6.0 to 1.0 for better temporal resolution
         print(f"✅ {self.model_name} initialized")

         # Try to load PANNs AudioTagging as backend for E-PANNs
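For scale, the new 1.0 s analysis window at E-PANNs' 32 kHz input rate means far fewer samples per inference than the previous 6.0 s setting mentioned in the comment. A quick check, plain arithmetic rather than code from app.py:

```python
sample_rate = 32000              # E-PANNs input rate set in __init__
old_win_s, new_win_s = 6.0, 1.0  # old value taken from the "CHANGED from 6.0" comment

print(int(old_win_s * sample_rate))  # 192000 samples per window before this commit
print(int(new_win_s * sample_rate))  # 32000 samples per window after
```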
@@ -281,92 +282,50 @@ class OptimizedEPANNs:
        if len(audio.shape) > 1:
            audio = audio.mean(axis=1)

-        # Adjust start if we're at the end of audio
-        if end_idx == len(audio) and end_idx - start_idx < window_samples:
-            start_idx = max(0, end_idx - window_samples)
-
-        audio_window = audio[start_idx:end_idx]
-
-        # Convert audio to target sample rate for E-PANNs (32kHz)
-        if LIBROSA_AVAILABLE:
-            # Resample to E-PANNs sample rate
-            audio_resampled = librosa.resample(audio_window.astype(float),
-                                               orig_sr=16000,
-                                               target_sr=self.sample_rate)
-
-                'male speech', 'female speech', 'child speech',
-                'narration', 'monologue'
-            ]
-
-            speech_indices = []
-            for i, lbl in enumerate(labels):
-                if any(word in lbl.lower() for word in speech_keywords):
-                    speech_indices.append(i)
-
-            if speech_indices:
-                speech_probs = clipwise_output[0, speech_indices]
-                speech_score = float(np.max(speech_probs))
-            else:
-                speech_score = float(np.max(clipwise_output[0]))
-        else:
-            energy = np.mean(librosa.power_to_db(mel_spec, ref=np.max))
-
-            # Use actual non-repeated audio for some features
-            actual_audio_len = min(len(audio_resampled), int(len(audio_window) * self.sample_rate / 16000))
-            actual_audio = audio_resampled[:actual_audio_len]
-
-            spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=actual_audio, sr=self.sample_rate))
-            mfcc = librosa.feature.mfcc(y=actual_audio, sr=self.sample_rate, n_mfcc=13)
-            mfcc_var = np.var(mfcc, axis=1).mean()
-            zcr = np.mean(librosa.feature.zero_crossing_rate(actual_audio))
-
-            # Adjusted scaling for better speech detection
-            energy_score = np.clip((energy + 80) / 40, 0, 1)
-            centroid_score = np.clip((spectral_centroid - 200) / 3000, 0, 1)
-
-            speech_score = (energy_score * 0.4 +
-                            centroid_score * 0.2 +
-                            mfcc_score * 0.3 +
-                            zcr_score * 0.1)
-        else:
-            from scipy import signal
-            # Basic fallback without librosa
-            f, t, Sxx = signal.spectrogram(audio_window, 16000)
-            energy = np.mean(10 * np.log10(Sxx + 1e-10))
-            speech_score = np.clip((energy + 100) / 50, 0, 1)
+        # CORRECTED: Work with the chunk directly, no more extracting windows
+        # The audio passed is already the chunk for this timestamp
+        x = safe_resample(audio, 16000, self.sample_rate)
+
+        # Pad to minimum window size if needed (no repeating)
+        min_samples = int(self.sample_rate * self.win_s)
+        if len(x) < min_samples:
+            x = np.pad(x, (0, min_samples - len(x)), mode='constant')
+
+        # If we have PANNs AT model, use it
+        if self.at_model is not None:
+            # Run inference
+            clipwise_output, _ = self.at_model.inference(x[np.newaxis, :])
+
+            # Get speech-related classes
+            speech_keywords = [
+                'speech', 'voice', 'talk', 'conversation', 'speaking',
+                'male speech', 'female speech', 'child speech',
+                'narration', 'monologue', 'speech synthesizer'
+            ]
+
+            speech_indices = []
+            for i, lbl in enumerate(labels):
+                if any(word in lbl.lower() for word in speech_keywords):
+                    speech_indices.append(i)
+
+            if speech_indices:
+                speech_probs = clipwise_output[0, speech_indices]
+                speech_score = float(np.max(speech_probs))
+            else:
+                speech_score = float(np.max(clipwise_output[0]))
+        else:
+            # Fallback to spectral features
+            if LIBROSA_AVAILABLE:
+                mel_spec = librosa.feature.melspectrogram(y=x, sr=self.sample_rate, n_mels=64)
+                energy = np.mean(librosa.power_to_db(mel_spec, ref=np.max))
+                spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=x, sr=self.sample_rate))

+                energy_score = np.clip((energy + 80) / 40, 0, 1)
+                centroid_score = np.clip((spectral_centroid - 200) / 3000, 0, 1)
+                speech_score = energy_score * 0.7 + centroid_score * 0.3
+            else:
+                energy = np.sum(x ** 2) / len(x)
+                speech_score = min(energy * 50, 1.0)

        probability = np.clip(speech_score, 0, 1)
        is_speech = probability > 0.4
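The new E-PANNs path calls `safe_resample`, which is not shown in this diff. A minimal sketch of what such a helper could look like, assuming it prefers librosa and falls back to linear interpolation when librosa is missing (names and behaviour here are assumptions, not taken from app.py):

```python
import numpy as np

def safe_resample(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    """Hypothetical resampling helper: librosa if available, linear interpolation otherwise."""
    if orig_sr == target_sr:
        return audio.astype(np.float32)
    try:
        import librosa
        return librosa.resample(audio.astype(float), orig_sr=orig_sr, target_sr=target_sr)
    except ImportError:
        factor = target_sr / orig_sr
        n_out = int(len(audio) * factor)
        return np.interp(
            np.linspace(0, len(audio) - 1, n_out),  # output sample positions
            np.arange(len(audio)),                  # input sample positions
            audio,
        ).astype(np.float32)
```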
@@ -432,54 +391,25 @@ class OptimizedPANNs:
        if len(audio.shape) > 1:
            audio = audio.mean(axis=1)

-        window_duration = 10.0  # 10 seconds window for PANNs
-        window_samples = int(window_duration * 16000)  # at 16kHz input rate
-
-        # Calculate the center position for this timestamp
-        center_sample = int(timestamp * 16000)
-        half_window = window_samples // 2
-
-        # Extract window centered at timestamp
-        start_idx = max(0, center_sample - half_window)
-        end_idx = min(len(audio), start_idx + window_samples)
-
-        # Adjust start if we're at the end of audio
-        if end_idx == len(audio) and end_idx - start_idx < window_samples:
-            start_idx = max(0, end_idx - window_samples)
-
-        audio_window = audio[start_idx:end_idx]
-
+        # CORRECTED: Work with the chunk directly
        # Convert audio to PANNs sample rate
        if LIBROSA_AVAILABLE:
-            audio_resampled = librosa.resample(
+            audio_resampled = librosa.resample(audio.astype(float),
                                                orig_sr=16000,
                                                target_sr=self.sample_rate)
        else:
            # Simple resampling fallback
            resample_factor = self.sample_rate / 16000
            audio_resampled = np.interp(
-                np.linspace(0, len(
-                np.arange(len(
+                np.linspace(0, len(audio) - 1, int(len(audio) * resample_factor)),
+                np.arange(len(audio)),
+                audio
            )

-        # For short audio,
-        min_samples =
+        # For short audio, pad (no repeating)
+        min_samples = 1 * self.sample_rate  # 1 second minimum
        if len(audio_resampled) < min_samples:
-            num_repeats = int(np.ceil(min_samples / len(audio_resampled)))
-            audio_repeated = np.tile(audio_resampled, num_repeats)[:min_samples]
-
-            # Apply fade in/out to reduce artifacts
-            fade_len = int(0.1 * self.sample_rate)  # 100ms fade
-            fade_in = np.linspace(0, 1, fade_len)
-            fade_out = np.linspace(1, 0, fade_len)
-
-            audio_repeated[:fade_len] *= fade_in
-            audio_repeated[-fade_len:] *= fade_out
-
-            audio_resampled = audio_repeated
+            audio_resampled = np.pad(audio_resampled, (0, min_samples - len(audio_resampled)), mode='constant')

        # Use SED for framewise predictions if available
        if self.sed_model is not None:
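The behavioural change in this hunk (zero-pad short chunks instead of tiling them with fades) can be exercised in isolation; `sample_rate` below is an assumed stand-in for `self.sample_rate`:

```python
import numpy as np

sample_rate = 32000                       # assumed PANNs input rate
audio_resampled = np.random.randn(8000)   # e.g. a 0.25 s chunk after resampling

min_samples = 1 * sample_rate             # 1 second minimum, as in the diff
if len(audio_resampled) < min_samples:
    # New behaviour: pad the tail with zeros instead of repeating the chunk
    audio_resampled = np.pad(audio_resampled,
                             (0, min_samples - len(audio_resampled)),
                             mode='constant')

print(audio_resampled.shape)  # (32000,)
```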
@@ -492,13 +422,8 @@ class OptimizedPANNs:
            if framewise_output.ndim == 3:
                framewise_output = framewise_output[0]  # Remove batch dimension

-            # Get frame corresponding to
-            if audio_duration > 0:
-                frame_idx = int((timestamp % audio_duration) / audio_duration * framewise_output.shape[0])
-                frame_idx = min(frame_idx, framewise_output.shape[0] - 1)
-            else:
-                frame_idx = 0
+            # Get middle frame (corresponding to center of window)
+            frame_idx = framewise_output.shape[0] // 2

            # Get speech-related classes
            speech_keywords = [
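The middle-frame lookup replaces the old timestamp-proportional index: since each chunk is already centered on its timestamp, the center frame of the framewise output is the one that corresponds to it. A toy illustration (the array shape is an assumption, not taken from app.py):

```python
import numpy as np

# Suppose the SED head returns 101 frames x 527 AudioSet classes for one chunk
framewise_output = np.random.rand(101, 527)

frame_idx = framewise_output.shape[0] // 2   # middle frame = center of the window
frame_probs = framewise_output[frame_idx]    # per-class probabilities at the chunk center
print(frame_idx, frame_probs.shape)          # 50 (527,)
```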
@@ -551,11 +476,6 @@ class OptimizedPANNs:
                    noise_prob = np.mean(clip_probs[0, noise_indices])
                    # Adjust speech probability based on noise
                    speech_prob = speech_prob * (1 - noise_prob * 0.5)
-
-                    # If using repeated audio, scale confidence based on original length
-                    if len(audio_window) < 16000 * 2:  # Less than 2 seconds
-                        confidence_scale = len(audio_window) / (16000 * 2)
-                        speech_prob = speech_prob * (0.5 + 0.5 * confidence_scale)

            else:
                # Fallback if no speech indices found
@@ -579,15 +499,14 @@ class OptimizedPANNs:
            return VADResult(probability, is_speech, f"{self.model_name} (error)", time.time() - start_time, timestamp)

class OptimizedAST:
-    """CORRECTED AST with proper 16kHz sample rate and
+    """CORRECTED AST with proper 16kHz sample rate and NO CACHE"""
    def __init__(self):
        self.model_name = "AST"
        self.sample_rate = 16000  # AST REQUIRES 16kHz
        self.model = None
        self.feature_extractor = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
-        self.cache_window = 1.0  # Cache results once per second
+        # NO CACHE - removed cache_window and prediction_cache
        self.load_model()

    def load_model(self):
@@ -616,12 +535,7 @@ class OptimizedAST:
    def predict(self, audio: np.ndarray, timestamp: float = 0.0, full_audio: np.ndarray = None) -> VADResult:
        start_time = time.time()

-        print(f"🔍 AST predict: audio_len={len(audio)}, timestamp={timestamp:.2f}, model_available={self.model is not None}")
-        if full_audio is not None:
-            print(f"🔍 AST: full_audio_len={len(full_audio)}")
-
        if self.model is None or len(audio) == 0:
-            print(f"❌ AST: Model unavailable or empty audio")
            # Enhanced fallback using spectral features
            if len(audio) > 0:
                energy = np.sum(audio ** 2)
@@ -630,10 +544,8 @@ class OptimizedAST:
                    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=self.sample_rate))
                    # Combine multiple features for better speech detection
                    probability = min((energy * 100 + spectral_centroid / 1000) / 2, 1.0)
-                    print(f"🔄 AST fallback: energy={energy:.6f}, centroid={spectral_centroid:.1f}, prob={probability:.4f}")
                else:
                    probability = min(energy * 50, 1.0)
-                    print(f"🔄 AST fallback (simple): energy={energy:.6f}, prob={probability:.4f}")
                is_speech = probability > 0.25  # Use AST threshold
            else:
                probability = 0.0
@@ -641,91 +553,39 @@ class OptimizedAST:
            return VADResult(probability, is_speech, f"{self.model_name} (fallback)", time.time() - start_time, timestamp)

        try:
-            cache_key = int(timestamp / self.cache_window)
-
-            # Check cache first
-            if cache_key in self.prediction_cache:
-                cached_result = self.prediction_cache[cache_key]
-                print(f"✅ AST: Using cached result for t={timestamp:.2f}s")
-                # Return cached result with updated timestamp
-                return VADResult(
-                    cached_result.probability,
-                    cached_result.is_speech,
-                    cached_result.model_name + " (cached)",
-                    time.time() - start_time,
-                    timestamp
-                )
+            # NO CACHE - removed all cache-related code

            if len(audio.shape) > 1:
                audio = audio.mean(axis=1)
-                print(f"🔄 AST: Converted to mono")
-
-            # CRITICAL FIX: AST uses 16kHz, but input is already at 16kHz
-            # So we DON'T need to resample, just ensure it's float32
-            audio = audio.astype(np.float32)
-
-            # Use sliding window approach for temporal resolution
-            window_duration = 1.0  # 1 second windows
-            window_samples = int(window_duration * self.sample_rate)
-
-            # Get window for this timestamp
-            center_sample = int(timestamp * self.sample_rate)
-            half_window = window_samples // 2
-
-            if end_idx == len(audio) and end_idx - start_idx < window_samples:
-                start_idx = max(0, end_idx - window_samples)
-
-            audio_for_ast = audio[start_idx:end_idx]
-            print(f"🔄 AST: Extracted window [{start_idx}:{end_idx}], len={len(audio_for_ast)}")
-
-            # For short audio, use intelligent strategy
+            # CRITICAL: AST uses 16kHz, input is already at 16kHz
+            audio_for_ast = audio.astype(np.float32)
+
+            # Pad to minimum 1 second if needed
            min_samples = int(1.0 * self.sample_rate)  # 1 second minimum
            if len(audio_for_ast) < min_samples:
-                audio_padded[:len(audio_for_ast)] = audio_for_ast
-                audio_for_ast = audio_padded
-                print(f"✅ AST: Padded to {len(audio_for_ast)} samples")
-
-            # Truncate if too long (AST can handle up to ~10s, but we use 1s windows)
-            max_samples = int(1.5 * self.sample_rate)
-            if len(audio_for_ast) > max_samples:
-                audio_for_ast = audio_for_ast[:max_samples]
-                print(f"✂️ AST: Truncated to {len(audio_for_ast)} samples")
-
-            print(f"🔄 AST: Feature extraction...")
-            # Feature extraction with proper AST parameters
+                audio_for_ast = np.pad(audio_for_ast, (0, min_samples - len(audio_for_ast)), mode='constant')
+
+            # Feature extraction with NO PADDING to 1024
            inputs = self.feature_extractor(
                audio_for_ast,
                sampling_rate=self.sample_rate,  # Must be 16kHz
                return_tensors="pt",
-                truncation=True
+                padding=False,     # CHANGED: No padding to 1024
+                truncation=False   # CHANGED: No truncation
            )

-            print(f"✅ AST: Features extracted, input_shape={[v.shape if hasattr(v, 'shape') else type(v) for v in inputs.values()]}")
-
            # Move inputs to correct device and dtype
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            if self.device.type == 'cuda' and hasattr(self.model, 'half'):
                # Convert inputs to FP16 if model is in FP16
                inputs = {k: v.half() if v.dtype == torch.float32 else v for k, v in inputs.items()}

-            print(f"🚀 AST: Running inference...")
            with torch.no_grad():
                outputs = self.model(**inputs)
                logits = outputs.logits
                probs = torch.sigmoid(logits)

-            print(f"✅ AST: Inference complete, logits_shape={logits.shape}, probs_shape={probs.shape}")
-
            # Find speech-related classes with enhanced keywords
            label2id = self.model.config.label2id
            speech_indices = []
@@ -739,8 +599,6 @@ class OptimizedAST:
                if any(word in lbl.lower() for word in speech_keywords):
                    speech_indices.append(idx)

-            print(f"🔍 AST: Found {len(speech_indices)} speech-related classes")
-
            # Also identify background/noise classes for better discrimination
            noise_keywords = ['silence', 'white noise', 'background']
            noise_indices = []
@@ -758,35 +616,16 @@ class OptimizedAST:
                    noise_prob = torch.mean(probs[0, noise_indices]).item()
                    # Reduce speech probability if high noise/silence detected
                    speech_prob = speech_prob * (1 - noise_prob * 0.3)
-
-                print(f"📈 AST: raw_speech_prob={speech_prob:.4f}")
-
-                # Adjust confidence for short audio
-                if len(audio) < self.sample_rate * 2:  # Less than 2 seconds
-                    confidence_factor = len(audio) / (self.sample_rate * 2)
-                    speech_prob = speech_prob * (0.6 + 0.4 * confidence_factor)
-                    print(f"🔧 AST: Adjusted for short audio, final_prob={speech_prob:.4f}")

            else:
                # Fallback to energy-based detection with better calibration
                energy = np.sum(audio_for_ast ** 2) / len(audio_for_ast)  # Normalize by length
                speech_prob = min(energy * 20, 1.0)  # Better scaling
-                print(f"⚠️ AST: No speech classes found, using energy fallback: energy={energy:.6f}, prob={speech_prob:.4f}")

            # Use lower threshold specifically for AST (0.25 instead of 0.4)
            is_speech_ast = speech_prob > 0.25
            result = VADResult(float(speech_prob), is_speech_ast, self.model_name, time.time()-start_time, timestamp)

-            print(f"✅ AST: final_prob={speech_prob:.4f}, is_speech={is_speech_ast}")
-
-            # Cache the result
-            self.prediction_cache[cache_key] = result
-
-            # Clean old cache entries (keep only last 30 seconds for longer sessions)
-            cache_keys_to_remove = [k for k in self.prediction_cache.keys() if k < cache_key - 30]
-            for k in cache_keys_to_remove:
-                del self.prediction_cache[k]
-
            return result

        except Exception as e:
@@ -798,7 +637,6 @@ class OptimizedAST:
            energy = np.sum(audio ** 2) / len(audio)  # Normalize by length
            probability = min(energy * 100, 1.0)  # More conservative scaling
            is_speech = energy > 0.001  # Lower threshold for fallback
-            print(f"🔄 AST error fallback: energy={energy:.6f}, prob={probability:.4f}")
        else:
            probability = 0.0
            is_speech = False
@@ -825,18 +663,18 @@ class AudioProcessor:
        self.model_windows = {
            "Silero-VAD": 0.032,  # 32ms exactly as required (512 samples)
            "WebRTC-VAD": 0.03,   # 30ms frames (480 samples)
-            "E-PANNs": 6.0,
-            "PANNs": 10.0,
-            "AST": 1.0  #
+            "E-PANNs": 1.0,  # CHANGED from 6.0 to 1.0 for better temporal resolution
+            "PANNs": 1.0,    # CHANGED from 10.0 to 1.0 for better temporal resolution
+            "AST": 1.0       # 1 second for better temporal resolution
        }

-        # Model-specific hop sizes for efficiency
+        # Model-specific hop sizes for efficiency - INCREASED to 20Hz
        self.model_hop_sizes = {
            "Silero-VAD": 0.016,  # 16ms hop for Silero (512 samples window)
            "WebRTC-VAD": 0.03,   # 30ms hop for WebRTC (match frame duration)
-            "E-PANNs": 0.1,
-            "PANNs": 0.1,
-            "AST": 0.1
+            "E-PANNs": 0.05,  # CHANGED from 0.1 to 0.05 for 20Hz
+            "PANNs": 0.05,    # CHANGED from 0.1 to 0.05 for 20Hz
+            "AST": 0.05       # CHANGED from 0.1 to 0.05 for 20Hz
        }

        # Model-specific thresholds for better detection
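For context, the hop size directly sets the output frame rate of the sliding-window VAD. A quick check of the rates implied by these settings (plain arithmetic, not code from app.py):

```python
# Frame rate implied by each hop size (frames per second = 1 / hop)
hops = {"Silero-VAD": 0.016, "WebRTC-VAD": 0.03, "E-PANNs": 0.05, "PANNs": 0.05, "AST": 0.05}
for name, hop in hops.items():
    print(f"{name}: {1.0 / hop:.1f} Hz")
# E-PANNs, PANNs and AST now run at 20 Hz; the previous 0.1 s hop gave 10 Hz.
```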
@@ -1346,78 +1184,42 @@ class VADDemo:

            model_results = []

-            # Always
-            # For models that need long context, we'll use the full audio padded/repeated as needed
-            # but report the timestamp based on the sliding window position
-            if window_count < 3:  # Log first 3 windows
-                debug_info.append(f"   🔄 Window {window_count}: t={timestamp:.2f}s")
-
-            # Special handling for different models
-            if model_name == 'AST':
-                result = self.models[model_name].predict(processed_audio, timestamp, full_audio=processed_audio)
-            else:
-                result = self.models[model_name].predict(processed_audio, timestamp)
-
-            if window_count < 3:  # Log first 3 results
-                debug_info.append(f"   📈 Result {window_count}: prob={result.probability:.4f}, speech={result.is_speech}")
-
-            # Use model-specific threshold
-            result.is_speech = result.probability > model_threshold
-            vad_results.append(result)
-            model_results.append(result)
-            window_count += 1
-
-            # Stop if we've gone past the audio length
-            if timestamp >= audio_duration:
-                break
-
-                start_pos = i
-                end_pos = min(len(processed_audio), i + window_samples)
-                chunk = processed_audio[start_pos:end_pos]
-
-                if window_count < 3:  # Log first 3 windows
-                    debug_info.append(f"   🔄 Window {window_count}: t={timestamp:.2f}s, size={len(chunk)}")
-
-                # Special handling for different models
-                if model_name == 'AST':
-                    result = self.models[model_name].predict(chunk, timestamp, full_audio=processed_audio)
-                else:
-                    result = self.models[model_name].predict(chunk, timestamp)
-
-                if window_count < 3:  # Log first 3 results
-                    debug_info.append(f"   📈 Result {window_count}: prob={result.probability:.4f}, speech={result.is_speech}")
-
-                # Use model-specific threshold
-                result.is_speech = result.probability > model_threshold
-                vad_results.append(result)
-                model_results.append(result)
-                window_count += 1
+            # CRITICAL FIX: Always extract chunks, both for short and long audio
+            window_count = 0
+            audio_duration = len(processed_audio) / self.processor.sample_rate
+
+            for i in range(0, len(processed_audio), hop_samples):
+                timestamp = i / self.processor.sample_rate
+
+                # CRITICAL: Extract the chunk centered on this timestamp
+                start_pos = max(0, i - window_samples // 2)
+                end_pos = min(len(processed_audio), start_pos + window_samples)
+                chunk = processed_audio[start_pos:end_pos]
+
+                # Pad if necessary (with zeros, not repeating)
+                if len(chunk) < window_samples:
+                    chunk = np.pad(chunk, (0, window_samples - len(chunk)), mode='constant')
+
+                if window_count < 3:  # Log first 3 windows
+                    debug_info.append(f"   🔄 Window {window_count}: t={timestamp:.2f}s, chunk_size={len(chunk)}")
+
+                # Call predict with the chunk
+                result = self.models[model_name].predict(chunk, timestamp)
+
+                if window_count < 3:  # Log first 3 results
+                    debug_info.append(f"   📈 Result {window_count}: prob={result.probability:.4f}, speech={result.is_speech}")
+
+                # Use model-specific threshold
+                result.is_speech = result.probability > model_threshold
+                vad_results.append(result)
+                model_results.append(result)
+                window_count += 1
+
+                # Stop if we've gone past the audio length
+                if timestamp >= audio_duration:
+                    break
+
+            debug_info.append(f"   🎯 Total windows processed: {window_count}")

            # Summary for this model
            if model_results:
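The centered-chunk extraction introduced here can be exercised on its own. The sketch below assumes a 16 kHz processing rate, a 1.0 s window and a 0.05 s hop (the values used elsewhere in this commit); the dummy signal is only for illustration:

```python
import numpy as np

sample_rate = 16000                      # assumed processing rate
window_samples = int(1.0 * sample_rate)  # 1.0 s analysis window
hop_samples = int(0.05 * sample_rate)    # 0.05 s hop -> 20 Hz frame rate

processed_audio = np.random.randn(3 * sample_rate)  # 3 s of dummy audio

chunks = []
for i in range(0, len(processed_audio), hop_samples):
    timestamp = i / sample_rate
    # Chunk centered on the current position, zero-padded at the edges
    start = max(0, i - window_samples // 2)
    end = min(len(processed_audio), start + window_samples)
    chunk = processed_audio[start:end]
    if len(chunk) < window_samples:
        chunk = np.pad(chunk, (0, window_samples - len(chunk)), mode='constant')
    chunks.append((timestamp, chunk))

print(len(chunks))         # 60 chunks for 3 s of audio at 20 Hz
print(chunks[0][1].shape)  # (16000,) each
```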
@@ -1594,7 +1396,7 @@ def create_interface():
    ---
    **Models**: Silero-VAD, WebRTC-VAD, E-PANNs, PANNs, AST | **Research**: WASPAA 2025 | **Institution**: University of Surrey, CVSSP

-    **Note**:
+    **Note**: All models now provide high temporal resolution (20Hz) for accurate real-time speech detection.
    """)

    return interface