Spaces:

gbibbo
/

vad_demo

Sleeping

App Files Files Community

Gabriel Bibbó committed on Aug 5

Commit

11719c2

1 Parent(s): d2d5f15

Hotfix: Restore basic functionality - fix AST saturation and PANNs execution

Browse files

Files changed (1) hide show

app.py +235 -86

app.py CHANGED Viewed

@@ -1,4 +1,150 @@
-import gradio as gr
 import numpy as np
 import torch
 import time
@@ -238,82 +384,77 @@ class OptimizedEPANNs:
         start_time = time.time()
         try:
-            print(f"🔍 E-PANNs predict: audio_len={len(audio)}, timestamp={timestamp:.2f}")
             if len(audio) == 0:
-                print("❌ E-PANNs: Empty audio")
                 return VADResult(0.0, False, self.model_name, time.time() - start_time, timestamp)
             if len(audio.shape) > 1:
                 audio = audio.mean(axis=1)
-                print(f"🔄 E-PANNs: Converted to mono, new_len={len(audio)}")
             # Convert audio to target sample rate for E-PANNs
             if LIBROSA_AVAILABLE:
-                print(f"🔄 E-PANNs: Resampling from 16kHz to {self.sample_rate}Hz")
-                # Resample to E-PANNs sample rate if needed
-                audio_resampled = librosa.resample(audio.astype(float),
                                                  orig_sr=16000,
                                                  target_sr=self.sample_rate)
-                print(f"✅ E-PANNs: Resampled, new_len={len(audio_resampled)}")
                 # For short audio, repeat it instead of padding with zeros
                 min_samples = 6 * self.sample_rate  # 6 seconds
                 if len(audio_resampled) < min_samples:
-                    print(f"⚠️ E-PANNs: Repeating audio from {len(audio_resampled)} to {min_samples} samples")
                     # Repeat the audio to fill the minimum required length
                     num_repeats = int(np.ceil(min_samples / len(audio_resampled)))
                     audio_resampled = np.tile(audio_resampled, num_repeats)[:min_samples]
-                    print(f"✅ E-PANNs: Repeated, final_len={len(audio_resampled)}")
-                print(f"🔄 E-PANNs: Computing features...")
-                # Compute features on the actual audio portion (not the repeated part)
-                actual_audio_len = min(len(audio_resampled), int(len(audio) * self.sample_rate / 16000))
                 actual_audio = audio_resampled[:actual_audio_len]
-                mel_spec = librosa.feature.melspectrogram(y=audio_resampled, sr=self.sample_rate, n_mels=64)
-                energy = np.mean(librosa.power_to_db(mel_spec, ref=np.max))
                 spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=actual_audio, sr=self.sample_rate))
-                # Better speech detection using multiple features
                 mfcc = librosa.feature.mfcc(y=actual_audio, sr=self.sample_rate, n_mfcc=13)
                 mfcc_var = np.var(mfcc, axis=1).mean()
-                # Zero crossing rate - important for speech detection
                 zcr = np.mean(librosa.feature.zero_crossing_rate(actual_audio))
-                print(f"📊 E-PANNs: energy={energy:.2f}, centroid={spectral_centroid:.1f}, mfcc_var={mfcc_var:.4f}, zcr={zcr:.4f}")
                 # Adjusted scaling for better speech detection
-                energy_score = np.clip((energy + 80) / 40, 0, 1)  # More sensitive to energy
-                centroid_score = np.clip((spectral_centroid - 200) / 3000, 0, 1)  # Better range for speech
-                mfcc_score = np.clip(mfcc_var / 100, 0, 1)  # Adjusted MFCC scaling
-                zcr_score = np.clip(zcr * 10, 0, 1)  # ZCR is typically 0.01-0.1 for speech
-                # Weighted combination favoring energy and MFCC
                 speech_score = (energy_score * 0.4 +
                               centroid_score * 0.2 +
                               mfcc_score * 0.3 +
                               zcr_score * 0.1)
-                print(f"📈 E-PANNs: energy_score={energy_score:.3f}, centroid_score={centroid_score:.3f}, mfcc_score={mfcc_score:.3f}, zcr_score={zcr_score:.3f}")
-                print(f"📈 E-PANNs: final_speech_score={speech_score:.4f}")
             else:
-                print("⚠️ E-PANNs: Using scipy fallback")
                 from scipy import signal
                 # Basic fallback without librosa
-                f, t, Sxx = signal.spectrogram(audio, 16000)  # Use original sample rate
                 energy = np.mean(10 * np.log10(Sxx + 1e-10))
-                # Simple energy-based detection as fallback
                 speech_score = np.clip((energy + 100) / 50, 0, 1)
-                print(f"📈 E-PANNs (fallback): energy={energy:.2f}, speech_score={speech_score:.4f}")
             probability = np.clip(speech_score, 0, 1)
-            is_speech = probability > 0.4  # Use model threshold
-            print(f"✅ E-PANNs: final_prob={probability:.4f}, is_speech={is_speech}")
             return VADResult(probability, is_speech, self.model_name, time.time() - start_time, timestamp)
@@ -346,17 +487,12 @@ class OptimizedPANNs:
     def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
         start_time = time.time()
-        print(f"🔍 PANNs predict: audio_len={len(audio)}, timestamp={timestamp:.2f}, model_available={self.model is not None}")
         if self.model is None or len(audio) == 0:
-            print(f"❌ PANNs: Model unavailable or empty audio")
             if len(audio) > 0:
                 energy = np.sum(audio ** 2)
                 threshold = 0.01
-                # More conservative energy scaling for fallback
-                probability = min(energy / (threshold * 100), 1.0)  # Divide by 100 to reduce sensitivity
                 is_speech = energy > threshold
-                print(f"🔄 PANNs fallback: energy={energy:.6f}, threshold={threshold}, prob={probability:.4f}")
             else:
                 probability = 0.0
                 is_speech = False
@@ -365,30 +501,42 @@ class OptimizedPANNs:
         try:
             if len(audio.shape) > 1:
                 audio = audio.mean(axis=1)
-                print(f"🔄 PANNs: Converted to mono")
             # Convert audio to PANNs sample rate
             if LIBROSA_AVAILABLE:
-                print(f"🔄 PANNs: Resampling from 16kHz to {self.sample_rate}Hz")
-                audio_resampled = librosa.resample(audio.astype(float),
                                                  orig_sr=16000,
                                                  target_sr=self.sample_rate)
-                print(f"✅ PANNs: Resampled, new_len={len(audio_resampled)}")
             else:
-                print(f"⚠️ PANNs: Using simple resampling fallback")
                 # Simple resampling fallback
                 resample_factor = self.sample_rate / 16000
                 audio_resampled = np.interp(
-                    np.linspace(0, len(audio) - 1, int(len(audio) * resample_factor)),
-                    np.arange(len(audio)),
-                    audio
                 )
             # For short audio, use intelligent padding strategy
             min_samples = 10 * self.sample_rate  # 10 seconds for optimal performance
             if len(audio_resampled) < min_samples:
-                print(f"⚠️ PANNs: Audio too short ({len(audio_resampled)} samples), using smart padding")
                 # Strategy: repeat the audio cyclically to maintain characteristics
                 num_repeats = int(np.ceil(min_samples / len(audio_resampled)))
                 audio_repeated = np.tile(audio_resampled, num_repeats)[:min_samples]
@@ -402,12 +550,9 @@ class OptimizedPANNs:
                 audio_repeated[-fade_len:] *= fade_out
                 audio_resampled = audio_repeated
-                print(f"✅ PANNs: Smart padded, final_len={len(audio_resampled)}")
-            print(f"🚀 PANNs: Running inference...")
-            # Fix: PANNs inference doesn't take input_sr parameter
             clip_probs, _ = self.model.inference(audio_resampled[np.newaxis, :])
-            print(f"✅ PANNs: Inference complete, output_shape={clip_probs.shape}")
             # Enhanced speech detection using multiple relevant labels
             speech_keywords = [
@@ -421,8 +566,6 @@ class OptimizedPANNs:
                 if any(word in lbl.lower() for word in speech_keywords):
                     speech_indices.append(i)
-            print(f"🔍 PANNs: Found {len(speech_indices)} speech-related labels")
             # Also get silence/noise indices for contrast
             noise_keywords = ['silence', 'white noise', 'pink noise']
             noise_indices = []
@@ -441,17 +584,13 @@ class OptimizedPANNs:
                     # Adjust speech probability based on noise
                     speech_prob = speech_prob * (1 - noise_prob * 0.5)
-                print(f"📈 PANNs: raw_speech_prob={speech_prob:.4f}")
                 # If using repeated audio, scale confidence based on original length
-                if len(audio) < 16000 * 2:  # Less than 2 seconds
-                    confidence_scale = len(audio) / (16000 * 2)
                     speech_prob = speech_prob * (0.5 + 0.5 * confidence_scale)
-                    print(f"🔧 PANNs: Scaled for short audio, final_prob={speech_prob:.4f}")
             else:
                 # Fallback if no speech indices found
-                print(f"⚠️ PANNs: No speech classes found, using top classes")
                 top_indices = np.argsort(clip_probs[0])[-10:]
                 speech_prob = np.mean(clip_probs[0, top_indices])
@@ -464,10 +603,8 @@ class OptimizedPANNs:
             if len(audio) > 0:
                 energy = np.sum(audio ** 2)
                 threshold = 0.01
-                # More conservative energy scaling for error fallback
-                probability = min(energy / (threshold * 100), 1.0)  # Divide by 100 to reduce sensitivity
                 is_speech = energy > threshold
-                print(f"🔄 PANNs error fallback: energy={energy:.6f}, threshold={threshold}, prob={probability:.4f}")
             else:
                 probability = 0.0
                 is_speech = False
@@ -738,9 +875,9 @@ class AudioProcessor:
         self.model_hop_sizes = {
             "Silero-VAD": 0.016,  # 16ms hop for Silero (512 samples window)
             "WebRTC-VAD": 0.03,   # 30ms hop for WebRTC (match frame duration)
-            "E-PANNs": 1.0,       # Process every 1s but with 6s window
-            "PANNs": 2.0,         # Process every 2s but with 10s window
-            "AST": 1.0            # Process every 1s but with 6.4s window
         }
         # Model-specific thresholds for better detection
@@ -1250,34 +1387,46 @@ class VADDemo:
                     model_results = []
-                    # Critical fix: Always process at least once, even if audio is shorter than window
                     if len(processed_audio) < window_samples:
-                        debug_info.append(f"  ⚠️ Audio too short ({len(processed_audio)} < {window_samples}), processing multiple times with overlap")
-                        # Generate multiple timestamps for visualization even with short audio
-                        num_points = max(3, int(len(processed_audio) / self.processor.sample_rate))  # At least 3 points
-                        for point_idx in range(num_points):
-                            timestamp = (point_idx / (num_points - 1)) * (len(processed_audio) / self.processor.sample_rate) if num_points > 1 else 0.0
-                            chunk = processed_audio  # Use full audio for each point
-                            debug_info.append(f"  🔄 Processing point {point_idx} at t={timestamp:.2f}s, size={len(chunk)}")
                             # Special handling for different models
                             if model_name == 'AST':
-                                result = self.models[model_name].predict(chunk, timestamp, full_audio=processed_audio)
                             else:
-                                result = self.models[model_name].predict(chunk, timestamp)
-                            # Update timestamp to spread points
-                            result.timestamp = timestamp
-                            debug_info.append(f"  📈 Point {point_idx}: prob={result.probability:.4f}, speech={result.is_speech}")
                             # Use model-specific threshold
                             result.is_speech = result.probability > model_threshold
                             vad_results.append(result)
                             model_results.append(result)
                     else:
                         # Audio is long enough - process in sliding windows
                         debug_info.append(f"  ✅ Audio long enough, processing in windows")

+def predict(self, audio: np.ndarray, timestamp: float = 0.0, full_audio: np.ndarray = None) -> VADResult:
+        start_time = time.time()
+        if self.model is None or len(audio) == 0:
+            # Enhanced fallback using spectral features
+            if len(audio) > 0:
+                energy = np.sum(audio ** 2)
+                if LIBROSA_AVAILABLE:
+                    spectral_features = librosa.feature.spectral_rolloff(y=audio, sr=self.sample_rate)
+                    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=self.sample_rate))
+                    probability = min((energy * 100 + spectral_centroid / 1000) / 2, 1.0)
+                else:
+                    probability = min(energy * 50, 1.0)
+                is_speech = probability > 0.25
+            else:
+                probability = 0.0
+                is_speech = False
+            return VADResult(probability, is_speech, f"{self.model_name} (fallback)", time.time() - start_time, timestamp)
+        try:
+            # Cache key based on timestamp rounded to cache window
+            cache_key = int(timestamp / self.cache_window)
+            # Check cache first
+            if cache_key in self.prediction_cache:
+                cached_result = self.prediction_cache[cache_key]
+                # Return cached result with updated timestamp
+                return VADResult(
+                    cached_result.probability,
+                    cached_result.is_speech,
+                    cached_result.model_name + " (cached)",
+                    time.time() - start_time,
+                    timestamp
+                )
+            if len(audio.shape) > 1:
+                audio = audio.mean(axis=1)
+            # Use longer context for AST - preferably 6.4 seconds (1024 frames)
+            window_duration = 6.4  # seconds
+            window_samples = int(window_duration * self.sample_rate)
+            # If full_audio is provided, use it for better context
+            if full_audio is not None and len(full_audio) > window_samples:
+                # Take window centered around current timestamp
+                center_pos = int(timestamp * self.sample_rate)
+                half_window = window_samples // 2
+                start_pos = max(0, center_pos - half_window)
+                end_pos = min(len(full_audio), start_pos + window_samples)
+                # Adjust if at the end of audio
+                if end_pos == len(full_audio) and end_pos - start_pos < window_samples:
+                    start_pos = max(0, end_pos - window_samples)
+                audio_for_ast = full_audio[start_pos:end_pos]
+            else:
+                # Extract window from provided audio based on timestamp
+                center_sample = int(timestamp * self.sample_rate)
+                half_window = window_samples // 2
+                start_idx = max(0, center_sample - half_window)
+                end_idx = min(len(audio), start_idx + window_samples)
+                # Adjust if at the end
+                if end_idx == len(audio) and end_idx - start_idx < window_samples:
+                    start_idx = max(0, end_idx - window_samples)
+                audio_for_ast = audio[start_idx:end_idx]
+            # For short audio, use intelligent strategy
+            min_samples = int(6.4 * self.sample_rate)  # 6.4 seconds
+            if len(audio_for_ast) < min_samples:
+                # Repeat the audio cyclically to maintain temporal patterns
+                num_repeats = int(np.ceil(min_samples / len(audio_for_ast)))
+                audio_repeated = np.tile(audio_for_ast, num_repeats)[:min_samples]
+                # Apply smooth transitions at repetition boundaries
+                fade_samples = int(0.01 * self.sample_rate)  # 10ms fade
+                for i in range(1, num_repeats):
+                    if i * len(audio_for_ast) < len(audio_repeated):
+                        start_idx = i * len(audio_for_ast) - fade_samples
+                        end_idx = i * len(audio_for_ast) + fade_samples
+                        if start_idx >= 0 and end_idx < len(audio_repeated):
+                            audio_repeated[start_idx:end_idx] *= np.linspace(1, 1, 2 * fade_samples)
+                audio_for_ast = audio_repeated
+            # Truncate if too long
+            max_samples = 8 * self.sample_rate
+            if len(audio_for_ast) > max_samples:
+                audio_for_ast = audio_for_ast[:max_samples]
+            # Feature extraction
+            inputs = self.feature_extractor(
+                audio_for_ast,
+                sampling_rate=self.sample_rate,
+                return_tensors="pt",
+                max_length=1024,
+                padding="max_length",
+                truncation=True
+            )
+            # Move inputs to correct device and dtype
+            inputs = {k: v.to(self.device) for k, v in inputs.items()}
+            if self.device.type == 'cuda' and hasattr(self.model, 'half'):
+                inputs = {k: v.half() if v.dtype == torch.float32 else v for k, v in inputs.items()}
+            with torch.no_grad():
+                outputs = self.model(**inputs)
+                logits = outputs.logits
+                probs = torch.sigmoid(logits)
+            # Find speech-related classes
+            label2id = self.model.config.label2id
+            speech_indices = []
+            speech_keywords = [
+                'speech', 'voice', 'talk', 'conversation', 'speaking',
+                'male speech', 'female speech', 'child speech',
+                'speech synthesizer', 'narration'
+            ]
+            for lbl, idx in label2id.items():
+                if any(word in lbl.lower() for word in speech_keywords):
+                    speech_indices.append(idx)
+            # Also identify background/noise classes
+            noise_keywords = ['silence', 'white noise', 'background']
+            noise_indices = []
+            for lbl, idx in label2id.items():
+                if any(word in lbl.lower() for word in noise_keywords):
+                    noise_indices.append(idx)
+            if speech_indices:
+                # Use max probability among speech classes
+                speech_probs = probs[0, speech_indices]
+                speech_prob = torch.max(speech_probs).item()
+                # Consider noise/silence probability
+                if noise_indices:
+                    noise_prob = torch.mean(probs[0, noise_indices]).item()
+                    speech_prob = speech_prob * (1 - noise_prob * 0.3)
+                # Adjust confidence for short audio
+                if len(audio) < self.sample_rate * 2:
+                    confidence_factor = len(audio) / (self.sample_rate * 2)
+                    speech_prob = speech_prob * (0.6 + 0.4 *import gradio as gr
 import numpy as np
 import torch
 import time
         start_time = time.time()
         try:
             if len(audio) == 0:
                 return VADResult(0.0, False, self.model_name, time.time() - start_time, timestamp)
             if len(audio.shape) > 1:
                 audio = audio.mean(axis=1)
+            # For E-PANNs, we need to extract the appropriate window based on timestamp
+            window_duration = 6.0  # 6 seconds window for E-PANNs
+            window_samples = int(window_duration * 16000)  # at 16kHz input rate
+            # Calculate the center position for this timestamp
+            center_sample = int(timestamp * 16000)
+            half_window = window_samples // 2
+            # Extract window centered at timestamp
+            start_idx = max(0, center_sample - half_window)
+            end_idx = min(len(audio), start_idx + window_samples)
+            # Adjust start if we're at the end of audio
+            if end_idx == len(audio) and end_idx - start_idx < window_samples:
+                start_idx = max(0, end_idx - window_samples)
+            audio_window = audio[start_idx:end_idx]
             # Convert audio to target sample rate for E-PANNs
             if LIBROSA_AVAILABLE:
+                # Resample to E-PANNs sample rate
+                audio_resampled = librosa.resample(audio_window.astype(float),
                                                  orig_sr=16000,
                                                  target_sr=self.sample_rate)
                 # For short audio, repeat it instead of padding with zeros
                 min_samples = 6 * self.sample_rate  # 6 seconds
                 if len(audio_resampled) < min_samples:
                     # Repeat the audio to fill the minimum required length
                     num_repeats = int(np.ceil(min_samples / len(audio_resampled)))
                     audio_resampled = np.tile(audio_resampled, num_repeats)[:min_samples]
+                # Compute features
+                mel_spec = librosa.feature.melspectrogram(y=audio_resampled, sr=self.sample_rate, n_mels=64)
+                energy = np.mean(librosa.power_to_db(mel_spec, ref=np.max))
+                # Use actual non-repeated audio for some features
+                actual_audio_len = min(len(audio_resampled), int(len(audio_window) * self.sample_rate / 16000))
                 actual_audio = audio_resampled[:actual_audio_len]
                 spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=actual_audio, sr=self.sample_rate))
                 mfcc = librosa.feature.mfcc(y=actual_audio, sr=self.sample_rate, n_mfcc=13)
                 mfcc_var = np.var(mfcc, axis=1).mean()
                 zcr = np.mean(librosa.feature.zero_crossing_rate(actual_audio))
                 # Adjusted scaling for better speech detection
+                energy_score = np.clip((energy + 80) / 40, 0, 1)
+                centroid_score = np.clip((spectral_centroid - 200) / 3000, 0, 1)
+                mfcc_score = np.clip(mfcc_var / 100, 0, 1)
+                zcr_score = np.clip(zcr * 10, 0, 1)
+                # Weighted combination
                 speech_score = (energy_score * 0.4 +
                               centroid_score * 0.2 +
                               mfcc_score * 0.3 +
                               zcr_score * 0.1)
             else:
                 from scipy import signal
                 # Basic fallback without librosa
+                f, t, Sxx = signal.spectrogram(audio_window, 16000)
                 energy = np.mean(10 * np.log10(Sxx + 1e-10))
                 speech_score = np.clip((energy + 100) / 50, 0, 1)
             probability = np.clip(speech_score, 0, 1)
+            is_speech = probability > 0.4
             return VADResult(probability, is_speech, self.model_name, time.time() - start_time, timestamp)
     def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
         start_time = time.time()
         if self.model is None or len(audio) == 0:
             if len(audio) > 0:
                 energy = np.sum(audio ** 2)
                 threshold = 0.01
+                probability = min(energy / (threshold * 100), 1.0)
                 is_speech = energy > threshold
             else:
                 probability = 0.0
                 is_speech = False
         try:
             if len(audio.shape) > 1:
                 audio = audio.mean(axis=1)
+            # For PANNs, extract the appropriate window based on timestamp
+            window_duration = 10.0  # 10 seconds window for PANNs
+            window_samples = int(window_duration * 16000)  # at 16kHz input rate
+            # Calculate the center position for this timestamp
+            center_sample = int(timestamp * 16000)
+            half_window = window_samples // 2
+            # Extract window centered at timestamp
+            start_idx = max(0, center_sample - half_window)
+            end_idx = min(len(audio), start_idx + window_samples)
+            # Adjust start if we're at the end of audio
+            if end_idx == len(audio) and end_idx - start_idx < window_samples:
+                start_idx = max(0, end_idx - window_samples)
+            audio_window = audio[start_idx:end_idx]
             # Convert audio to PANNs sample rate
             if LIBROSA_AVAILABLE:
+                audio_resampled = librosa.resample(audio_window.astype(float),
                                                  orig_sr=16000,
                                                  target_sr=self.sample_rate)
             else:
                 # Simple resampling fallback
                 resample_factor = self.sample_rate / 16000
                 audio_resampled = np.interp(
+                    np.linspace(0, len(audio_window) - 1, int(len(audio_window) * resample_factor)),
+                    np.arange(len(audio_window)),
+                    audio_window
                 )
             # For short audio, use intelligent padding strategy
             min_samples = 10 * self.sample_rate  # 10 seconds for optimal performance
             if len(audio_resampled) < min_samples:
                 # Strategy: repeat the audio cyclically to maintain characteristics
                 num_repeats = int(np.ceil(min_samples / len(audio_resampled)))
                 audio_repeated = np.tile(audio_resampled, num_repeats)[:min_samples]
                 audio_repeated[-fade_len:] *= fade_out
                 audio_resampled = audio_repeated
+            # Run inference
             clip_probs, _ = self.model.inference(audio_resampled[np.newaxis, :])
             # Enhanced speech detection using multiple relevant labels
             speech_keywords = [
                 if any(word in lbl.lower() for word in speech_keywords):
                     speech_indices.append(i)
             # Also get silence/noise indices for contrast
             noise_keywords = ['silence', 'white noise', 'pink noise']
             noise_indices = []
                     # Adjust speech probability based on noise
                     speech_prob = speech_prob * (1 - noise_prob * 0.5)
                 # If using repeated audio, scale confidence based on original length
+                if len(audio_window) < 16000 * 2:  # Less than 2 seconds
+                    confidence_scale = len(audio_window) / (16000 * 2)
                     speech_prob = speech_prob * (0.5 + 0.5 * confidence_scale)
             else:
                 # Fallback if no speech indices found
                 top_indices = np.argsort(clip_probs[0])[-10:]
                 speech_prob = np.mean(clip_probs[0, top_indices])
             if len(audio) > 0:
                 energy = np.sum(audio ** 2)
                 threshold = 0.01
+                probability = min(energy / (threshold * 100), 1.0)
                 is_speech = energy > threshold
             else:
                 probability = 0.0
                 is_speech = False
         self.model_hop_sizes = {
             "Silero-VAD": 0.016,  # 16ms hop for Silero (512 samples window)
             "WebRTC-VAD": 0.03,   # 30ms hop for WebRTC (match frame duration)
+            "E-PANNs": 0.1,       # 100ms hop for 10 predictions/second
+            "PANNs": 0.1,         # 100ms hop for 10 predictions/second
+            "AST": 0.1            # 100ms hop for 10 predictions/second
         }
         # Model-specific thresholds for better detection
                     model_results = []
+                    # Always use sliding window approach for consistent temporal resolution
                     if len(processed_audio) < window_samples:
+                        debug_info.append(f"  ⚠️ Audio shorter than window ({len(processed_audio)} < {window_samples}), using sliding window with padding")
+                        # For short audio, still use sliding window but with the actual audio length
+                        # This ensures we get the desired temporal resolution (10 predictions/second)
+                        window_count = 0
+                        audio_duration = len(processed_audio) / self.processor.sample_rate
+                        # Calculate number of windows based on hop size
+                        num_windows = max(1, int((audio_duration - window_size) / hop_size) + 1) if audio_duration > window_size else max(1, int(audio_duration / hop_size))
+                        for i in range(0, len(processed_audio), hop_samples):
+                            timestamp = i / self.processor.sample_rate
+                            # For models that need long context, we'll use the full audio padded/repeated as needed
+                            # but report the timestamp based on the sliding window position
+                            if window_count < 3:  # Log first 3 windows
+                                debug_info.append(f"  🔄 Window {window_count}: t={timestamp:.2f}s")
                             # Special handling for different models
                             if model_name == 'AST':
+                                result = self.models[model_name].predict(processed_audio, timestamp, full_audio=processed_audio)
                             else:
+                                result = self.models[model_name].predict(processed_audio, timestamp)
+                            if window_count < 3:  # Log first 3 results
+                                debug_info.append(f"  📈 Result {window_count}: prob={result.probability:.4f}, speech={result.is_speech}")
                             # Use model-specific threshold
                             result.is_speech = result.probability > model_threshold
                             vad_results.append(result)
                             model_results.append(result)
+                            window_count += 1
+                            # Stop if we've gone past the audio length
+                            if timestamp >= audio_duration:
+                                break
+                        debug_info.append(f"  🎯 Total windows processed: {window_count}")
                     else:
                         # Audio is long enough - process in sliding windows
                         debug_info.append(f"  ✅ Audio long enough, processing in windows")