Gabriel Bibbó committed
Commit 11719c2 · 1 Parent(s): d2d5f15

Hotfix: Restore basic functionality - fix AST saturation and PANNs execution
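The change moves the long-context models (E-PANNs, PANNs, AST) from scoring whole clips to scoring timestamp-centred windows (6 s, 10 s and 6.4 s respectively) on a 0.1 s hop, and tiles short clips cyclically instead of zero-padding them, which is the part of the change aimed at the AST saturation named in the title. A minimal sketch of that windowing step, assuming 16 kHz input; the helper name and signature below are illustrative, not code from app.py:

import numpy as np

def centered_window(audio: np.ndarray, timestamp: float,
                    window_s: float = 6.0, sr: int = 16000) -> np.ndarray:
    # Illustrative helper: take window_s seconds centred on `timestamp`,
    # clamped to the clip boundaries, the way the updated predict() methods
    # do before resampling and feature extraction.
    window = int(window_s * sr)
    center = int(timestamp * sr)
    start = max(0, center - window // 2)
    end = min(len(audio), start + window)
    if end == len(audio) and end - start < window:  # window runs off the end
        start = max(0, end - window)
    chunk = audio[start:end]
    # Short clips are tiled cyclically rather than zero-padded so the model
    # still sees a full-length, realistic input.
    if 0 < len(chunk) < window:
        reps = int(np.ceil(window / len(chunk)))
        chunk = np.tile(chunk, reps)[:window]
    return chunk

With the hop sizes in AudioProcessor dropped to 0.1 s for these models, this yields roughly ten predictions per second.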

Files changed (1): app.py +235 -86
app.py CHANGED
@@ -1,4 +1,150 @@
1
- import gradio as gr
2
  import numpy as np
3
  import torch
4
  import time
@@ -238,82 +384,77 @@ class OptimizedEPANNs:
238
  start_time = time.time()
239
 
240
  try:
241
- print(f"🔍 E-PANNs predict: audio_len={len(audio)}, timestamp={timestamp:.2f}")
242
-
243
  if len(audio) == 0:
244
- print("❌ E-PANNs: Empty audio")
245
  return VADResult(0.0, False, self.model_name, time.time() - start_time, timestamp)
246
 
247
  if len(audio.shape) > 1:
248
  audio = audio.mean(axis=1)
249
- print(f"🔄 E-PANNs: Converted to mono, new_len={len(audio)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
 
251
  # Convert audio to target sample rate for E-PANNs
252
  if LIBROSA_AVAILABLE:
253
- print(f"🔄 E-PANNs: Resampling from 16kHz to {self.sample_rate}Hz")
254
- # Resample to E-PANNs sample rate if needed
255
- audio_resampled = librosa.resample(audio.astype(float),
256
  orig_sr=16000,
257
  target_sr=self.sample_rate)
258
- print(f"✅ E-PANNs: Resampled, new_len={len(audio_resampled)}")
259
 
260
  # For short audio, repeat it instead of padding with zeros
261
  min_samples = 6 * self.sample_rate # 6 seconds
262
  if len(audio_resampled) < min_samples:
263
- print(f"⚠️ E-PANNs: Repeating audio from {len(audio_resampled)} to {min_samples} samples")
264
  # Repeat the audio to fill the minimum required length
265
  num_repeats = int(np.ceil(min_samples / len(audio_resampled)))
266
  audio_resampled = np.tile(audio_resampled, num_repeats)[:min_samples]
267
- print(f"✅ E-PANNs: Repeated, final_len={len(audio_resampled)}")
268
 
269
- print(f"🔄 E-PANNs: Computing features...")
 
 
270
 
271
- # Compute features on the actual audio portion (not the repeated part)
272
- actual_audio_len = min(len(audio_resampled), int(len(audio) * self.sample_rate / 16000))
273
  actual_audio = audio_resampled[:actual_audio_len]
274
 
275
- mel_spec = librosa.feature.melspectrogram(y=audio_resampled, sr=self.sample_rate, n_mels=64)
276
- energy = np.mean(librosa.power_to_db(mel_spec, ref=np.max))
277
  spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=actual_audio, sr=self.sample_rate))
278
-
279
- # Better speech detection using multiple features
280
  mfcc = librosa.feature.mfcc(y=actual_audio, sr=self.sample_rate, n_mfcc=13)
281
  mfcc_var = np.var(mfcc, axis=1).mean()
282
-
283
- # Zero crossing rate - important for speech detection
284
  zcr = np.mean(librosa.feature.zero_crossing_rate(actual_audio))
285
 
286
- print(f"📊 E-PANNs: energy={energy:.2f}, centroid={spectral_centroid:.1f}, mfcc_var={mfcc_var:.4f}, zcr={zcr:.4f}")
287
-
288
  # Adjusted scaling for better speech detection
289
- energy_score = np.clip((energy + 80) / 40, 0, 1) # More sensitive to energy
290
- centroid_score = np.clip((spectral_centroid - 200) / 3000, 0, 1) # Better range for speech
291
- mfcc_score = np.clip(mfcc_var / 100, 0, 1) # Adjusted MFCC scaling
292
- zcr_score = np.clip(zcr * 10, 0, 1) # ZCR is typically 0.01-0.1 for speech
293
 
294
- # Weighted combination favoring energy and MFCC
295
  speech_score = (energy_score * 0.4 +
296
  centroid_score * 0.2 +
297
  mfcc_score * 0.3 +
298
  zcr_score * 0.1)
299
-
300
- print(f"📈 E-PANNs: energy_score={energy_score:.3f}, centroid_score={centroid_score:.3f}, mfcc_score={mfcc_score:.3f}, zcr_score={zcr_score:.3f}")
301
- print(f"📈 E-PANNs: final_speech_score={speech_score:.4f}")
302
  else:
303
- print("⚠️ E-PANNs: Using scipy fallback")
304
  from scipy import signal
305
  # Basic fallback without librosa
306
- f, t, Sxx = signal.spectrogram(audio, 16000) # Use original sample rate
307
  energy = np.mean(10 * np.log10(Sxx + 1e-10))
308
-
309
- # Simple energy-based detection as fallback
310
  speech_score = np.clip((energy + 100) / 50, 0, 1)
311
- print(f"📈 E-PANNs (fallback): energy={energy:.2f}, speech_score={speech_score:.4f}")
312
 
313
  probability = np.clip(speech_score, 0, 1)
314
- is_speech = probability > 0.4 # Use model threshold
315
-
316
- print(f"✅ E-PANNs: final_prob={probability:.4f}, is_speech={is_speech}")
317
 
318
  return VADResult(probability, is_speech, self.model_name, time.time() - start_time, timestamp)
319
 
@@ -346,17 +487,12 @@ class OptimizedPANNs:
346
  def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
347
  start_time = time.time()
348
 
349
- print(f"🔍 PANNs predict: audio_len={len(audio)}, timestamp={timestamp:.2f}, model_available={self.model is not None}")
350
-
351
  if self.model is None or len(audio) == 0:
352
- print(f"❌ PANNs: Model unavailable or empty audio")
353
  if len(audio) > 0:
354
  energy = np.sum(audio ** 2)
355
  threshold = 0.01
356
- # More conservative energy scaling for fallback
357
- probability = min(energy / (threshold * 100), 1.0) # Divide by 100 to reduce sensitivity
358
  is_speech = energy > threshold
359
- print(f"🔄 PANNs fallback: energy={energy:.6f}, threshold={threshold}, prob={probability:.4f}")
360
  else:
361
  probability = 0.0
362
  is_speech = False
@@ -365,30 +501,42 @@ class OptimizedPANNs:
365
  try:
366
  if len(audio.shape) > 1:
367
  audio = audio.mean(axis=1)
368
- print(f"🔄 PANNs: Converted to mono")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
369
 
370
  # Convert audio to PANNs sample rate
371
  if LIBROSA_AVAILABLE:
372
- print(f"🔄 PANNs: Resampling from 16kHz to {self.sample_rate}Hz")
373
- audio_resampled = librosa.resample(audio.astype(float),
374
  orig_sr=16000,
375
  target_sr=self.sample_rate)
376
- print(f"✅ PANNs: Resampled, new_len={len(audio_resampled)}")
377
  else:
378
- print(f"⚠️ PANNs: Using simple resampling fallback")
379
  # Simple resampling fallback
380
  resample_factor = self.sample_rate / 16000
381
  audio_resampled = np.interp(
382
- np.linspace(0, len(audio) - 1, int(len(audio) * resample_factor)),
383
- np.arange(len(audio)),
384
- audio
385
  )
386
 
387
  # For short audio, use intelligent padding strategy
388
  min_samples = 10 * self.sample_rate # 10 seconds for optimal performance
389
  if len(audio_resampled) < min_samples:
390
- print(f"⚠️ PANNs: Audio too short ({len(audio_resampled)} samples), using smart padding")
391
-
392
  # Strategy: repeat the audio cyclically to maintain characteristics
393
  num_repeats = int(np.ceil(min_samples / len(audio_resampled)))
394
  audio_repeated = np.tile(audio_resampled, num_repeats)[:min_samples]
@@ -402,12 +550,9 @@ class OptimizedPANNs:
402
  audio_repeated[-fade_len:] *= fade_out
403
 
404
  audio_resampled = audio_repeated
405
- print(f"✅ PANNs: Smart padded, final_len={len(audio_resampled)}")
406
 
407
- print(f"🚀 PANNs: Running inference...")
408
- # Fix: PANNs inference doesn't take input_sr parameter
409
  clip_probs, _ = self.model.inference(audio_resampled[np.newaxis, :])
410
- print(f"✅ PANNs: Inference complete, output_shape={clip_probs.shape}")
411
 
412
  # Enhanced speech detection using multiple relevant labels
413
  speech_keywords = [
@@ -421,8 +566,6 @@ class OptimizedPANNs:
421
  if any(word in lbl.lower() for word in speech_keywords):
422
  speech_indices.append(i)
423
 
424
- print(f"🔍 PANNs: Found {len(speech_indices)} speech-related labels")
425
-
426
  # Also get silence/noise indices for contrast
427
  noise_keywords = ['silence', 'white noise', 'pink noise']
428
  noise_indices = []
@@ -441,17 +584,13 @@ class OptimizedPANNs:
441
  # Adjust speech probability based on noise
442
  speech_prob = speech_prob * (1 - noise_prob * 0.5)
443
 
444
- print(f"📈 PANNs: raw_speech_prob={speech_prob:.4f}")
445
-
446
  # If using repeated audio, scale confidence based on original length
447
- if len(audio) < 16000 * 2: # Less than 2 seconds
448
- confidence_scale = len(audio) / (16000 * 2)
449
  speech_prob = speech_prob * (0.5 + 0.5 * confidence_scale)
450
- print(f"🔧 PANNs: Scaled for short audio, final_prob={speech_prob:.4f}")
451
 
452
  else:
453
  # Fallback if no speech indices found
454
- print(f"⚠️ PANNs: No speech classes found, using top classes")
455
  top_indices = np.argsort(clip_probs[0])[-10:]
456
  speech_prob = np.mean(clip_probs[0, top_indices])
457
 
@@ -464,10 +603,8 @@ class OptimizedPANNs:
464
  if len(audio) > 0:
465
  energy = np.sum(audio ** 2)
466
  threshold = 0.01
467
- # More conservative energy scaling for error fallback
468
- probability = min(energy / (threshold * 100), 1.0) # Divide by 100 to reduce sensitivity
469
  is_speech = energy > threshold
470
- print(f"🔄 PANNs error fallback: energy={energy:.6f}, threshold={threshold}, prob={probability:.4f}")
471
  else:
472
  probability = 0.0
473
  is_speech = False
@@ -738,9 +875,9 @@ class AudioProcessor:
738
  self.model_hop_sizes = {
739
  "Silero-VAD": 0.016, # 16ms hop for Silero (512 samples window)
740
  "WebRTC-VAD": 0.03, # 30ms hop for WebRTC (match frame duration)
741
- "E-PANNs": 1.0, # Process every 1s but with 6s window
742
- "PANNs": 2.0, # Process every 2s but with 10s window
743
- "AST": 1.0 # Process every 1s but with 6.4s window
744
  }
745
 
746
  # Model-specific thresholds for better detection
@@ -1250,34 +1387,46 @@ class VADDemo:
1250
 
1251
  model_results = []
1252
 
1253
- # Critical fix: Always process at least once, even if audio is shorter than window
1254
  if len(processed_audio) < window_samples:
1255
- debug_info.append(f" ⚠️ Audio too short ({len(processed_audio)} < {window_samples}), processing multiple times with overlap")
 
 
 
 
 
1256
 
1257
- # Generate multiple timestamps for visualization even with short audio
1258
- num_points = max(3, int(len(processed_audio) / self.processor.sample_rate)) # At least 3 points
1259
 
1260
- for point_idx in range(num_points):
1261
- timestamp = (point_idx / (num_points - 1)) * (len(processed_audio) / self.processor.sample_rate) if num_points > 1 else 0.0
1262
- chunk = processed_audio # Use full audio for each point
1263
 
1264
- debug_info.append(f" 🔄 Processing point {point_idx} at t={timestamp:.2f}s, size={len(chunk)}")
 
 
 
1265
 
1266
  # Special handling for different models
1267
  if model_name == 'AST':
1268
- result = self.models[model_name].predict(chunk, timestamp, full_audio=processed_audio)
1269
  else:
1270
- result = self.models[model_name].predict(chunk, timestamp)
1271
-
1272
- # Update timestamp to spread points
1273
- result.timestamp = timestamp
1274
 
1275
- debug_info.append(f" 📈 Point {point_idx}: prob={result.probability:.4f}, speech={result.is_speech}")
 
1276
 
1277
  # Use model-specific threshold
1278
  result.is_speech = result.probability > model_threshold
1279
  vad_results.append(result)
1280
  model_results.append(result)
1281
  else:
1282
  # Audio is long enough - process in sliding windows
1283
  debug_info.append(f" ✅ Audio long enough, processing in windows")
 
1
+ def predict(self, audio: np.ndarray, timestamp: float = 0.0, full_audio: np.ndarray = None) -> VADResult:
2
+ start_time = time.time()
3
+
4
+ if self.model is None or len(audio) == 0:
5
+ # Enhanced fallback using spectral features
6
+ if len(audio) > 0:
7
+ energy = np.sum(audio ** 2)
8
+ if LIBROSA_AVAILABLE:
9
+ spectral_features = librosa.feature.spectral_rolloff(y=audio, sr=self.sample_rate)
10
+ spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=self.sample_rate))
11
+ probability = min((energy * 100 + spectral_centroid / 1000) / 2, 1.0)
12
+ else:
13
+ probability = min(energy * 50, 1.0)
14
+ is_speech = probability > 0.25
15
+ else:
16
+ probability = 0.0
17
+ is_speech = False
18
+ return VADResult(probability, is_speech, f"{self.model_name} (fallback)", time.time() - start_time, timestamp)
19
+
20
+ try:
21
+ # Cache key based on timestamp rounded to cache window
22
+ cache_key = int(timestamp / self.cache_window)
23
+
24
+ # Check cache first
25
+ if cache_key in self.prediction_cache:
26
+ cached_result = self.prediction_cache[cache_key]
27
+ # Return cached result with updated timestamp
28
+ return VADResult(
29
+ cached_result.probability,
30
+ cached_result.is_speech,
31
+ cached_result.model_name + " (cached)",
32
+ time.time() - start_time,
33
+ timestamp
34
+ )
35
+
36
+ if len(audio.shape) > 1:
37
+ audio = audio.mean(axis=1)
38
+
39
+ # Use longer context for AST - preferably 6.4 seconds (1024 frames)
40
+ window_duration = 6.4 # seconds
41
+ window_samples = int(window_duration * self.sample_rate)
42
+
43
+ # If full_audio is provided, use it for better context
44
+ if full_audio is not None and len(full_audio) > window_samples:
45
+ # Take window centered around current timestamp
46
+ center_pos = int(timestamp * self.sample_rate)
47
+ half_window = window_samples // 2
48
+
49
+ start_pos = max(0, center_pos - half_window)
50
+ end_pos = min(len(full_audio), start_pos + window_samples)
51
+
52
+ # Adjust if at the end of audio
53
+ if end_pos == len(full_audio) and end_pos - start_pos < window_samples:
54
+ start_pos = max(0, end_pos - window_samples)
55
+
56
+ audio_for_ast = full_audio[start_pos:end_pos]
57
+ else:
58
+ # Extract window from provided audio based on timestamp
59
+ center_sample = int(timestamp * self.sample_rate)
60
+ half_window = window_samples // 2
61
+
62
+ start_idx = max(0, center_sample - half_window)
63
+ end_idx = min(len(audio), start_idx + window_samples)
64
+
65
+ # Adjust if at the end
66
+ if end_idx == len(audio) and end_idx - start_idx < window_samples:
67
+ start_idx = max(0, end_idx - window_samples)
68
+
69
+ audio_for_ast = audio[start_idx:end_idx]
70
+
71
+ # For short audio, use intelligent strategy
72
+ min_samples = int(6.4 * self.sample_rate) # 6.4 seconds
73
+ if len(audio_for_ast) < min_samples:
74
+ # Repeat the audio cyclically to maintain temporal patterns
75
+ num_repeats = int(np.ceil(min_samples / len(audio_for_ast)))
76
+ audio_repeated = np.tile(audio_for_ast, num_repeats)[:min_samples]
77
+
78
+ # Apply smooth transitions at repetition boundaries
79
+ fade_samples = int(0.01 * self.sample_rate) # 10ms fade
80
+ for i in range(1, num_repeats):
81
+ if i * len(audio_for_ast) < len(audio_repeated):
82
+ start_idx = i * len(audio_for_ast) - fade_samples
83
+ end_idx = i * len(audio_for_ast) + fade_samples
84
+ if start_idx >= 0 and end_idx < len(audio_repeated):
85
+ audio_repeated[start_idx:end_idx] *= np.linspace(1, 1, 2 * fade_samples)
86
+
87
+ audio_for_ast = audio_repeated
88
+
89
+ # Truncate if too long
90
+ max_samples = 8 * self.sample_rate
91
+ if len(audio_for_ast) > max_samples:
92
+ audio_for_ast = audio_for_ast[:max_samples]
93
+
94
+ # Feature extraction
95
+ inputs = self.feature_extractor(
96
+ audio_for_ast,
97
+ sampling_rate=self.sample_rate,
98
+ return_tensors="pt",
99
+ max_length=1024,
100
+ padding="max_length",
101
+ truncation=True
102
+ )
103
+
104
+ # Move inputs to correct device and dtype
105
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
106
+ if self.device.type == 'cuda' and hasattr(self.model, 'half'):
107
+ inputs = {k: v.half() if v.dtype == torch.float32 else v for k, v in inputs.items()}
108
+
109
+ with torch.no_grad():
110
+ outputs = self.model(**inputs)
111
+ logits = outputs.logits
112
+ probs = torch.sigmoid(logits)
113
+
114
+ # Find speech-related classes
115
+ label2id = self.model.config.label2id
116
+ speech_indices = []
117
+ speech_keywords = [
118
+ 'speech', 'voice', 'talk', 'conversation', 'speaking',
119
+ 'male speech', 'female speech', 'child speech',
120
+ 'speech synthesizer', 'narration'
121
+ ]
122
+
123
+ for lbl, idx in label2id.items():
124
+ if any(word in lbl.lower() for word in speech_keywords):
125
+ speech_indices.append(idx)
126
+
127
+ # Also identify background/noise classes
128
+ noise_keywords = ['silence', 'white noise', 'background']
129
+ noise_indices = []
130
+ for lbl, idx in label2id.items():
131
+ if any(word in lbl.lower() for word in noise_keywords):
132
+ noise_indices.append(idx)
133
+
134
+ if speech_indices:
135
+ # Use max probability among speech classes
136
+ speech_probs = probs[0, speech_indices]
137
+ speech_prob = torch.max(speech_probs).item()
138
+
139
+ # Consider noise/silence probability
140
+ if noise_indices:
141
+ noise_prob = torch.mean(probs[0, noise_indices]).item()
142
+ speech_prob = speech_prob * (1 - noise_prob * 0.3)
143
+
144
+ # Adjust confidence for short audio
145
+ if len(audio) < self.sample_rate * 2:
146
+ confidence_factor = len(audio) / (self.sample_rate * 2)
147
+ speech_prob = speech_prob * (0.6 + 0.4 * confidence_factor)
148
  import numpy as np
149
  import torch
150
  import time
 
384
  start_time = time.time()
385
 
386
  try:
 
 
387
  if len(audio) == 0:
 
388
  return VADResult(0.0, False, self.model_name, time.time() - start_time, timestamp)
389
 
390
  if len(audio.shape) > 1:
391
  audio = audio.mean(axis=1)
392
+
393
+ # For E-PANNs, we need to extract the appropriate window based on timestamp
394
+ window_duration = 6.0 # 6 seconds window for E-PANNs
395
+ window_samples = int(window_duration * 16000) # at 16kHz input rate
396
+
397
+ # Calculate the center position for this timestamp
398
+ center_sample = int(timestamp * 16000)
399
+ half_window = window_samples // 2
400
+
401
+ # Extract window centered at timestamp
402
+ start_idx = max(0, center_sample - half_window)
403
+ end_idx = min(len(audio), start_idx + window_samples)
404
+
405
+ # Adjust start if we're at the end of audio
406
+ if end_idx == len(audio) and end_idx - start_idx < window_samples:
407
+ start_idx = max(0, end_idx - window_samples)
408
+
409
+ audio_window = audio[start_idx:end_idx]
410
 
411
  # Convert audio to target sample rate for E-PANNs
412
  if LIBROSA_AVAILABLE:
413
+ # Resample to E-PANNs sample rate
414
+ audio_resampled = librosa.resample(audio_window.astype(float),
 
415
  orig_sr=16000,
416
  target_sr=self.sample_rate)
 
417
 
418
  # For short audio, repeat it instead of padding with zeros
419
  min_samples = 6 * self.sample_rate # 6 seconds
420
  if len(audio_resampled) < min_samples:
 
421
  # Repeat the audio to fill the minimum required length
422
  num_repeats = int(np.ceil(min_samples / len(audio_resampled)))
423
  audio_resampled = np.tile(audio_resampled, num_repeats)[:min_samples]
 
424
 
425
+ # Compute features
426
+ mel_spec = librosa.feature.melspectrogram(y=audio_resampled, sr=self.sample_rate, n_mels=64)
427
+ energy = np.mean(librosa.power_to_db(mel_spec, ref=np.max))
428
 
429
+ # Use actual non-repeated audio for some features
430
+ actual_audio_len = min(len(audio_resampled), int(len(audio_window) * self.sample_rate / 16000))
431
  actual_audio = audio_resampled[:actual_audio_len]
432
 
 
 
433
  spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=actual_audio, sr=self.sample_rate))
 
 
434
  mfcc = librosa.feature.mfcc(y=actual_audio, sr=self.sample_rate, n_mfcc=13)
435
  mfcc_var = np.var(mfcc, axis=1).mean()
 
 
436
  zcr = np.mean(librosa.feature.zero_crossing_rate(actual_audio))
437
 
 
 
438
  # Adjusted scaling for better speech detection
439
+ energy_score = np.clip((energy + 80) / 40, 0, 1)
440
+ centroid_score = np.clip((spectral_centroid - 200) / 3000, 0, 1)
441
+ mfcc_score = np.clip(mfcc_var / 100, 0, 1)
442
+ zcr_score = np.clip(zcr * 10, 0, 1)
443
 
444
+ # Weighted combination
445
  speech_score = (energy_score * 0.4 +
446
  centroid_score * 0.2 +
447
  mfcc_score * 0.3 +
448
  zcr_score * 0.1)
 
 
 
449
  else:
 
450
  from scipy import signal
451
  # Basic fallback without librosa
452
+ f, t, Sxx = signal.spectrogram(audio_window, 16000)
453
  energy = np.mean(10 * np.log10(Sxx + 1e-10))
 
 
454
  speech_score = np.clip((energy + 100) / 50, 0, 1)
 
455
 
456
  probability = np.clip(speech_score, 0, 1)
457
+ is_speech = probability > 0.4
 
 
458
 
459
  return VADResult(probability, is_speech, self.model_name, time.time() - start_time, timestamp)
460
 
 
487
  def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
488
  start_time = time.time()
489
 
 
 
490
  if self.model is None or len(audio) == 0:
 
491
  if len(audio) > 0:
492
  energy = np.sum(audio ** 2)
493
  threshold = 0.01
494
+ probability = min(energy / (threshold * 100), 1.0)
 
495
  is_speech = energy > threshold
 
496
  else:
497
  probability = 0.0
498
  is_speech = False
 
501
  try:
502
  if len(audio.shape) > 1:
503
  audio = audio.mean(axis=1)
504
+
505
+ # For PANNs, extract the appropriate window based on timestamp
506
+ window_duration = 10.0 # 10 seconds window for PANNs
507
+ window_samples = int(window_duration * 16000) # at 16kHz input rate
508
+
509
+ # Calculate the center position for this timestamp
510
+ center_sample = int(timestamp * 16000)
511
+ half_window = window_samples // 2
512
+
513
+ # Extract window centered at timestamp
514
+ start_idx = max(0, center_sample - half_window)
515
+ end_idx = min(len(audio), start_idx + window_samples)
516
+
517
+ # Adjust start if we're at the end of audio
518
+ if end_idx == len(audio) and end_idx - start_idx < window_samples:
519
+ start_idx = max(0, end_idx - window_samples)
520
+
521
+ audio_window = audio[start_idx:end_idx]
522
 
523
  # Convert audio to PANNs sample rate
524
  if LIBROSA_AVAILABLE:
525
+ audio_resampled = librosa.resample(audio_window.astype(float),
 
526
  orig_sr=16000,
527
  target_sr=self.sample_rate)
 
528
  else:
 
529
  # Simple resampling fallback
530
  resample_factor = self.sample_rate / 16000
531
  audio_resampled = np.interp(
532
+ np.linspace(0, len(audio_window) - 1, int(len(audio_window) * resample_factor)),
533
+ np.arange(len(audio_window)),
534
+ audio_window
535
  )
536
 
537
  # For short audio, use intelligent padding strategy
538
  min_samples = 10 * self.sample_rate # 10 seconds for optimal performance
539
  if len(audio_resampled) < min_samples:
 
 
540
  # Strategy: repeat the audio cyclically to maintain characteristics
541
  num_repeats = int(np.ceil(min_samples / len(audio_resampled)))
542
  audio_repeated = np.tile(audio_resampled, num_repeats)[:min_samples]
 
550
  audio_repeated[-fade_len:] *= fade_out
551
 
552
  audio_resampled = audio_repeated
 
553
 
554
+ # Run inference
 
555
  clip_probs, _ = self.model.inference(audio_resampled[np.newaxis, :])
 
556
 
557
  # Enhanced speech detection using multiple relevant labels
558
  speech_keywords = [
 
566
  if any(word in lbl.lower() for word in speech_keywords):
567
  speech_indices.append(i)
568
 
 
 
569
  # Also get silence/noise indices for contrast
570
  noise_keywords = ['silence', 'white noise', 'pink noise']
571
  noise_indices = []
 
584
  # Adjust speech probability based on noise
585
  speech_prob = speech_prob * (1 - noise_prob * 0.5)
586
 
 
 
587
  # If using repeated audio, scale confidence based on original length
588
+ if len(audio_window) < 16000 * 2: # Less than 2 seconds
589
+ confidence_scale = len(audio_window) / (16000 * 2)
590
  speech_prob = speech_prob * (0.5 + 0.5 * confidence_scale)
 
591
 
592
  else:
593
  # Fallback if no speech indices found
 
594
  top_indices = np.argsort(clip_probs[0])[-10:]
595
  speech_prob = np.mean(clip_probs[0, top_indices])
596
 
 
603
  if len(audio) > 0:
604
  energy = np.sum(audio ** 2)
605
  threshold = 0.01
606
+ probability = min(energy / (threshold * 100), 1.0)
 
607
  is_speech = energy > threshold
 
608
  else:
609
  probability = 0.0
610
  is_speech = False
 
875
  self.model_hop_sizes = {
876
  "Silero-VAD": 0.016, # 16ms hop for Silero (512 samples window)
877
  "WebRTC-VAD": 0.03, # 30ms hop for WebRTC (match frame duration)
878
+ "E-PANNs": 0.1, # 100ms hop for 10 predictions/second
879
+ "PANNs": 0.1, # 100ms hop for 10 predictions/second
880
+ "AST": 0.1 # 100ms hop for 10 predictions/second
881
  }
882
 
883
  # Model-specific thresholds for better detection
 
1387
 
1388
  model_results = []
1389
 
1390
+ # Always use sliding window approach for consistent temporal resolution
1391
  if len(processed_audio) < window_samples:
1392
+ debug_info.append(f" ⚠️ Audio shorter than window ({len(processed_audio)} < {window_samples}), using sliding window with padding")
1393
+
1394
+ # For short audio, still use sliding window but with the actual audio length
1395
+ # This ensures we get the desired temporal resolution (10 predictions/second)
1396
+ window_count = 0
1397
+ audio_duration = len(processed_audio) / self.processor.sample_rate
1398
 
1399
+ # Calculate number of windows based on hop size
1400
+ num_windows = max(1, int((audio_duration - window_size) / hop_size) + 1) if audio_duration > window_size else max(1, int(audio_duration / hop_size))
1401
 
1402
+ for i in range(0, len(processed_audio), hop_samples):
1403
+ timestamp = i / self.processor.sample_rate
 
1404
 
1405
+ # For models that need long context, we'll use the full audio padded/repeated as needed
1406
+ # but report the timestamp based on the sliding window position
1407
+ if window_count < 3: # Log first 3 windows
1408
+ debug_info.append(f" 🔄 Window {window_count}: t={timestamp:.2f}s")
1409
 
1410
  # Special handling for different models
1411
  if model_name == 'AST':
1412
+ result = self.models[model_name].predict(processed_audio, timestamp, full_audio=processed_audio)
1413
  else:
1414
+ result = self.models[model_name].predict(processed_audio, timestamp)
 
 
 
1415
 
1416
+ if window_count < 3: # Log first 3 results
1417
+ debug_info.append(f" 📈 Result {window_count}: prob={result.probability:.4f}, speech={result.is_speech}")
1418
 
1419
  # Use model-specific threshold
1420
  result.is_speech = result.probability > model_threshold
1421
  vad_results.append(result)
1422
  model_results.append(result)
1423
+ window_count += 1
1424
+
1425
+ # Stop if we've gone past the audio length
1426
+ if timestamp >= audio_duration:
1427
+ break
1428
+
1429
+ debug_info.append(f" 🎯 Total windows processed: {window_count}")
1430
  else:
1431
  # Audio is long enough - process in sliding windows
1432
  debug_info.append(f" ✅ Audio long enough, processing in windows")