# NEW-ASR-VOX
# ==============================================================================
# Cell 1: Complete Setup - Based on Your Working VoxLingua Code
# ==============================================================================
import os, re, glob, csv
import torch
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
from speechbrain.inference.classifiers import EncoderClassifier
from speechbrain.inference.interfaces import foreign_class  # modern path; speechbrain.pretrained.interfaces is deprecated
import torchaudio
import warnings
warnings.filterwarnings('ignore')

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# ==============================================================================
# Cell 2: Load Multiple Language Detection Models for Ensemble
# ==============================================================================
print("Loading Multiple Language Detection Models...")

# Model 1: VoxLingua107 ECAPA-TDNN (your working baseline - 60% ensemble weight, see Cell 5)
voxlingua_model = None
try:
    print("Loading VoxLingua107 ECAPA-TDNN...")
    voxlingua_model = EncoderClassifier.from_hparams(
        source="speechbrain/lang-id-voxlingua107-ecapa",
        savedir="pretrained_models/langid_voxlingua107_ecapa",
        run_opts={"device": device}
    )
    print("✓ VoxLingua107 loaded successfully")
except Exception as e:
    print(f"✗ VoxLingua107 failed: {e}")

# Model 2: XLS-R Language ID (40% ensemble weight, see Cell 5)
xlsr_lid_model = None
try:
    print("Loading TalTechNLP XLS-R Language ID...")
    xlsr_lid_model = foreign_class(
        source="TalTechNLP/voxlingua107-xls-r-300m-wav2vec",
        pymodule_file="encoder_wav2vec_classifier.py",
        classname="EncoderWav2vecClassifier",
        hparams_file="inference_wav2vec.yaml",
        savedir="pretrained_models/xlsr_voxlingua",
        run_opts={"device": device}
    )
    print("✓ XLS-R Language ID loaded successfully")
except Exception as e:
    print(f"✗ XLS-R failed: {e}")

models_loaded = sum(p is not None for p in [voxlingua_model, xlsr_lid_model])
print(f"\nModels loaded: {models_loaded}/2")
# ==============================================================================
# Cell 3: Complete Language Mappings from Your Dataset
# ==============================================================================
# All languages from your dataset (based on the accuracy table you showed)
DATASET_LANGUAGES = {
    # Indo-Aryan languages
    'ur', 'pa', 'hi', 'bn', 'ne', 'as', 'ks', 'mr', 'gu', 'or',
    # Dravidian languages
    'ta', 'te', 'kn', 'ml',
    # Low-resource languages
    'sd', 'kok', 'br', 'doi', 'sat', 'mni',
    # Others in your dataset
    'sa'  # Sanskrit
}

# Language family classifications
INDO_ARYAN_LANGS = {'ur', 'pa', 'hi', 'bn', 'ne', 'as', 'ks', 'mr', 'gu', 'or', 'sd'}
DRAVIDIAN_LANGS = {'ta', 'te', 'kn', 'ml'}
LOW_RESOURCE_LANGS = {'kok', 'br', 'doi', 'sat', 'mni'}
OTHER_LANGS = {'sa'}  # Sanskrit
ALL_SUPPORTED_LANGS = INDO_ARYAN_LANGS | DRAVIDIAN_LANGS | LOW_RESOURCE_LANGS | OTHER_LANGS

# Cross-lingual transfer mappings (research-based)
TRANSFER_MAPPINGS = {
    # Low-resource to high-resource language mappings
    'br': 'hi',   # Bodo -> Hindi (brx mapped to br in your dataset; caution: VoxLingua's own 'br' is Breton)
    'sat': 'hi',  # Santali -> Hindi
    'doi': 'pa',  # Dogri -> Punjabi
    'mni': 'bn',  # Manipuri -> Bengali
    'kok': 'mr',  # Konkani -> Marathi (geographic proximity)
    'sd': 'hi',   # Sindhi -> Hindi
}

# Language code mappings (VoxLingua output to your dataset codes)
VOXLINGUA_TO_DATASET = {
    'urd': 'ur', 'urdu': 'ur',
    'pan': 'pa', 'punjabi': 'pa', 'pnb': 'pa',
    'hin': 'hi', 'hindi': 'hi',
    'ben': 'bn', 'bengali': 'bn',
    'nep': 'ne', 'nepali': 'ne',
    'asm': 'as', 'assamese': 'as',
    'kas': 'ks', 'kashmiri': 'ks',
    'mar': 'mr', 'marathi': 'mr',
    'guj': 'gu', 'gujarati': 'gu',
    'ori': 'or', 'odia': 'or', 'ory': 'or',
    'tam': 'ta', 'tamil': 'ta',
    'tel': 'te', 'telugu': 'te',
    'kan': 'kn', 'kannada': 'kn',
    'mal': 'ml', 'malayalam': 'ml',
    'sin': 'sd', 'sindhi': 'sd', 'snd': 'sd',  # caution: ISO 639-3 'sin' is Sinhala, not Sindhi
    'kok': 'kok', 'konkani': 'kok',
    'san': 'sa', 'sanskrit': 'sa',
    # Common variations
    'bho': 'hi',  # Bhojpuri -> Hindi
    'mai': 'hi',  # Maithili -> Hindi
    'mag': 'hi',  # Magahi -> Hindi
}

print("✓ Complete language mappings loaded")
print(f"Total dataset languages: {len(ALL_SUPPORTED_LANGS)}")
print(f"Mapping variations: {len(VOXLINGUA_TO_DATASET)}")
# ==============================================================================
# Cell 4: Enhanced Parsing Functions (Your Working Code + Improvements)
# ==============================================================================
def parse_top1(out):
    """Parse VoxLingua107 output - your exact working function"""
    logits, log_conf, pred_idx, labels = out
    label_str = labels[0] if (isinstance(labels, (list, tuple)) and len(labels) > 0) else "unknown"
    if not isinstance(label_str, str):
        label_str = str(label_str)
    colon_pos = label_str.find(":")
    if colon_pos != -1:
        iso = label_str[:colon_pos].strip()
    else:
        iso = label_str.strip()
    conf = float(log_conf.exp().item())
    return iso, label_str, conf
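# --- Added usage example (not in the original cells): parse_top1 on a synthetic
# classify_file-style tuple (logits, log-confidence, predicted index, labels).
# The tensor values here are dummies chosen purely for illustration.
_demo_out = (torch.tensor([[0.1, 2.0]]),        # logits (unused by parse_top1)
             torch.log(torch.tensor(0.87)),     # log of the winning probability
             torch.tensor([1]),                 # predicted index (unused)
             ["hi: Hindi"])                     # top-1 text label
print(parse_top1(_demo_out))                    # -> ('hi', 'hi: Hindi', 0.87...)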
def parse_xlsr_output(out):
    """Parse XLS-R model output"""
    try:
        out_prob, score, index, text_lab = out
        lang_code = str(text_lab[0]).strip().lower()
        confidence = float(out_prob.exp().max().item())
        return lang_code, confidence
    except Exception as e:
        print(f"  XLS-R parsing error: {e}")
        return "unknown", 0.0

def map_to_dataset_language(detected_lang):
    """Map VoxLingua/XLS-R output to your dataset language codes"""
    # Direct match first
    if detected_lang in ALL_SUPPORTED_LANGS:
        return detected_lang
    # Check mapping dictionary
    mapped = VOXLINGUA_TO_DATASET.get(detected_lang.lower(), detected_lang)
    # If still not in dataset, try transfer mapping
    # NOTE: every current key in TRANSFER_MAPPINGS is already in ALL_SUPPORTED_LANGS,
    # so this branch is unreachable as written; it only fires for codes added to
    # TRANSFER_MAPPINGS without also being added to the dataset sets.
    if mapped not in ALL_SUPPORTED_LANGS and mapped in TRANSFER_MAPPINGS:
        transfer_target = TRANSFER_MAPPINGS[mapped]
        print(f"  Transfer mapping: {mapped} -> {transfer_target}")
        return transfer_target
    return mapped

print("✓ Enhanced parsing functions ready")
# ==============================================================================
# Cell 5: Hybrid Multi-Model Language Detection
# ==============================================================================
def hybrid_language_detection(audio_path):
    """
    Multi-model ensemble language detection optimized for your dataset
    """
    print(f"  Analyzing: {os.path.basename(audio_path)}")
    predictions = {}
    confidences = {}

    # Model 1: VoxLingua107 (primary - 60% weight, since it's your working baseline)
    if voxlingua_model is not None:
        try:
            out = voxlingua_model.classify_file(audio_path)
            pred_iso, pred_label, conf = parse_top1(out)
            # Map to dataset language codes
            mapped_lang = map_to_dataset_language(pred_iso)
            predictions['voxlingua'] = mapped_lang
            confidences['voxlingua'] = conf * 0.60  # 60% weight
            print(f"    VoxLingua107: {pred_iso} -> {mapped_lang} ({conf:.3f})")
        except Exception as e:
            print(f"    VoxLingua107 error: {e}")

    # Model 2: XLS-R (secondary - 40% weight)
    if xlsr_lid_model is not None:
        try:
            out = xlsr_lid_model.classify_file(audio_path)
            lang_code, conf = parse_xlsr_output(out)
            # Map to dataset language codes
            mapped_lang = map_to_dataset_language(lang_code)
            predictions['xlsr'] = mapped_lang
            confidences['xlsr'] = conf * 0.40  # 40% weight
            print(f"    XLS-R: {lang_code} -> {mapped_lang} ({conf:.3f})")
        except Exception as e:
            print(f"    XLS-R error: {e}")

    # Ensemble decision making
    if not predictions:
        return "unknown", 0.0

    # Strategy 1: check for agreement between models
    if len(predictions) >= 2:
        pred_values = list(predictions.values())
        if pred_values[0] == pred_values[1]:  # models agree
            consensus_lang = pred_values[0]
            avg_confidence = sum(confidences.values()) / len(confidences)
            print(f"  Consensus: {consensus_lang} (confidence: {avg_confidence:.3f})")
            return consensus_lang, avg_confidence

    # Strategy 2: use highest weighted confidence
    if confidences:
        best_model = max(confidences.keys(), key=lambda k: confidences[k])
        best_lang = predictions[best_model]
        best_conf = confidences[best_model] / (0.60 if best_model == 'voxlingua' else 0.40)  # undo the weighting
        print(f"  Best model ({best_model}): {best_lang} (confidence: {best_conf:.3f})")
        return best_lang, best_conf

    return "unknown", 0.0

print("✓ Hybrid ensemble language detection ready")
# ==============================================================================
# Cell 6: Complete Ground Truth Extraction for Your Dataset
# ==============================================================================
def gt_from_filename(path):
    """Extract ground truth from filename - complete version for your dataset"""
    name = os.path.basename(path).lower()
    # Pattern 1: your working regex pattern (captures 2-4 letter tokens bounded by _ or -)
    GT_TOKEN = re.compile(r'(?:^|[_-])([a-z]{2,4})(?:[_-]|$)', re.IGNORECASE)
    m = GT_TOKEN.search(name)
    if m:
        code = m.group(1).lower()
        # Complete mapping based on your dataset structure
        # (full-name keys like "punjabi" are defensive: the regex above only
        # captures 2-4 letter tokens, so they matter only if you loosen it)
        filename_mappings = {
            # Your working mappings
            "guf": "gu", "mrt": "mr", "ml": "ml",
            # Additional mappings for your complete dataset
            "urd": "ur", "urdu": "ur",
            "pan": "pa", "punjabi": "pa", "pnb": "pa",
            "hin": "hi", "hindi": "hi",
            "ben": "bn", "bengali": "bn", "bng": "bn",
            "nep": "ne", "nepali": "ne",
            "asm": "as", "assamese": "as",
            "kas": "ks", "kashmiri": "ks",
            "mar": "mr", "marathi": "mr",
            "guj": "gu", "gujarati": "gu",
            "ori": "or", "odia": "or", "ory": "or",
            "tam": "ta", "tamil": "ta",
            "tel": "te", "telugu": "te",
            "kan": "kn", "kannada": "kn",
            "mal": "ml", "malayalam": "ml",
            "sin": "sd", "sindhi": "sd", "snd": "sd",
            "kok": "kok", "konkani": "kok",
            "bod": "br", "bodo": "br",  # Bodo variations
            "dog": "doi", "dogri": "doi",
            "sat": "sat", "santali": "sat",
            "mni": "mni", "manipuri": "mni",
            "san": "sa", "sanskrit": "sa",
        }
        mapped_code = filename_mappings.get(code, code)
        # Validate against your dataset languages
        if mapped_code in ALL_SUPPORTED_LANGS:
            return mapped_code
    # Pattern 2: check folder structure
    path_parts = path.split('/')
    for part in path_parts:
        part_lower = part.lower()
        if part_lower in ALL_SUPPORTED_LANGS:
            return part_lower
        # Check if it's a language-name folder
        for full_name, code in [('gujarati', 'gu'), ('marathi', 'mr'), ('hindi', 'hi'),
                                ('bengali', 'bn'), ('tamil', 'ta'), ('telugu', 'te'),
                                ('kannada', 'kn'), ('malayalam', 'ml'), ('punjabi', 'pa'),
                                ('urdu', 'ur'), ('assamese', 'as'), ('odia', 'or'),
                                ('nepali', 'ne'), ('kashmiri', 'ks'), ('sindhi', 'sd'),
                                ('konkani', 'kok'), ('bodo', 'br'), ('dogri', 'doi'),
                                ('santali', 'sat'), ('manipuri', 'mni'), ('sanskrit', 'sa')]:
            if full_name in part_lower:
                return code
    return None

print("✓ Complete ground truth extraction ready")
# ==============================================================================
# Cell 7: Google Drive Processing with Error Handling
# ==============================================================================
def download_and_process_drive_dataset():
    """Download and process with robust error handling"""
    print("Processing Google Drive dataset...")
    # Get sharing link
    share_link = input("Enter Google Drive sharing link: ").strip()
    if not share_link:
        print("✗ No link provided")
        return []

    # Extract file ID
    def extract_file_id(link):
        patterns = [r'/folders/([a-zA-Z0-9-_]+)', r'id=([a-zA-Z0-9-_]+)', r'/file/d/([a-zA-Z0-9-_]+)']
        for pattern in patterns:
            match = re.search(pattern, link)
            if match:
                return match.group(1)
        return None

    file_id = extract_file_id(share_link)
    if not file_id:
        print("✗ Could not extract file ID from sharing link")
        return []

    # Setup download directory
    download_dir = "/content/drive_dataset"
    if os.path.exists(download_dir):
        import shutil
        shutil.rmtree(download_dir)
    os.makedirs(download_dir, exist_ok=True)

    # Download with error handling
    try:
        import gdown
        print(f"Downloading from Google Drive (ID: {file_id})...")
        # NOTE: gdown caps folder downloads (50 files by default); pass
        # remaining_ok=True if the folder is larger and a partial pull is acceptable
        gdown.download_folder(f"https://drive.google.com/drive/folders/{file_id}",
                              output=download_dir, quiet=False, use_cookies=False)
        print("✓ Download completed successfully")
    except Exception as e:
        print(f"✗ Download failed: {e}")
        print("Make sure the folder is shared with 'Anyone with the link can view'")
        return []

    # Scan for audio files
    VALID_EXTS = {".wav", ".mp3", ".flac", ".m4a", ".ogg"}
    def is_audio(filepath):
        return os.path.splitext(filepath)[1].lower() in VALID_EXTS

    print("Scanning for audio files...")
    all_files = []
    for root, dirs, files in os.walk(download_dir):
        for file in files:
            if is_audio(file):
                full_path = os.path.join(root, file)
                all_files.append(full_path)
    print(f"Found {len(all_files)} total audio files")

    # Filter and limit files
    filtered_files = []
    lang_counts = {}
    english_skipped = 0
    for file_path in all_files:
        # Skip English files
        if any(eng_indicator in file_path.lower() for eng_indicator in
               ['english', '_en_', '/en/', 'eng_', '_eng']):
            english_skipped += 1
            continue
        # Extract language for limiting
        gt_lang = gt_from_filename(file_path)
        if gt_lang:
            count = lang_counts.get(gt_lang, 0)
            if count < 5:  # max 5 files per language
                filtered_files.append(file_path)
                lang_counts[gt_lang] = count + 1
        else:
            # Include files without clear language markers (up to an overall limit)
            if len(filtered_files) < 50:
                filtered_files.append(file_path)

    print("Filtered results:")
    print(f"  English files skipped: {english_skipped}")
    print(f"  Selected for processing: {len(filtered_files)}")
    for lang, count in sorted(lang_counts.items()):
        print(f"  {lang}: {count} files")
    return filtered_files

# Execute download and processing
test_files = download_and_process_drive_dataset()
print(f"\nTotal files ready for language detection: {len(test_files)}")
# ==============================================================================
# Cell 8: Execute Language Detection Analysis
# ==============================================================================
def run_language_detection_analysis(audio_files):
    """Run complete language detection analysis"""
    if not audio_files:
        print("✗ No audio files to process")
        return []  # return an empty list so callers can take len() safely
    print(f"Starting language detection on {len(audio_files)} files...")
    print("=" * 60)
    results = []
    for i, audio_path in enumerate(audio_files, 1):
        print(f"\n[{i}/{len(audio_files)}] Processing: {os.path.basename(audio_path)}")
        try:
            # Extract ground truth
            gt_iso = gt_from_filename(audio_path)
            # Run hybrid detection
            pred_iso, confidence = hybrid_language_detection(audio_path)
            # Determine correctness
            is_correct = (gt_iso == pred_iso) if gt_iso else None
            result = {
                "file": os.path.basename(audio_path),
                "full_path": audio_path,
                "gt_iso": gt_iso if gt_iso else "",
                "pred_iso": pred_iso,
                "confidence": confidence,
                "correct": is_correct
            }
            results.append(result)
            # Status display
            status = "✓" if is_correct else "✗" if is_correct is False else "?"
            print(f"  {status} GT: {gt_iso or 'Unknown'} | Pred: {pred_iso} | Conf: {confidence:.3f}")
        except Exception as e:
            print(f"  Error processing file: {e}")
            results.append({
                "file": os.path.basename(audio_path),
                "full_path": audio_path,
                "gt_iso": "",
                "pred_iso": "error",
                "confidence": 0.0,
                "correct": False
            })
    return results

# Run the analysis
analysis_results = run_language_detection_analysis(test_files)
print("\nLanguage detection analysis complete!")
print(f"Total results: {len(analysis_results)}")
# ==============================================================================
# Cell 9: Complete Results Analysis and Accuracy Report
# ==============================================================================
def generate_comprehensive_analysis(results):
    """Generate complete analysis matching your dataset format"""
    df = pd.DataFrame(results)
    # Filter to files with ground truth from your dataset
    valid_df = df[(df["gt_iso"] != "") & (df["gt_iso"].isin(ALL_SUPPORTED_LANGS))].copy()
    if len(valid_df) == 0:
        print("✗ No valid ground truth files found")
        return None, None  # keep the two-value signature so callers can unpack
    print("COMPREHENSIVE LANGUAGE DETECTION ANALYSIS")
    print("=" * 60)
    # Overall accuracy
    overall_acc = accuracy_score(valid_df["gt_iso"], valid_df["pred_iso"])
    print(f"OVERALL ACCURACY: {overall_acc:.4f} ({overall_acc*100:.1f}%)")
    # Create accuracy table matching your format
    print("\nLANGUAGE-WISE ACCURACY:")
    print("-" * 60)
    print("Code | Language Name   | Files | Top-1 | Top-5 | Conf")
    print("-" * 60)
    # Language name mapping
    LANG_NAMES = {
        'ur': 'Urdu', 'pa': 'Punjabi', 'ta': 'Tamil', 'sd': 'Sindhi',
        'or': 'Odia', 'ml': 'Malayalam', 'ne': 'Nepali', 'as': 'Assamese',
        'hi': 'Hindi', 'bn': 'Bengali', 'kok': 'Konkani', 'kn': 'Kannada',
        'ks': 'Kashmiri', 'mr': 'Marathi', 'te': 'Telugu', 'br': 'Bodo',
        'doi': 'Dogri', 'sat': 'Santali', 'gu': 'Gujarati', 'mai': 'Maithili',
        'mni': 'Manipuri', 'sa': 'Sanskrit'
    }
    # Calculate per-language statistics
    lang_stats = []
    for lang_code in sorted(valid_df["gt_iso"].unique()):
        lang_data = valid_df[valid_df["gt_iso"] == lang_code]
        total_files = len(lang_data)
        correct_pred = (lang_data["gt_iso"] == lang_data["pred_iso"]).sum()
        accuracy = correct_pred / total_files
        avg_conf = lang_data["confidence"].mean()
        lang_name = LANG_NAMES.get(lang_code, lang_code.title())
        # Format output to match your table. The hybrid pipeline keeps only a
        # top-1 prediction, so the Top-5 column simply repeats Top-1 here; the
        # independent analysis below reports real Top-5 numbers.
        print(f"{lang_code:>3s} | {lang_name:<15s} | {total_files:>5d} | {accuracy*100:>5.1f}% | {accuracy*100:>5.1f}% | {avg_conf:>5.3f}")
        lang_stats.append({
            'code': lang_code,
            'name': lang_name,
            'files': total_files,
            'accuracy': accuracy,
            'confidence': avg_conf
        })
    print("-" * 60)

    # Language family analysis
    print("\nLANGUAGE FAMILY PERFORMANCE:")
    print("-" * 40)
    family_stats = {}
    for _, row in valid_df.iterrows():
        lang = row['gt_iso']
        correct = row['correct']
        if lang in INDO_ARYAN_LANGS:
            family = 'Indo-Aryan'
        elif lang in DRAVIDIAN_LANGS:
            family = 'Dravidian'
        elif lang in LOW_RESOURCE_LANGS:
            family = 'Low-Resource'
        else:
            family = 'Other'
        if family not in family_stats:
            family_stats[family] = {'correct': 0, 'total': 0}
        family_stats[family]['total'] += 1
        if correct:
            family_stats[family]['correct'] += 1
    for family, stats in family_stats.items():
        acc_pct = (stats['correct'] / stats['total']) * 100
        print(f"{family:<15s}: {acc_pct:>5.1f}% ({stats['correct']:>2d}/{stats['total']:>2d})")

    # Model performance analysis
    print("\nMODEL PERFORMANCE:")
    print("-" * 30)
    print(f"Models loaded: {models_loaded}/2")
    print(f"VoxLingua107: {'Active' if voxlingua_model else 'Failed'}")
    print(f"XLS-R: {'Active' if xlsr_lid_model else 'Failed'}")

    # Error analysis
    errors = valid_df[valid_df["gt_iso"] != valid_df["pred_iso"]]
    if len(errors) > 0:
        print(f"\nMISCLASSIFICATION ANALYSIS ({len(errors)} errors):")
        print("-" * 50)
        # Group errors by actual language
        for actual_lang in sorted(errors["gt_iso"].unique()):
            lang_errors = errors[errors["gt_iso"] == actual_lang]
            predicted_langs = lang_errors["pred_iso"].value_counts()
            print(f"{actual_lang} ({LANG_NAMES.get(actual_lang, actual_lang)}):")
            for pred_lang, count in predicted_langs.head(3).items():
                print(f"  -> {pred_lang} ({count} files)")

    # Summary statistics
    print("\nSUMMARY STATISTICS:")
    print("-" * 25)
    print(f"Total files processed: {len(df)}")
    print(f"Files with valid GT: {len(valid_df)}")
    print(f"Languages detected: {len(valid_df['pred_iso'].unique())}")
    print(f"Languages in dataset: {len(valid_df['gt_iso'].unique())}")
    print(f"Perfect accuracy: {len([l for l in lang_stats if l['accuracy'] == 1.0])}")
    print(f"Above 90% accuracy: {len([l for l in lang_stats if l['accuracy'] >= 0.9])}")
    print(f"Below 50% accuracy: {len([l for l in lang_stats if l['accuracy'] < 0.5])}")
    return valid_df, lang_stats

# Run comprehensive analysis
if 'analysis_results' in globals() and analysis_results:
    final_df, language_statistics = generate_comprehensive_analysis(analysis_results)
    # Save results to CSV
    if final_df is not None:
        timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
        csv_filename = f"language_detection_results_{timestamp}.csv"
        final_df.to_csv(csv_filename, index=False)
        print(f"\nResults saved to: {csv_filename}")
        # Download the CSV when running in Colab
        try:
            from google.colab import files
            files.download(csv_filename)
            print("File downloaded successfully")
        except Exception:
            print("File saved locally (download failed)")
else:
    print("✗ No analysis results available. Please run the previous cells first.")
print("\n✓ COMPLETE LANGUAGE DETECTION ANALYSIS FINISHED!")
# ==============================================================================
# Independent Model Analysis with Top-5 and Real Confidence Scores
# ==============================================================================
def analyze_models_independently(audio_files):
    """Analyze each model independently with Top-5 predictions and real confidence scores"""
    print("INDEPENDENT MODEL ANALYSIS")
    print("=" * 60)
    results = {
        'voxlingua': [],
        'xlsr': [],
        'combined_analysis': []
    }
    for i, audio_path in enumerate(audio_files, 1):
        print(f"\n[{i}/{len(audio_files)}] Analyzing: {os.path.basename(audio_path)}")
        # Extract ground truth
        gt_iso = gt_from_filename(audio_path)
        print(f"  Ground Truth: {gt_iso or 'Unknown'}")
        file_result = {
            'file': os.path.basename(audio_path),
            'gt_iso': gt_iso or '',
            'voxlingua_results': {},
            'xlsr_results': {}
        }
        # ========================================
        # VoxLingua107 Independent Analysis
        # ========================================
        if voxlingua_model is not None:
            try:
                print("  VoxLingua107 Analysis:")
                out = voxlingua_model.classify_file(audio_path)
                # Extract Top-5 predictions with real confidence scores
                logits, log_conf, pred_idx, labels = out
                # Get top 5 predictions
                top5_indices = torch.topk(logits.squeeze(), 5).indices
                top5_probs = torch.softmax(logits.squeeze(), dim=0)
                vox_top5 = []
                for idx in top5_indices:
                    # CAUTION: classify_file returns only the winning text label(s),
                    # so indexing labels by class index usually falls through to
                    # the idx_ fallback; the FIXED analysis below addresses this.
                    lang_label = labels[idx.item()] if idx.item() < len(labels) else f"idx_{idx.item()}"
                    prob = top5_probs[idx.item()].item()
                    # Extract language code
                    if isinstance(lang_label, str):
                        colon_pos = lang_label.find(":")
                        lang_code = lang_label[:colon_pos].strip() if colon_pos != -1 else lang_label.strip()
                    else:
                        lang_code = str(lang_label)
                    # Map to dataset codes
                    mapped_lang = map_to_dataset_language(lang_code)
                    vox_top5.append({
                        'rank': len(vox_top5) + 1,
                        'original_code': lang_code,
                        'mapped_code': mapped_lang,
                        'confidence': prob,
                        'in_dataset': mapped_lang in ALL_SUPPORTED_LANGS
                    })
                    print(f"    Rank {len(vox_top5)}: {lang_code} -> {mapped_lang} ({prob:.4f}) {'✓' if mapped_lang in ALL_SUPPORTED_LANGS else '✗'}")
                # Store VoxLingua results
                file_result['voxlingua_results'] = {
                    'top5': vox_top5,
                    'top1_original': vox_top5[0]['original_code'],
                    'top1_mapped': vox_top5[0]['mapped_code'],
                    'top1_confidence': vox_top5[0]['confidence'],
                    'correct_in_top1': gt_iso == vox_top5[0]['mapped_code'] if gt_iso else None,
                    'correct_in_top5': any(pred['mapped_code'] == gt_iso for pred in vox_top5) if gt_iso else None
                }
                results['voxlingua'].append({
                    'file': os.path.basename(audio_path),
                    'gt_iso': gt_iso or '',
                    'pred_iso': vox_top5[0]['mapped_code'],
                    'confidence': vox_top5[0]['confidence'],
                    'correct': gt_iso == vox_top5[0]['mapped_code'] if gt_iso else None,
                    'top5_predictions': [p['mapped_code'] for p in vox_top5]
                })
            except Exception as e:
                print(f"  ✗ VoxLingua107 error: {e}")
                file_result['voxlingua_results'] = {'error': str(e)}
        # ========================================
        # XLS-R Independent Analysis
        # ========================================
        if xlsr_lid_model is not None:
            try:
                print("  XLS-R Analysis:")
                out = xlsr_lid_model.classify_file(audio_path)
                # Parse XLS-R output for Top-5 (same caveat as above: text_lab may
                # only contain the winning label)
                out_prob, score, index, text_lab = out
                # Get top 5 predictions
                top5_indices = torch.topk(out_prob.squeeze(), 5).indices
                top5_probs = torch.softmax(out_prob.squeeze(), dim=0)
                xlsr_top5 = []
                for idx in top5_indices:
                    lang_label = text_lab[idx.item()] if idx.item() < len(text_lab) else f"idx_{idx.item()}"
                    prob = top5_probs[idx.item()].item()
                    lang_code = str(lang_label).strip().lower()
                    mapped_lang = map_to_dataset_language(lang_code)
                    xlsr_top5.append({
                        'rank': len(xlsr_top5) + 1,
                        'original_code': lang_code,
                        'mapped_code': mapped_lang,
                        'confidence': prob,
                        'in_dataset': mapped_lang in ALL_SUPPORTED_LANGS
                    })
                    print(f"    Rank {len(xlsr_top5)}: {lang_code} -> {mapped_lang} ({prob:.4f}) {'✓' if mapped_lang in ALL_SUPPORTED_LANGS else '✗'}")
                # Store XLS-R results
                file_result['xlsr_results'] = {
                    'top5': xlsr_top5,
                    'top1_original': xlsr_top5[0]['original_code'],
                    'top1_mapped': xlsr_top5[0]['mapped_code'],
                    'top1_confidence': xlsr_top5[0]['confidence'],
                    'correct_in_top1': gt_iso == xlsr_top5[0]['mapped_code'] if gt_iso else None,
                    'correct_in_top5': any(pred['mapped_code'] == gt_iso for pred in xlsr_top5) if gt_iso else None
                }
                results['xlsr'].append({
                    'file': os.path.basename(audio_path),
                    'gt_iso': gt_iso or '',
                    'pred_iso': xlsr_top5[0]['mapped_code'],
                    'confidence': xlsr_top5[0]['confidence'],
                    'correct': gt_iso == xlsr_top5[0]['mapped_code'] if gt_iso else None,
                    'top5_predictions': [p['mapped_code'] for p in xlsr_top5]
                })
            except Exception as e:
                print(f"  ✗ XLS-R error: {e}")
                file_result['xlsr_results'] = {'error': str(e)}
        results['combined_analysis'].append(file_result)
        print(f"  ✓ Analysis complete for {os.path.basename(audio_path)}")
    return results

def generate_independent_model_report(results):
    """Generate comprehensive independent model analysis report"""
    print("\nINDEPENDENT MODEL PERFORMANCE ANALYSIS")
    print("=" * 70)
    # VoxLingua107 analysis
    if results['voxlingua']:
        vox_df = pd.DataFrame(results['voxlingua'])
        valid_vox = vox_df[vox_df['gt_iso'] != ''].copy()
        if len(valid_vox) > 0:
            vox_acc = accuracy_score(valid_vox['gt_iso'], valid_vox['pred_iso'])
            vox_conf_avg = valid_vox['confidence'].mean()
            vox_conf_std = valid_vox['confidence'].std()
            print("\nVoxLingua107 INDEPENDENT ANALYSIS:")
            print(f"  Files analyzed: {len(valid_vox)}")
            print(f"  Top-1 Accuracy: {vox_acc:.4f} ({vox_acc*100:.1f}%)")
            print(f"  Avg Confidence: {vox_conf_avg:.4f} ± {vox_conf_std:.4f}")
            # Per-language accuracy for VoxLingua
            print("  Per-language performance:")
            vox_per_lang = valid_vox.groupby('gt_iso').agg({
                'correct': 'mean',
                'confidence': ['mean', 'count']
            }).round(4)
            vox_per_lang.columns = ['accuracy', 'avg_conf', 'count']
            for lang, row in vox_per_lang.iterrows():
                print(f"    {lang}: {row['accuracy']:.3f} ({row['accuracy']*100:.1f}%) - {row['avg_conf']:.3f} conf - {int(row['count'])} files")
    # XLS-R analysis
    if results['xlsr']:
        xlsr_df = pd.DataFrame(results['xlsr'])
        valid_xlsr = xlsr_df[xlsr_df['gt_iso'] != ''].copy()
        if len(valid_xlsr) > 0:
            xlsr_acc = accuracy_score(valid_xlsr['gt_iso'], valid_xlsr['pred_iso'])
            xlsr_conf_avg = valid_xlsr['confidence'].mean()
            xlsr_conf_std = valid_xlsr['confidence'].std()
            print("\nXLS-R INDEPENDENT ANALYSIS:")
            print(f"  Files analyzed: {len(valid_xlsr)}")
            print(f"  Top-1 Accuracy: {xlsr_acc:.4f} ({xlsr_acc*100:.1f}%)")
            print(f"  Avg Confidence: {xlsr_conf_avg:.4f} ± {xlsr_conf_std:.4f}")
            # Per-language accuracy for XLS-R
            print("  Per-language performance:")
            xlsr_per_lang = valid_xlsr.groupby('gt_iso').agg({
                'correct': 'mean',
                'confidence': ['mean', 'count']
            }).round(4)
            xlsr_per_lang.columns = ['accuracy', 'avg_conf', 'count']
            for lang, row in xlsr_per_lang.iterrows():
                print(f"    {lang}: {row['accuracy']:.3f} ({row['accuracy']*100:.1f}%) - {row['avg_conf']:.3f} conf - {int(row['count'])} files")
    # Model comparison (guarded: the accuracy/confidence variables above only
    # exist when both models produced rows with valid ground truth)
    if results['voxlingua'] and results['xlsr'] and len(valid_vox) > 0 and len(valid_xlsr) > 0:
        print("\nMODEL COMPARISON:")
        print("  VoxLingua107 vs XLS-R:")
        print(f"  Accuracy: {vox_acc:.4f} vs {xlsr_acc:.4f} ({'VoxLingua wins' if vox_acc > xlsr_acc else 'XLS-R wins' if xlsr_acc > vox_acc else 'Tie'})")
        print(f"  Avg Confidence: {vox_conf_avg:.4f} vs {xlsr_conf_avg:.4f}")
        # Suggest optimal weights
        total_perf = vox_acc + xlsr_acc
        vox_weight = vox_acc / total_perf if total_perf > 0 else 0.5
        xlsr_weight = xlsr_acc / total_perf if total_perf > 0 else 0.5
        print("\nSUGGESTED OPTIMAL WEIGHTS:")
        print(f"  VoxLingua107: {vox_weight:.2f} ({vox_weight*100:.0f}%)")
        print(f"  XLS-R: {xlsr_weight:.2f} ({xlsr_weight*100:.0f}%)")
    return results

# Run independent analysis
if 'test_files' in globals() and test_files:
    independent_results = analyze_models_independently(test_files[:10])  # limit to first 10 for testing
    final_report = generate_independent_model_report(independent_results)
else:
    print("✗ No test files available. Run the previous cells first.")
# ==============================================================================
# Analyze Already Downloaded Files in /content/drive_dataset/
# ==============================================================================
def scan_downloaded_files():
    """Scan and collect already downloaded audio files"""
    download_dir = "/content/drive_dataset"
    if not os.path.exists(download_dir):
        print("✗ Download directory not found")
        return []
    print(f"Scanning {download_dir} for audio files...")
    # Valid audio extensions
    VALID_EXTS = {".wav", ".mp3", ".flac", ".m4a", ".ogg"}
    def is_audio(filepath):
        return os.path.splitext(filepath)[1].lower() in VALID_EXTS
    # Collect all audio files
    audio_files = []
    lang_counts = {}
    for root, dirs, files in os.walk(download_dir):
        for file in files:
            if is_audio(file):
                full_path = os.path.join(root, file)
                audio_files.append(full_path)
                # Extract language from folder structure (first 2-3 letter alphabetic path part)
                path_parts = root.split('/')
                for part in path_parts:
                    if len(part) in [2, 3] and part.isalpha():
                        lang_counts[part] = lang_counts.get(part, 0) + 1
                        break
    print(f"Found {len(audio_files)} audio files:")
    for lang, count in sorted(lang_counts.items()):
        print(f"  {lang}: {count} files")
    # Show sample files
    print("\nSample files:")
    for file_path in audio_files[:5]:
        print(f"  {file_path}")
    return audio_files

# Scan for downloaded files
downloaded_files = scan_downloaded_files()
if not downloaded_files:
    print("✗ No audio files found. Let me help you collect them manually.")
    # Manual file collection if scan fails
    print("\nManual file search...")
    import glob
    # Search patterns for common locations
    search_patterns = [
        "/content/drive_dataset/**/*.flac",
        "/content/drive_dataset/**/*.wav",
        "/content/drive_dataset/**/*.mp3",
        "/content/**/*.flac",
        "/content/**/*.wav",
        "/content/**/*.mp3"
    ]
    manual_files = []
    for pattern in search_patterns:
        found = glob.glob(pattern, recursive=True)
        manual_files.extend(found)
    # Remove duplicates
    manual_files = list(set(manual_files))
    print(f"Manual search found: {len(manual_files)} files")
    for file_path in manual_files[:10]:  # show first 10
        print(f"  {file_path}")
    downloaded_files = manual_files

print(f"\nTotal files ready for analysis: {len(downloaded_files)}")
# ==============================================================================
# Run Independent Analysis on Downloaded Files
# ==============================================================================
def analyze_downloaded_files_independently(audio_files):
    """Run independent model analysis on downloaded files with detailed output"""
    if not audio_files:
        print("✗ No audio files to analyze")
        return None
    print(f"Starting independent analysis on {len(audio_files)} files...")
    print("=" * 70)
    results = {
        'voxlingua_detailed': [],
        'xlsr_detailed': [],
        'comparison_data': []
    }
    for i, audio_path in enumerate(audio_files, 1):
        print(f"\n[{i}/{len(audio_files)}] {os.path.basename(audio_path)}")
        # Extract ground truth from path/filename
        gt_iso = gt_from_filename(audio_path)
        print(f"  Ground Truth: {gt_iso or 'Unknown'}")
        file_analysis = {
            'file': os.path.basename(audio_path),
            'full_path': audio_path,
            'gt_iso': gt_iso or '',
            'voxlingua': {'available': False},
            'xlsr': {'available': False}
        }
        # ==========================================
        # VoxLingua107 Independent Analysis
        # ==========================================
        if voxlingua_model is not None:
            try:
                print("  VoxLingua107 Analysis:")
                out = voxlingua_model.classify_file(audio_path)
                logits, log_conf, pred_idx, labels = out
                # Get real confidence scores (not weighted)
                probs = torch.softmax(logits.squeeze(), dim=0)
                top5_indices = torch.topk(probs, min(5, len(probs))).indices
                vox_predictions = []
                for rank, idx in enumerate(top5_indices, 1):
                    lang_label = labels[idx.item()]
                    confidence = probs[idx.item()].item()
                    # Parse language code
                    if isinstance(lang_label, str):
                        colon_pos = lang_label.find(":")
                        lang_code = lang_label[:colon_pos].strip() if colon_pos != -1 else lang_label.strip()
                    else:
                        lang_code = str(lang_label)
                    # Map to dataset language
                    mapped_lang = map_to_dataset_language(lang_code)
                    vox_predictions.append({
                        'rank': rank,
                        'original': lang_code,
                        'mapped': mapped_lang,
                        'confidence': confidence,
                        'in_dataset': mapped_lang in ALL_SUPPORTED_LANGS
                    })
                    status = "✓" if mapped_lang in ALL_SUPPORTED_LANGS else "✗"
                    print(f"    #{rank}: {lang_code} -> {mapped_lang} ({confidence:.4f}) {status}")
                # Store VoxLingua results
                top1 = vox_predictions[0]
                file_analysis['voxlingua'] = {
                    'available': True,
                    'top5_predictions': vox_predictions,
                    'top1_prediction': top1['mapped'],
                    'top1_confidence': top1['confidence'],
                    'correct_top1': gt_iso == top1['mapped'] if gt_iso else None,
                    'correct_in_top5': any(p['mapped'] == gt_iso for p in vox_predictions) if gt_iso else None
                }
                results['voxlingua_detailed'].append({
                    'file': os.path.basename(audio_path),
                    'gt_iso': gt_iso or '',
                    'pred_iso': top1['mapped'],
                    'confidence': top1['confidence'],
                    'correct': gt_iso == top1['mapped'] if gt_iso else None
                })
            except Exception as e:
                print(f"  ✗ VoxLingua107 error: {e}")
                file_analysis['voxlingua'] = {'available': False, 'error': str(e)}
        # ==========================================
        # XLS-R Independent Analysis
        # ==========================================
        if xlsr_lid_model is not None:
            try:
                print("  XLS-R Analysis:")
                out = xlsr_lid_model.classify_file(audio_path)
                out_prob, score, index, text_lab = out
                # Get real confidence scores
                probs = torch.softmax(out_prob.squeeze(), dim=0)
                top5_indices = torch.topk(probs, min(5, len(probs))).indices
                xlsr_predictions = []
                for rank, idx in enumerate(top5_indices, 1):
                    lang_label = text_lab[idx.item()]
                    confidence = probs[idx.item()].item()
                    lang_code = str(lang_label).strip().lower()
                    mapped_lang = map_to_dataset_language(lang_code)
                    xlsr_predictions.append({
                        'rank': rank,
                        'original': lang_code,
                        'mapped': mapped_lang,
                        'confidence': confidence,
                        'in_dataset': mapped_lang in ALL_SUPPORTED_LANGS
                    })
                    status = "✓" if mapped_lang in ALL_SUPPORTED_LANGS else "✗"
                    print(f"    #{rank}: {lang_code} -> {mapped_lang} ({confidence:.4f}) {status}")
                # Store XLS-R results
                top1 = xlsr_predictions[0]
                file_analysis['xlsr'] = {
                    'available': True,
                    'top5_predictions': xlsr_predictions,
                    'top1_prediction': top1['mapped'],
                    'top1_confidence': top1['confidence'],
                    'correct_top1': gt_iso == top1['mapped'] if gt_iso else None,
                    'correct_in_top5': any(p['mapped'] == gt_iso for p in xlsr_predictions) if gt_iso else None
                }
                results['xlsr_detailed'].append({
                    'file': os.path.basename(audio_path),
                    'gt_iso': gt_iso or '',
                    'pred_iso': top1['mapped'],
                    'confidence': top1['confidence'],
                    'correct': gt_iso == top1['mapped'] if gt_iso else None
                })
            except Exception as e:
                print(f"  ✗ XLS-R error: {e}")
                file_analysis['xlsr'] = {'available': False, 'error': str(e)}
        results['comparison_data'].append(file_analysis)
        print("  ✓ Analysis complete\n")
    return results

# Run the independent analysis
if downloaded_files:
    print("Running independent model analysis...")
    # NOTE: this rebinds analysis_results, replacing the ensemble results from Cell 8
    analysis_results = analyze_downloaded_files_independently(downloaded_files)
else:
    print("✗ No files found for analysis")
    analysis_results = None
# ==============================================================================
# FIXED: Robust VoxLingua107 Analysis with Better Error Handling
# ==============================================================================
def parse_voxlingua_output_robust(out):
    """Robust parsing of VoxLingua107 output with multiple fallback methods"""
    try:
        # Method 1: standard SpeechBrain output format
        if isinstance(out, (tuple, list)) and len(out) >= 4:
            logits, log_conf, pred_idx, labels = out[:4]
            # Validate components
            if hasattr(logits, 'squeeze') and hasattr(labels, '__getitem__'):
                return logits, log_conf, pred_idx, labels, "standard"
        # Method 2: alternative format (sometimes returns a dict)
        if isinstance(out, dict):
            logits = out.get('predictions', out.get('logits'))
            labels = out.get('labels', out.get('text_lab'))
            log_conf = out.get('log_probabilities', out.get('log_conf'))
            pred_idx = out.get('predicted_ids', out.get('pred_idx'))
            if all(v is not None for v in [logits, labels]):
                return logits, log_conf, pred_idx, labels, "dict"
        # Method 3: direct tensor output
        if hasattr(out, 'squeeze'):  # direct logits tensor
            logits = out
            # Create dummy labels based on logits size
            labels = [f"lang_{i}" for i in range(logits.shape[-1])]
            log_conf = torch.log_softmax(logits, dim=-1).max()
            pred_idx = torch.argmax(logits, dim=-1)
            return logits, log_conf, pred_idx, labels, "tensor"
    except Exception as e:
        print(f"  Parse error: {e}")
    return None, None, None, None, "failed"

def analyze_voxlingua_robust(audio_path):
    """Robust VoxLingua107 analysis with multiple parsing methods"""
    if voxlingua_model is None:
        return None
    try:
        # Get raw output from model
        raw_out = voxlingua_model.classify_file(audio_path)
        # Parse with robust method
        logits, log_conf, pred_idx, labels, parse_method = parse_voxlingua_output_robust(raw_out)
        if logits is None:
            print("  ✗ Could not parse VoxLingua output format")
            return None
        print(f"  Parse method: {parse_method}")
        # Get predictions based on available data
        if hasattr(logits, 'squeeze'):
            probs = torch.softmax(logits.squeeze(), dim=-1 if len(logits.squeeze().shape) > 0 else 0)
            # Handle different tensor shapes
            if len(probs.shape) == 0:  # scalar
                top_indices = torch.tensor([0])
                top_probs = probs.unsqueeze(0)
            else:  # vector
                k = min(5, len(probs))
                top_probs, top_indices = torch.topk(probs, k)
        else:
            print("  ✗ Logits not in expected tensor format")
            return None
        predictions = []
        for rank, (idx, prob) in enumerate(zip(top_indices, top_probs), 1):
            idx_val = idx.item() if hasattr(idx, 'item') else int(idx)
            prob_val = prob.item() if hasattr(prob, 'item') else float(prob)
            # Get language label safely
            if idx_val < len(labels):
                lang_label = labels[idx_val]
            else:
                lang_label = f"unknown_{idx_val}"
            # Parse language code
            if isinstance(lang_label, str):
                colon_pos = lang_label.find(":")
                lang_code = lang_label[:colon_pos].strip() if colon_pos != -1 else lang_label.strip()
            else:
                lang_code = str(lang_label)
            # Map to dataset language
            mapped_lang = map_to_dataset_language(lang_code)
            predictions.append({
                'rank': rank,
                'original': lang_code,
                'mapped': mapped_lang,
                'confidence': prob_val,
                'in_dataset': mapped_lang in ALL_SUPPORTED_LANGS
            })
            status = "✓" if mapped_lang in ALL_SUPPORTED_LANGS else "✗"
            print(f"    #{rank}: {lang_code} -> {mapped_lang} ({prob_val:.4f}) {status}")
        return predictions
    except Exception as e:
        print(f"  ✗ VoxLingua analysis error: {e}")
        print(f"  ✗ Error type: {type(e).__name__}")
        return None

def analyze_xlsr_robust(audio_path):
    """Robust XLS-R analysis"""
    if xlsr_lid_model is None:
        return None
    try:
        raw_out = xlsr_lid_model.classify_file(audio_path)
        # Handle different XLS-R output formats
        if isinstance(raw_out, (tuple, list)) and len(raw_out) >= 4:
            out_prob, score, index, text_lab = raw_out[:4]
        else:
            print("  ✗ XLS-R output format not recognized")
            return None
        # Get top predictions
        if hasattr(out_prob, 'squeeze'):
            probs = torch.softmax(out_prob.squeeze(), dim=-1 if len(out_prob.squeeze().shape) > 0 else 0)
            if len(probs.shape) == 0:  # scalar
                top_indices = torch.tensor([0])
                top_probs = probs.unsqueeze(0)
            else:  # vector
                k = min(5, len(probs))
                top_probs, top_indices = torch.topk(probs, k)
        else:
            print("  ✗ XLS-R probabilities not in expected format")
            return None
        predictions = []
        for rank, (idx, prob) in enumerate(zip(top_indices, top_probs), 1):
            idx_val = idx.item() if hasattr(idx, 'item') else int(idx)
            prob_val = prob.item() if hasattr(prob, 'item') else float(prob)
            # Get language label
            if idx_val < len(text_lab):
                lang_label = text_lab[idx_val]
            else:
                lang_label = f"unknown_{idx_val}"
            lang_code = str(lang_label).strip().lower()
            mapped_lang = map_to_dataset_language(lang_code)
            predictions.append({
                'rank': rank,
                'original': lang_code,
                'mapped': mapped_lang,
                'confidence': prob_val,
                'in_dataset': mapped_lang in ALL_SUPPORTED_LANGS
            })
            status = "✓" if mapped_lang in ALL_SUPPORTED_LANGS else "✗"
            print(f"    #{rank}: {lang_code} -> {mapped_lang} ({prob_val:.4f}) {status}")
        return predictions
    except Exception as e:
        print(f"  ✗ XLS-R analysis error: {e}")
        return None

# ==============================================================================
# UPDATED: Robust Analysis Function
# ==============================================================================
def analyze_downloaded_files_robust(audio_files):
    """Robust analysis with better error handling"""
    if not audio_files:
        print("✗ No audio files to analyze")
        return None
    print(f"Starting ROBUST analysis on {len(audio_files)} files...")
    print("=" * 70)
    results = {
        'voxlingua_detailed': [],
        'xlsr_detailed': [],
        'comparison_data': []
    }
    for i, audio_path in enumerate(audio_files, 1):
        print(f"\n[{i}/{len(audio_files)}] {os.path.basename(audio_path)}")
        # Extract ground truth
        gt_iso = gt_from_filename(audio_path)
        print(f"  Ground Truth: {gt_iso or 'Unknown'}")
        file_analysis = {
            'file': os.path.basename(audio_path),
            'full_path': audio_path,
            'gt_iso': gt_iso or '',
            'voxlingua': {'available': False},
            'xlsr': {'available': False}
        }
        # VoxLingua107 analysis
        print("  VoxLingua107 Analysis:")
        vox_predictions = analyze_voxlingua_robust(audio_path)
        if vox_predictions:
            top1 = vox_predictions[0]
            file_analysis['voxlingua'] = {
                'available': True,
                'top5_predictions': vox_predictions,
                'top1_prediction': top1['mapped'],
                'top1_confidence': top1['confidence'],
                'correct_top1': gt_iso == top1['mapped'] if gt_iso else None,
                'correct_in_top5': any(p['mapped'] == gt_iso for p in vox_predictions) if gt_iso else None
            }
            results['voxlingua_detailed'].append({
                'file': os.path.basename(audio_path),
                'gt_iso': gt_iso or '',
                'pred_iso': top1['mapped'],
                'confidence': top1['confidence'],
                'correct': gt_iso == top1['mapped'] if gt_iso else None
            })
        else:
            file_analysis['voxlingua'] = {'available': False, 'error': 'Analysis failed'}
        # XLS-R analysis
        print("  XLS-R Analysis:")
        xlsr_predictions = analyze_xlsr_robust(audio_path)
        if xlsr_predictions:
            top1 = xlsr_predictions[0]
            file_analysis['xlsr'] = {
                'available': True,
                'top5_predictions': xlsr_predictions,
                'top1_prediction': top1['mapped'],
                'top1_confidence': top1['confidence'],
                'correct_top1': gt_iso == top1['mapped'] if gt_iso else None,
                'correct_in_top5': any(p['mapped'] == gt_iso for p in xlsr_predictions) if gt_iso else None
            }
            results['xlsr_detailed'].append({
                'file': os.path.basename(audio_path),
                'gt_iso': gt_iso or '',
                'pred_iso': top1['mapped'],
                'confidence': top1['confidence'],
                'correct': gt_iso == top1['mapped'] if gt_iso else None
            })
        else:
            file_analysis['xlsr'] = {'available': False, 'error': 'Analysis failed'}
        results['comparison_data'].append(file_analysis)
        print("  ✓ Analysis complete")
    return results

# Run the robust analysis
# NOTE: generate_detailed_performance_report is defined in the next cell;
# run that cell first or the report call below raises NameError.
if 'downloaded_files' in globals() and downloaded_files:
    print("Running ROBUST independent model analysis...")
    robust_analysis_results = analyze_downloaded_files_robust(downloaded_files)
    # Generate report
    if robust_analysis_results:
        generate_detailed_performance_report(robust_analysis_results)
        print("\n✓ ROBUST ANALYSIS COMPLETE!")
    else:
        print("✗ Robust analysis failed")
else:
    print("✗ No downloaded files found. Please run the file scanning code first.")
# ==============================================================================
# COMPLETE FIX: VoxLingua Label Mapping + Missing Function
# ==============================================================================
# First, a VoxLingua index-to-language mapping.
# CAUTION: this table is a hand-written approximation: it lists 108 indices
# (0-107) for a 107-class model, and at least one entry ('of' at index 100) is
# not a valid language code. The authoritative ordering lives in the model's
# label_encoder; see the sketch after get_voxlingua_language_by_index below.
VOXLINGUA_LANGUAGE_MAP = {
    0: 'ab', 1: 'af', 2: 'ak', 3: 'am', 4: 'ar', 5: 'as', 6: 'az', 7: 'be', 8: 'bg', 9: 'bn',
    10: 'bo', 11: 'br', 12: 'bs', 13: 'ca', 14: 'ce', 15: 'co', 16: 'cs', 17: 'cv', 18: 'cy', 19: 'da',
    20: 'de', 21: 'dv', 22: 'dz', 23: 'ee', 24: 'el', 25: 'en', 26: 'eo', 27: 'es', 28: 'et', 29: 'eu',
    30: 'fa', 31: 'ff', 32: 'fi', 33: 'fo', 34: 'fr', 35: 'fy', 36: 'ga', 37: 'gd', 38: 'gl', 39: 'gn',
    40: 'gu', 41: 'gv', 42: 'ha', 43: 'haw', 44: 'he', 45: 'hi', 46: 'hr', 47: 'ht', 48: 'hu', 49: 'hy',
    50: 'ia', 51: 'id', 52: 'ie', 53: 'ig', 54: 'ii', 55: 'ik', 56: 'io', 57: 'is', 58: 'it', 59: 'iu',
    60: 'ja', 61: 'jv', 62: 'ka', 63: 'kk', 64: 'kl', 65: 'km', 66: 'kn', 67: 'ko', 68: 'ks', 69: 'ku',
    70: 'kw', 71: 'ky', 72: 'la', 73: 'lb', 74: 'lg', 75: 'li', 76: 'ln', 77: 'lo', 78: 'lt', 79: 'lv',
    80: 'mg', 81: 'mi', 82: 'mk', 83: 'ml', 84: 'mn', 85: 'mr', 86: 'ms', 87: 'mt', 88: 'my', 89: 'na',
    90: 'nb', 91: 'nd', 92: 'ne', 93: 'ng', 94: 'nl', 95: 'nn', 96: 'no', 97: 'nv', 98: 'ny', 99: 'oc',
    100: 'of', 101: 'om', 102: 'or', 103: 'os', 104: 'pa', 105: 'pi', 106: 'pl', 107: 'ps'
}

def get_voxlingua_language_by_index(idx):
    """Map VoxLingua index to language code"""
    return VOXLINGUA_LANGUAGE_MAP.get(idx, f'unknown_{idx}')
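# --- Added sketch (not in the original cells): derive the index->language table
# from the checkpoint itself instead of the hand-written map above. Assumes the
# loaded model exposes hparams.label_encoder (the VoxLingua107 recipe does) and
# that labels may look like "hi: Hindi", so the part before any colon is kept.
def voxlingua_index_map_from_model(model):
    """Return {class_index: language_code} from the model's label encoder, or None."""
    try:
        ind2lab = model.hparams.label_encoder.ind2lab
        return {i: str(lab).split(":")[0].strip() for i, lab in ind2lab.items()}
    except AttributeError:
        return None

if voxlingua_model is not None:
    _learned_map = voxlingua_index_map_from_model(voxlingua_model)
    if _learned_map:
        VOXLINGUA_LANGUAGE_MAP.update(_learned_map)  # prefer checkpoint labels over the hand-written table
        print(f"Label map refreshed from checkpoint: {len(_learned_map)} classes")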
def analyze_voxlingua_fixed(audio_path):
    """Fixed VoxLingua107 analysis with proper language mapping"""
    if voxlingua_model is None:
        return None
    try:
        raw_out = voxlingua_model.classify_file(audio_path)
        if not isinstance(raw_out, (tuple, list)) or len(raw_out) < 4:
            print("  ✗ Unexpected VoxLingua output format")
            return None
        logits, log_conf, pred_idx, labels = raw_out[:4]
        # Get probabilities and top 5
        probs = torch.softmax(logits.squeeze(), dim=-1)
        k = min(5, len(probs))
        top_probs, top_indices = torch.topk(probs, k)
        predictions = []
        for rank, (idx, prob) in enumerate(zip(top_indices, top_probs), 1):
            idx_val = idx.item() if hasattr(idx, 'item') else int(idx)
            prob_val = prob.item() if hasattr(prob, 'item') else float(prob)
            # Method 1: try to use provided labels
            if idx_val < len(labels) and not str(labels[idx_val]).startswith('unknown'):
                lang_label = labels[idx_val]
                if isinstance(lang_label, str):
                    colon_pos = lang_label.find(":")
                    lang_code = lang_label[:colon_pos].strip() if colon_pos != -1 else lang_label.strip()
                else:
                    lang_code = str(lang_label)
            else:
                # Method 2: use our language mapping
                lang_code = get_voxlingua_language_by_index(idx_val)
            # Map to dataset language
            mapped_lang = map_to_dataset_language(lang_code)
            predictions.append({
                'rank': rank,
                'original': lang_code,
                'mapped': mapped_lang,
                'confidence': prob_val,
                'in_dataset': mapped_lang in ALL_SUPPORTED_LANGS,
                'index': idx_val
            })
            status = "✓" if mapped_lang in ALL_SUPPORTED_LANGS else "✗"
            print(f"    #{rank}: {lang_code} -> {mapped_lang} ({prob_val:.4f}) {status} [idx:{idx_val}]")
        return predictions
    except Exception as e:
        print(f"  ✗ VoxLingua analysis error: {e}")
        return None

def analyze_xlsr_fixed(audio_path):
    """Fixed XLS-R analysis"""
    if xlsr_lid_model is None:
        print("  ✗ XLS-R model not loaded")
        return None
    try:
        raw_out = xlsr_lid_model.classify_file(audio_path)
        if not isinstance(raw_out, (tuple, list)) or len(raw_out) < 4:
            print("  ✗ Unexpected XLS-R output format")
            return None
        out_prob, score, index, text_lab = raw_out[:4]
        # Get probabilities and top 5
        probs = torch.softmax(out_prob.squeeze(), dim=-1)
        k = min(5, len(probs))
        top_probs, top_indices = torch.topk(probs, k)
        predictions = []
        for rank, (idx, prob) in enumerate(zip(top_indices, top_probs), 1):
            idx_val = idx.item() if hasattr(idx, 'item') else int(idx)
            prob_val = prob.item() if hasattr(prob, 'item') else float(prob)
            # Get language label
            if idx_val < len(text_lab):
                lang_label = text_lab[idx_val]
                lang_code = str(lang_label).strip().lower()
            else:
                lang_code = f"xlsr_unknown_{idx_val}"
            mapped_lang = map_to_dataset_language(lang_code)
            predictions.append({
                'rank': rank,
                'original': lang_code,
                'mapped': mapped_lang,
                'confidence': prob_val,
                'in_dataset': mapped_lang in ALL_SUPPORTED_LANGS
            })
            status = "✓" if mapped_lang in ALL_SUPPORTED_LANGS else "✗"
            print(f"    #{rank}: {lang_code} -> {mapped_lang} ({prob_val:.4f}) {status}")
        return predictions
    except Exception as e:
        print(f"  ✗ XLS-R analysis error: {e}")
        return None
| def generate_detailed_performance_report(results): | |
| """Complete performance analysis report function""" | |
| if not results: | |
| print("β No results to analyze") | |
| return | |
| print("\nπ DETAILED INDEPENDENT MODEL PERFORMANCE REPORT") | |
| print("=" * 70) | |
| # VoxLingua107 Performance Analysis | |
| if results['voxlingua_detailed']: | |
| vox_df = pd.DataFrame(results['voxlingua_detailed']) | |
| valid_vox = vox_df[vox_df['gt_iso'] != ''].copy() | |
| print(f"\n㪠VOXLINGUA107 PERFORMANCE:") | |
| print("-" * 40) | |
| if len(valid_vox) > 0: | |
| vox_acc = (valid_vox['correct'] == True).mean() | |
| vox_conf_mean = valid_vox['confidence'].mean() | |
| vox_conf_std = valid_vox['confidence'].std() | |
| print(f"Files Analyzed: {len(valid_vox)}") | |
| print(f"Top-1 Accuracy: {vox_acc:.4f} ({vox_acc*100:.1f}%)") | |
| print(f"Confidence: {vox_conf_mean:.4f} Β± {vox_conf_std:.4f}") | |
| # Per-language breakdown | |
| print(f"\nPer-Language Performance:") | |
| for lang in sorted(valid_vox['gt_iso'].unique()): | |
| lang_data = valid_vox[valid_vox['gt_iso'] == lang] | |
| acc = (lang_data['correct'] == True).mean() | |
| conf_mean = lang_data['confidence'].mean() | |
| count = len(lang_data) | |
| print(f" {lang:>3}: {acc:.3f} ({acc*100:5.1f}%) | Conf: {conf_mean:.3f} | n={count}") | |
| else: | |
| print("No valid VoxLingua results") | |
| # XLS-R Performance Analysis | |
| if results['xlsr_detailed']: | |
| xlsr_df = pd.DataFrame(results['xlsr_detailed']) | |
| valid_xlsr = xlsr_df[xlsr_df['gt_iso'] != ''].copy() | |
| print(f"\n㪠XLS-R PERFORMANCE:") | |
| print("-" * 40) | |
| if len(valid_xlsr) > 0: | |
| xlsr_acc = (valid_xlsr['correct'] == True).mean() | |
| xlsr_conf_mean = valid_xlsr['confidence'].mean() | |
| xlsr_conf_std = valid_xlsr['confidence'].std() | |
| print(f"Files Analyzed: {len(valid_xlsr)}") | |
| print(f"Top-1 Accuracy: {xlsr_acc:.4f} ({xlsr_acc*100:.1f}%)") | |
| print(f"Confidence: {xlsr_conf_mean:.4f} Β± {xlsr_conf_std:.4f}") | |
| # Per-language breakdown | |
| print(f"\nPer-Language Performance:") | |
| for lang in sorted(valid_xlsr['gt_iso'].unique()): | |
| lang_data = valid_xlsr[valid_xlsr['gt_iso'] == lang] | |
| acc = (lang_data['correct'] == True).mean() | |
| conf_mean = lang_data['confidence'].mean() | |
| count = len(lang_data) | |
| print(f" {lang:>3}: {acc:.3f} ({acc*100:5.1f}%) | Conf: {conf_mean:.3f} | n={count}") | |
| else: | |
| print("No valid XLS-R results") | |
| # Model Comparison (guarded: vox_acc / xlsr_acc stay None when a model | |
| # produced no scorable results, which would otherwise raise a NameError) | |
| if vox_acc is not None and xlsr_acc is not None: | |
| print(f"\n⚖️ MODEL COMPARISON:") | |
| print("-" * 30) | |
| print(f"VoxLingua107: {vox_acc:.4f} accuracy") | |
| print(f"XLS-R: {xlsr_acc:.4f} accuracy") | |
| # Weight each model in proportion to its accuracy | |
| total_acc = vox_acc + xlsr_acc | |
| if total_acc > 0: | |
| vox_weight = vox_acc / total_acc | |
| xlsr_weight = xlsr_acc / total_acc | |
| print(f"\n💡 RECOMMENDED WEIGHTS:") | |
| print(f"VoxLingua107: {vox_weight:.3f} ({vox_weight*100:.1f}%)") | |
| print(f"XLS-R: {xlsr_weight:.3f} ({xlsr_weight*100:.1f}%)") | |
| # Label-set overlap plus true per-file agreement | |
| vox_preds = set(vox_df['pred_iso'].tolist()) | |
| xlsr_preds = set(xlsr_df['pred_iso'].tolist()) | |
| common_preds = vox_preds.intersection(xlsr_preds) | |
| merged = vox_df.merge(xlsr_df, on='file', suffixes=('_vox', '_xlsr')) | |
| agreement = (merged['pred_iso_vox'] == merged['pred_iso_xlsr']).mean() if len(merged) > 0 else 0.0 | |
| print(f"\nModel Agreement Analysis:") | |
| print(f"Per-file agreement: {agreement:.3f} ({agreement*100:.1f}%)") | |
| print(f"Labels predicted by both models: {len(common_preds)}") | |
| print(f"VoxLingua-only labels: {len(vox_preds - xlsr_preds)}") | |
| print(f"XLS-R-only labels: {len(xlsr_preds - vox_preds)}") | |
| # Save results | |
| timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S") | |
| if results['voxlingua_detailed']: | |
| vox_csv = f"voxlingua_fixed_results_{timestamp}.csv" | |
| pd.DataFrame(results['voxlingua_detailed']).to_csv(vox_csv, index=False) | |
| print(f"\nπΎ VoxLingua results: {vox_csv}") | |
| if results['xlsr_detailed']: | |
| xlsr_csv = f"xlsr_fixed_results_{timestamp}.csv" | |
| pd.DataFrame(results['xlsr_detailed']).to_csv(xlsr_csv, index=False) | |
| print(f"πΎ XLS-R results: {xlsr_csv}") | |
| def run_complete_fixed_analysis(audio_files): | |
| """Run complete analysis with all fixes""" | |
| if not audio_files: | |
| print("β No audio files to analyze") | |
| return None | |
| print(f"π Starting COMPLETE FIXED analysis on {len(audio_files)} files...") | |
| print("=" * 70) | |
| results = { | |
| 'voxlingua_detailed': [], | |
| 'xlsr_detailed': [], | |
| 'comparison_data': [] | |
| } | |
| for i, audio_path in enumerate(audio_files, 1): | |
| print(f"\n[{i}/{len(audio_files)}] π΅ {os.path.basename(audio_path)}") | |
| # Extract ground truth | |
| gt_iso = gt_from_filename(audio_path) | |
| print(f" π Ground Truth: {gt_iso or 'Unknown'}") | |
| file_analysis = { | |
| 'file': os.path.basename(audio_path), | |
| 'full_path': audio_path, | |
| 'gt_iso': gt_iso or '', | |
| 'voxlingua': {'available': False}, | |
| 'xlsr': {'available': False} | |
| } | |
| # VoxLingua107 Analysis | |
| print(f" π¬ VoxLingua107 Analysis:") | |
| vox_predictions = analyze_voxlingua_fixed(audio_path) | |
| if vox_predictions and len(vox_predictions) > 0: | |
| top1 = vox_predictions[0] | |
| file_analysis['voxlingua'] = { | |
| 'available': True, | |
| 'top5_predictions': vox_predictions, | |
| 'top1_prediction': top1['mapped'], | |
| 'top1_confidence': top1['confidence'], | |
| 'correct_top1': gt_iso == top1['mapped'] if gt_iso else None, | |
| } | |
| results['voxlingua_detailed'].append({ | |
| 'file': os.path.basename(audio_path), | |
| 'gt_iso': gt_iso or '', | |
| 'pred_iso': top1['mapped'], | |
| 'confidence': top1['confidence'], | |
| 'correct': gt_iso == top1['mapped'] if gt_iso else None | |
| }) | |
| # XLS-R Analysis | |
| print(f" π¬ XLS-R Analysis:") | |
| xlsr_predictions = analyze_xlsr_fixed(audio_path) | |
| if xlsr_predictions and len(xlsr_predictions) > 0: | |
| top1 = xlsr_predictions[0] | |
| file_analysis['xlsr'] = { | |
| 'available': True, | |
| 'top5_predictions': xlsr_predictions, | |
| 'top1_prediction': top1['mapped'], | |
| 'top1_confidence': top1['confidence'], | |
| 'correct_top1': gt_iso == top1['mapped'] if gt_iso else None, | |
| } | |
| results['xlsr_detailed'].append({ | |
| 'file': os.path.basename(audio_path), | |
| 'gt_iso': gt_iso or '', | |
| 'pred_iso': top1['mapped'], | |
| 'confidence': top1['confidence'], | |
| 'correct': gt_iso == top1['mapped'] if gt_iso else None | |
| }) | |
| results['comparison_data'].append(file_analysis) | |
| print(f" β Analysis complete") | |
| return results | |
| # Run the complete fixed analysis | |
| if 'downloaded_files' in globals() and downloaded_files: | |
| print("π¬ Running COMPLETE FIXED analysis...") | |
| final_analysis_results = run_complete_fixed_analysis(downloaded_files) | |
| if final_analysis_results: | |
| generate_detailed_performance_report(final_analysis_results) | |
| print(f"\nβ COMPLETE FIXED ANALYSIS DONE!") | |
| else: | |
| print("β Analysis failed") | |
| else: | |
| print("β No downloaded files found") | |
| # ============================================================================== | |
| # COMPREHENSIVE EXCEL ANALYSIS WITH ALL DETAILS | |
| # ============================================================================== | |
| import pandas as pd | |
| import numpy as np | |
| from datetime import datetime | |
| import os | |
| def create_comprehensive_excel_analysis(results, output_filename=None): | |
| """Create comprehensive Excel analysis with multiple sheets and detailed metrics""" | |
| if not results: | |
| print("β No results to analyze") | |
| return None | |
| # Generate filename if not provided | |
| if not output_filename: | |
| timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") | |
| output_filename = f"Language_Detection_Comprehensive_Analysis_{timestamp}.xlsx" | |
| print(f"π Creating comprehensive Excel analysis: {output_filename}") | |
| # Create Excel writer | |
| with pd.ExcelWriter(output_filename, engine='openpyxl') as writer: | |
| # ======================================== | |
| # SHEET 1: EXECUTIVE SUMMARY | |
| # ======================================== | |
| print(" π Creating Executive Summary...") | |
| summary_data = [] | |
| # Overall statistics | |
| total_files = len(results['comparison_data']) | |
| vox_available = sum(1 for item in results['comparison_data'] if item['voxlingua']['available']) | |
| xlsr_available = sum(1 for item in results['comparison_data'] if item['xlsr']['available']) | |
| summary_data.extend([ | |
| ['EXECUTIVE SUMMARY', ''], | |
| ['Analysis Date', datetime.now().strftime("%Y-%m-%d %H:%M:%S")], | |
| ['Total Files Analyzed', total_files], | |
| ['VoxLingua107 Available', f"{vox_available} ({vox_available/max(total_files, 1)*100:.1f}%)"], | |
| ['XLS-R Available', f"{xlsr_available} ({xlsr_available/max(total_files, 1)*100:.1f}%)"], | |
| ['', ''], | |
| ]) | |
| # Model performance summary | |
| if results['voxlingua_detailed']: | |
| vox_df = pd.DataFrame(results['voxlingua_detailed']) | |
| valid_vox = vox_df[vox_df['gt_iso'] != ''].copy() | |
| if len(valid_vox) > 0: | |
| vox_acc = (valid_vox['correct'] == True).mean() | |
| vox_conf = valid_vox['confidence'].mean() | |
| summary_data.extend([ | |
| ['VOXLINGUA107 PERFORMANCE', ''], | |
| ['Accuracy', f"{vox_acc:.4f} ({vox_acc*100:.1f}%)"], | |
| ['Average Confidence', f"{vox_conf:.4f}"], | |
| ['Files with Valid GT', len(valid_vox)], | |
| ['', ''], | |
| ]) | |
| if results['xlsr_detailed']: | |
| xlsr_df = pd.DataFrame(results['xlsr_detailed']) | |
| valid_xlsr = xlsr_df[xlsr_df['gt_iso'] != ''].copy() | |
| if len(valid_xlsr) > 0: | |
| xlsr_acc = (valid_xlsr['correct'] == True).mean() | |
| xlsr_conf = valid_xlsr['confidence'].mean() | |
| summary_data.extend([ | |
| ['XLS-R PERFORMANCE', ''], | |
| ['Accuracy', f"{xlsr_acc:.4f} ({xlsr_acc*100:.1f}%)"], | |
| ['Average Confidence', f"{xlsr_conf:.4f}"], | |
| ['Files with Valid GT', len(valid_xlsr)], | |
| ['', ''], | |
| ]) | |
| # Optimal weights calculation (needs accuracies from both models, so | |
| # guard on both result lists instead of VoxLingua alone) | |
| if results['voxlingua_detailed'] and results['xlsr_detailed'] and len(valid_vox) > 0 and len(valid_xlsr) > 0: | |
| total_acc = vox_acc + xlsr_acc | |
| if total_acc > 0: | |
| vox_weight = vox_acc / total_acc | |
| xlsr_weight = xlsr_acc / total_acc | |
| summary_data.extend([ | |
| ['RECOMMENDED ENSEMBLE WEIGHTS', ''], | |
| ['VoxLingua107 Weight', f"{vox_weight:.3f} ({vox_weight*100:.1f}%)"], | |
| ['XLS-R Weight', f"{xlsr_weight:.3f} ({xlsr_weight*100:.1f}%)"], | |
| ]) | |
| # Create summary dataframe | |
| summary_df = pd.DataFrame(summary_data, columns=['Metric', 'Value']) | |
| summary_df.to_excel(writer, sheet_name='Executive_Summary', index=False) | |
| # ======================================== | |
| # SHEET 2: VOXLINGUA107 DETAILED RESULTS | |
| # ======================================== | |
| # get_language_family is used by several later sheets too, so define it | |
| # before this conditional block rather than inside it | |
| def get_language_family(lang): | |
| if lang in INDO_ARYAN_LANGS: | |
| return 'Indo-Aryan' | |
| elif lang in DRAVIDIAN_LANGS: | |
| return 'Dravidian' | |
| elif lang in LOW_RESOURCE_LANGS: | |
| return 'Low-Resource' | |
| else: | |
| return 'Other' | |
| if results['voxlingua_detailed']: | |
| print(" 📊 Creating VoxLingua107 detailed results...") | |
| vox_detailed_df = pd.DataFrame(results['voxlingua_detailed']) | |
| # Add analysis columns ('correct' can be None when ground truth is | |
| # missing, so treat those rows as incorrect before casting to int) | |
| vox_detailed_df['accuracy_score'] = vox_detailed_df['correct'].fillna(False).astype(int) | |
| vox_detailed_df['confidence_category'] = pd.cut( | |
| vox_detailed_df['confidence'], | |
| bins=[0, 0.3, 0.6, 0.8, 1.0], | |
| labels=['Low', 'Medium', 'High', 'Very High'] | |
| ) | |
| # Add language family information | |
| vox_detailed_df['gt_language_family'] = vox_detailed_df['gt_iso'].apply(get_language_family) | |
| vox_detailed_df['pred_language_family'] = vox_detailed_df['pred_iso'].apply(get_language_family) | |
| vox_detailed_df.to_excel(writer, sheet_name='VoxLingua107_Results', index=False) | |
| # ======================================== | |
| # SHEET 3: XLS-R DETAILED RESULTS | |
| # ======================================== | |
| if results['xlsr_detailed']: | |
| print(" π Creating XLS-R detailed results...") | |
| xlsr_detailed_df = pd.DataFrame(results['xlsr_detailed']) | |
| # Add analysis columns | |
| xlsr_detailed_df['accuracy_score'] = xlsr_detailed_df['correct'].astype(int) | |
| xlsr_detailed_df['confidence_category'] = pd.cut( | |
| xlsr_detailed_df['confidence'], | |
| bins=[0, 0.3, 0.6, 0.8, 1.0], | |
| labels=['Low', 'Medium', 'High', 'Very High'] | |
| ) | |
| xlsr_detailed_df['gt_language_family'] = xlsr_detailed_df['gt_iso'].apply(get_language_family) | |
| xlsr_detailed_df['pred_language_family'] = xlsr_detailed_df['pred_iso'].apply(get_language_family) | |
| xlsr_detailed_df.to_excel(writer, sheet_name='XLSR_Results', index=False) | |
| # ======================================== | |
| # SHEET 4: PER-LANGUAGE ACCURACY ANALYSIS | |
| # ======================================== | |
| print(" π Creating per-language accuracy analysis...") | |
| lang_analysis_data = [] | |
| # Get all unique languages from ground truth | |
| all_gt_langs = set() | |
| if results['voxlingua_detailed']: | |
| all_gt_langs.update([r['gt_iso'] for r in results['voxlingua_detailed'] if r['gt_iso']]) | |
| if results['xlsr_detailed']: | |
| all_gt_langs.update([r['gt_iso'] for r in results['xlsr_detailed'] if r['gt_iso']]) | |
| # Language name mapping | |
| LANG_NAMES = { | |
| 'ur': 'Urdu', 'pa': 'Punjabi', 'ta': 'Tamil', 'sd': 'Sindhi', 'or': 'Odia', | |
| 'ml': 'Malayalam', 'ne': 'Nepali', 'as': 'Assamese', 'hi': 'Hindi', 'bn': 'Bengali', | |
| 'kok': 'Konkani', 'kn': 'Kannada', 'ks': 'Kashmiri', 'mr': 'Marathi', 'te': 'Telugu', | |
| 'br': 'Bodo', 'doi': 'Dogri', 'sat': 'Santali', 'gu': 'Gujarati', 'mni': 'Manipuri', | |
| 'sa': 'Sanskrit' | |
| } | |
| for lang in sorted(all_gt_langs): | |
| lang_name = LANG_NAMES.get(lang, lang.title()) | |
| lang_family = get_language_family(lang) | |
| # VoxLingua stats for this language | |
| vox_stats = {'files': 0, 'correct': 0, 'accuracy': 0, 'avg_confidence': 0} | |
| if results['voxlingua_detailed']: | |
| vox_lang_data = [r for r in results['voxlingua_detailed'] if r['gt_iso'] == lang] | |
| if vox_lang_data: | |
| vox_stats['files'] = len(vox_lang_data) | |
| vox_stats['correct'] = sum(1 for r in vox_lang_data if r['correct']) | |
| vox_stats['accuracy'] = vox_stats['correct'] / vox_stats['files'] | |
| vox_stats['avg_confidence'] = np.mean([r['confidence'] for r in vox_lang_data]) | |
| # XLS-R stats for this language | |
| xlsr_stats = {'files': 0, 'correct': 0, 'accuracy': 0, 'avg_confidence': 0} | |
| if results['xlsr_detailed']: | |
| xlsr_lang_data = [r for r in results['xlsr_detailed'] if r['gt_iso'] == lang] | |
| if xlsr_lang_data: | |
| xlsr_stats['files'] = len(xlsr_lang_data) | |
| xlsr_stats['correct'] = sum(1 for r in xlsr_lang_data if r['correct']) | |
| xlsr_stats['accuracy'] = xlsr_stats['correct'] / xlsr_stats['files'] | |
| xlsr_stats['avg_confidence'] = np.mean([r['confidence'] for r in xlsr_lang_data]) | |
| lang_analysis_data.append({ | |
| 'Language_Code': lang, | |
| 'Language_Name': lang_name, | |
| 'Language_Family': lang_family, | |
| 'VoxLingua_Files': vox_stats['files'], | |
| 'VoxLingua_Correct': vox_stats['correct'], | |
| 'VoxLingua_Accuracy': f"{vox_stats['accuracy']:.4f}", | |
| 'VoxLingua_Accuracy_Pct': f"{vox_stats['accuracy']*100:.1f}%", | |
| 'VoxLingua_Avg_Confidence': f"{vox_stats['avg_confidence']:.4f}", | |
| 'XLSR_Files': xlsr_stats['files'], | |
| 'XLSR_Correct': xlsr_stats['correct'], | |
| 'XLSR_Accuracy': f"{xlsr_stats['accuracy']:.4f}", | |
| 'XLSR_Accuracy_Pct': f"{xlsr_stats['accuracy']*100:.1f}%", | |
| 'XLSR_Avg_Confidence': f"{xlsr_stats['avg_confidence']:.4f}", | |
| 'Better_Model': 'VoxLingua' if vox_stats['accuracy'] > xlsr_stats['accuracy'] else 'XLS-R' if xlsr_stats['accuracy'] > vox_stats['accuracy'] else 'Tie' | |
| }) | |
| lang_analysis_df = pd.DataFrame(lang_analysis_data) | |
| lang_analysis_df.to_excel(writer, sheet_name='Per_Language_Analysis', index=False) | |
| # ======================================== | |
| # SHEET 5: CONFUSION MATRIX - VOXLINGUA | |
| # ======================================== | |
| if results['voxlingua_detailed']: | |
| print(" π Creating VoxLingua confusion matrix...") | |
| vox_df = pd.DataFrame(results['voxlingua_detailed']) | |
| valid_vox = vox_df[vox_df['gt_iso'] != ''].copy() | |
| if len(valid_vox) > 0: | |
| # Create confusion matrix | |
| confusion_data = [] | |
| for gt_lang in sorted(valid_vox['gt_iso'].unique()): | |
| gt_data = valid_vox[valid_vox['gt_iso'] == gt_lang] | |
| row_data = {'Ground_Truth': gt_lang} | |
| for pred_lang in sorted(valid_vox['pred_iso'].unique()): | |
| count = len(gt_data[gt_data['pred_iso'] == pred_lang]) | |
| row_data[f'Predicted_{pred_lang}'] = count | |
| confusion_data.append(row_data) | |
| confusion_df = pd.DataFrame(confusion_data).fillna(0) | |
| confusion_df.to_excel(writer, sheet_name='VoxLingua_Confusion_Matrix', index=False) | |
| # ======================================== | |
| # SHEET 6: CONFUSION MATRIX - XLS-R | |
| # ======================================== | |
| if results['xlsr_detailed']: | |
| print(" π Creating XLS-R confusion matrix...") | |
| xlsr_df = pd.DataFrame(results['xlsr_detailed']) | |
| valid_xlsr = xlsr_df[xlsr_df['gt_iso'] != ''].copy() | |
| if len(valid_xlsr) > 0: | |
| confusion_data = [] | |
| for gt_lang in sorted(valid_xlsr['gt_iso'].unique()): | |
| gt_data = valid_xlsr[valid_xlsr['gt_iso'] == gt_lang] | |
| row_data = {'Ground_Truth': gt_lang} | |
| for pred_lang in sorted(valid_xlsr['pred_iso'].unique()): | |
| count = len(gt_data[gt_data['pred_iso'] == pred_lang]) | |
| row_data[f'Predicted_{pred_lang}'] = count | |
| confusion_data.append(row_data) | |
| confusion_df = pd.DataFrame(confusion_data).fillna(0) | |
| confusion_df.to_excel(writer, sheet_name='XLSR_Confusion_Matrix', index=False) | |
| # ======================================== | |
| # SHEET 7: CONFIDENCE ANALYSIS | |
| # ======================================== | |
| print(" π Creating confidence analysis...") | |
| confidence_analysis = [] | |
| # VoxLingua confidence analysis | |
| if results['voxlingua_detailed']: | |
| vox_df = pd.DataFrame(results['voxlingua_detailed']) | |
| valid_vox = vox_df[vox_df['gt_iso'] != ''].copy() | |
| if len(valid_vox) > 0: | |
| # upper bound 1.01 keeps a confidence of exactly 1.0 in the top bucket | |
| # (the :.1f label below still renders it as 1.0) | |
| for conf_range in [(0, 0.3), (0.3, 0.6), (0.6, 0.8), (0.8, 1.01)]: | |
| range_data = valid_vox[ | |
| (valid_vox['confidence'] >= conf_range[0]) & | |
| (valid_vox['confidence'] < conf_range[1]) | |
| ] | |
| if len(range_data) > 0: | |
| accuracy = (range_data['correct'] == True).mean() | |
| confidence_analysis.append({ | |
| 'Model': 'VoxLingua107', | |
| 'Confidence_Range': f"{conf_range[0]:.1f}-{conf_range[1]:.1f}", | |
| 'Files': len(range_data), | |
| 'Accuracy': f"{accuracy:.4f}", | |
| 'Accuracy_Pct': f"{accuracy*100:.1f}%", | |
| 'Avg_Confidence': f"{range_data['confidence'].mean():.4f}" | |
| }) | |
| # XLS-R confidence analysis | |
| if results['xlsr_detailed']: | |
| xlsr_df = pd.DataFrame(results['xlsr_detailed']) | |
| valid_xlsr = xlsr_df[xlsr_df['gt_iso'] != ''].copy() | |
| if len(valid_xlsr) > 0: | |
| # same inclusive top bucket as the VoxLingua loop above | |
| for conf_range in [(0, 0.3), (0.3, 0.6), (0.6, 0.8), (0.8, 1.01)]: | |
| range_data = valid_xlsr[ | |
| (valid_xlsr['confidence'] >= conf_range[0]) & | |
| (valid_xlsr['confidence'] < conf_range[1]) | |
| ] | |
| if len(range_data) > 0: | |
| accuracy = (range_data['correct'] == True).mean() | |
| confidence_analysis.append({ | |
| 'Model': 'XLS-R', | |
| 'Confidence_Range': f"{conf_range[0]:.1f}-{conf_range[1]:.1f}", | |
| 'Files': len(range_data), | |
| 'Accuracy': f"{accuracy:.4f}", | |
| 'Accuracy_Pct': f"{accuracy*100:.1f}%", | |
| 'Avg_Confidence': f"{range_data['confidence'].mean():.4f}" | |
| }) | |
| confidence_df = pd.DataFrame(confidence_analysis) | |
| confidence_df.to_excel(writer, sheet_name='Confidence_Analysis', index=False) | |
| # ======================================== | |
| # SHEET 8: ERROR ANALYSIS | |
| # ======================================== | |
| print(" π Creating error analysis...") | |
| error_analysis = [] | |
| # VoxLingua errors | |
| if results['voxlingua_detailed']: | |
| vox_df = pd.DataFrame(results['voxlingua_detailed']) | |
| vox_errors = vox_df[vox_df['correct'] == False].copy() | |
| for _, error in vox_errors.iterrows(): | |
| error_analysis.append({ | |
| 'Model': 'VoxLingua107', | |
| 'File': error['file'], | |
| 'Ground_Truth': error['gt_iso'], | |
| 'Predicted': error['pred_iso'], | |
| 'Confidence': f"{error['confidence']:.4f}", | |
| 'GT_Language_Family': get_language_family(error['gt_iso']), | |
| 'Pred_Language_Family': get_language_family(error['pred_iso']), | |
| 'Cross_Family_Error': get_language_family(error['gt_iso']) != get_language_family(error['pred_iso']) | |
| }) | |
| # XLS-R errors | |
| if results['xlsr_detailed']: | |
| xlsr_df = pd.DataFrame(results['xlsr_detailed']) | |
| xlsr_errors = xlsr_df[xlsr_df['correct'] == False].copy() | |
| for _, error in xlsr_errors.iterrows(): | |
| error_analysis.append({ | |
| 'Model': 'XLS-R', | |
| 'File': error['file'], | |
| 'Ground_Truth': error['gt_iso'], | |
| 'Predicted': error['pred_iso'], | |
| 'Confidence': f"{error['confidence']:.4f}", | |
| 'GT_Language_Family': get_language_family(error['gt_iso']), | |
| 'Pred_Language_Family': get_language_family(error['pred_iso']), | |
| 'Cross_Family_Error': get_language_family(error['gt_iso']) != get_language_family(error['pred_iso']) | |
| }) | |
| error_df = pd.DataFrame(error_analysis) | |
| error_df.to_excel(writer, sheet_name='Error_Analysis', index=False) | |
| # ======================================== | |
| # SHEET 9: LANGUAGE FAMILY PERFORMANCE | |
| # ======================================== | |
| print(" π Creating language family performance...") | |
| family_performance = [] | |
| families = ['Indo-Aryan', 'Dravidian', 'Low-Resource', 'Other'] | |
| for family in families: | |
| # VoxLingua performance for this family | |
| if results['voxlingua_detailed']: | |
| vox_df = pd.DataFrame(results['voxlingua_detailed']) | |
| vox_df = vox_df[vox_df['gt_iso'] != ''] # rows without ground truth would otherwise land in 'Other' | |
| family_data = vox_df[vox_df['gt_iso'].apply(lambda x: get_language_family(x) == family)] | |
| if len(family_data) > 0: | |
| vox_acc = (family_data['correct'] == True).mean() | |
| vox_conf = family_data['confidence'].mean() | |
| vox_files = len(family_data) | |
| else: | |
| vox_acc = vox_conf = vox_files = 0 | |
| else: | |
| vox_acc = vox_conf = vox_files = 0 | |
| # XLS-R performance for this family | |
| if results['xlsr_detailed']: | |
| xlsr_df = pd.DataFrame(results['xlsr_detailed']) | |
| xlsr_df = xlsr_df[xlsr_df['gt_iso'] != ''] # same ground-truth filter as above | |
| family_data = xlsr_df[xlsr_df['gt_iso'].apply(lambda x: get_language_family(x) == family)] | |
| if len(family_data) > 0: | |
| xlsr_acc = (family_data['correct'] == True).mean() | |
| xlsr_conf = family_data['confidence'].mean() | |
| xlsr_files = len(family_data) | |
| else: | |
| xlsr_acc = xlsr_conf = xlsr_files = 0 | |
| else: | |
| xlsr_acc = xlsr_conf = xlsr_files = 0 | |
| family_performance.append({ | |
| 'Language_Family': family, | |
| 'VoxLingua_Files': vox_files, | |
| 'VoxLingua_Accuracy': f"{vox_acc:.4f}", | |
| 'VoxLingua_Accuracy_Pct': f"{vox_acc*100:.1f}%", | |
| 'VoxLingua_Avg_Confidence': f"{vox_conf:.4f}", | |
| 'XLSR_Files': xlsr_files, | |
| 'XLSR_Accuracy': f"{xlsr_acc:.4f}", | |
| 'XLSR_Accuracy_Pct': f"{xlsr_acc*100:.1f}%", | |
| 'XLSR_Avg_Confidence': f"{xlsr_conf:.4f}", | |
| 'Better_Model': 'VoxLingua' if vox_acc > xlsr_acc else 'XLS-R' if xlsr_acc > vox_acc else 'Tie' | |
| }) | |
| family_df = pd.DataFrame(family_performance) | |
| family_df.to_excel(writer, sheet_name='Language_Family_Performance', index=False) | |
| # ======================================== | |
| # SHEET 10: TOP-5 PREDICTIONS (SAMPLE) | |
| # ======================================== | |
| print(" π Creating Top-5 predictions sample...") | |
| top5_sample = [] | |
| # Sample top-5 predictions from comparison data | |
| sample_files = results['comparison_data'][:20] # First 20 files as sample | |
| for file_data in sample_files: | |
| file_name = file_data['file'] | |
| gt_lang = file_data['gt_iso'] | |
| # VoxLingua Top-5 | |
| if file_data['voxlingua']['available'] and 'top5_predictions' in file_data['voxlingua']: | |
| for pred in file_data['voxlingua']['top5_predictions']: | |
| top5_sample.append({ | |
| 'Model': 'VoxLingua107', | |
| 'File': file_name, | |
| 'Ground_Truth': gt_lang, | |
| 'Rank': pred['rank'], | |
| 'Predicted_Language': pred['mapped'], | |
| 'Original_Output': pred['original'], | |
| 'Confidence': f"{pred['confidence']:.4f}", | |
| 'In_Dataset': pred['in_dataset'], | |
| 'Correct': gt_lang == pred['mapped'] | |
| }) | |
| # XLS-R Top-5 | |
| if file_data['xlsr']['available'] and 'top5_predictions' in file_data['xlsr']: | |
| for pred in file_data['xlsr']['top5_predictions']: | |
| top5_sample.append({ | |
| 'Model': 'XLS-R', | |
| 'File': file_name, | |
| 'Ground_Truth': gt_lang, | |
| 'Rank': pred['rank'], | |
| 'Predicted_Language': pred['mapped'], | |
| 'Original_Output': pred['original'], | |
| 'Confidence': f"{pred['confidence']:.4f}", | |
| 'In_Dataset': pred['in_dataset'], | |
| 'Correct': gt_lang == pred['mapped'] | |
| }) | |
| top5_df = pd.DataFrame(top5_sample) | |
| top5_df.to_excel(writer, sheet_name='Top5_Predictions_Sample', index=False) | |
| print(f"β Comprehensive Excel analysis created: {output_filename}") | |
| # Try to download the file | |
| try: | |
| from google.colab import files | |
| print(f"π₯ File downloaded successfully!") | |
| except: | |
| print(f"π File saved locally: {output_filename}") | |
| return output_filename | |
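| # ------------------------------------------------------------------ | |
| # Optional cosmetic pass (a sketch, not part of the pipeline): widen | |
| # each column of the generated workbook to fit its longest cell value, | |
| # using openpyxl directly. Call it with the filename returned above. | |
| # ------------------------------------------------------------------ | |
| def autofit_excel_columns(xlsx_path, max_width=60): | |
| from openpyxl import load_workbook | |
| from openpyxl.utils import get_column_letter | |
| wb = load_workbook(xlsx_path) | |
| for ws in wb.worksheets: | |
| for col_idx, column_cells in enumerate(ws.columns, 1): | |
| longest = max((len(str(c.value)) for c in column_cells if c.value is not None), default=0) | |
| ws.column_dimensions[get_column_letter(col_idx)].width = min(longest + 2, max_width) | |
| wb.save(xlsx_path) | |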
| # Run the comprehensive Excel analysis | |
| if 'final_analysis_results' in globals() and final_analysis_results: | |
| excel_filename = create_comprehensive_excel_analysis( | |
| final_analysis_results, | |
| "Language_Detection_Comprehensive_Analysis.xlsx" | |
| ) | |
| print(f"\nπ COMPREHENSIVE EXCEL ANALYSIS COMPLETE!") | |
| print(f"π File: {excel_filename}") | |
| # Print summary of what was created | |
| print(f"\nπ Excel Contains 10 Sheets:") | |
| print(f" 1. Executive_Summary - Key metrics and recommendations") | |
| print(f" 2. VoxLingua107_Results - Detailed VoxLingua results") | |
| print(f" 3. XLSR_Results - Detailed XLS-R results") | |
| print(f" 4. Per_Language_Analysis - Accuracy by language") | |
| print(f" 5. VoxLingua_Confusion_Matrix - VoxLingua confusion matrix") | |
| print(f" 6. XLSR_Confusion_Matrix - XLS-R confusion matrix") | |
| print(f" 7. Confidence_Analysis - Performance by confidence ranges") | |
| print(f" 8. Error_Analysis - Detailed error breakdown") | |
| print(f" 9. Language_Family_Performance - Performance by language family") | |
| print(f" 10. Top5_Predictions_Sample - Sample of top-5 predictions") | |
| else: | |
| print("β No analysis results found. Please run the analysis first.") | |