Update app.py
Browse files
app.py
CHANGED
|
@@ -1,136 +1,130 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
import torch
|
| 3 |
import torchaudio
|
| 4 |
import gradio as gr
|
| 5 |
import spaces
|
| 6 |
-
from transformers import AutoModel
|
| 7 |
import re
|
| 8 |
|
| 9 |
-
DESCRIPTION = "IndicConformer
|
| 10 |
-
|
| 11 |
-
# --- Data Dictionaries ---
|
| 12 |
-
|
| 13 |
-
# Dictionary for character sets, now with improved formatting for readability.
|
| 14 |
-
LANGUAGE_CHARSETS = {
|
| 15 |
-
"as": set(['অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ', 'ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 'ঞ', 'ট', 'ঠ', 'ড', 'ঢ', 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন', 'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'ৰ', 'ল', 'ৱ', 'শ', 'ষ', 'স', 'হ', 'ৎ', 'ং', 'ঃ', 'ঽ', 'া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ', '্']),
|
| 16 |
-
"bn": set(['অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ', 'ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 'ঞ', 'ট', 'ঠ', 'ড', 'ঢ', 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন', 'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'র', 'ল', 'শ', 'ষ', 'স', 'হ', 'ৎ', 'ং', 'ঃ', 'ঽ', 'া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ', '্', 'ড়', 'ঢ়', 'য়']),
|
| 17 |
-
"br": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', '़', 'ा', 'ि', 'ी', 'ु', 'ূ', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
|
| 18 |
-
"doi": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ج', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'े', 'ै', 'ो', 'ौ', '्']),
|
| 19 |
-
"gu": set(['અ', 'આ', 'ઇ', 'ઈ', 'ઉ', 'ઊ', 'ઋ', 'એ', 'ઐ', 'ઓ', 'ઔ', 'ક', 'ખ', 'ગ', 'ઘ', 'ઙ', 'ચ', 'છ', 'જ', 'ઝ', 'ઞ', 'ટ', 'ઠ', 'ડ', 'ઢ', 'ણ', 'ત', 'થ', 'દ', 'ધ', 'ન', 'પ', 'ફ', 'બ', 'ભ', 'મ', 'ય', 'ર', 'લ', 'ળ', 'વ', 'શ', 'ષ', 'સ', 'હ', '઼', 'ા', 'િ', 'ી', 'ુ', 'ૂ', 'ૃ', 'ે', 'ૈ', 'ો', 'ૌ', '્']),
|
| 20 |
-
"hi": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
|
| 21 |
-
"kn": set(['ಅ', 'ಆ', 'ಇ', 'ಈ', 'ಉ', 'ಊ', 'ಋ', 'ಎ', 'ಏ', 'ಐ', 'ಒ', 'ಓ', 'ಔ', 'ಕ', 'ಖ', 'ಗ', 'ಘ', 'ಙ', 'ಚ', 'ಛ', 'ಜ', 'ಝ', 'ಞ', 'ಟ', 'ಠ', 'ಡ', 'ಢ', 'ಣ', 'ತ', 'ಥ', 'ದ', 'ಧ', 'ನ', 'ಪ', 'ಫ', 'ಬ', 'ಭ', 'ಮ', 'ಯ', 'ರ', 'ಲ', 'ವ', 'ಶ', 'ಷ', 'ಸ', 'ಹ', 'ಳ', 'ಱ', 'ಾ', 'ಿ', 'ೀ', 'ು', 'ೂ', 'ೃ', 'ೆ', 'ೇ', 'ೈ', 'ೊ', 'ೋ', 'ೌ', '್']),
|
| 22 |
-
"ks": set(['ا', 'آ', 'ب', 'پ', 'ت', 'ٹ', 'ث', 'ج', 'چ', 'ح', 'خ', 'د', 'ڈ', 'ذ', 'ر', 'ڑ', 'ز', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن', 'و', 'ھ', 'ء', 'ی', 'ی', 'ے']),
|
| 23 |
-
"kok": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
|
| 24 |
-
"mai": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
|
| 25 |
-
"ml": set(['അ', 'ആ', 'ഇ', 'ഈ', 'ഉ', 'ഊ', 'ഋ', 'എ', 'ഏ', 'ഐ', 'ഒ', 'ഓ', 'ഔ', 'ക', 'ഖ', 'ഗ', 'ഘ', 'ങ', 'ച', 'ഛ', 'ജ', 'ഝ', 'ഞ', 'ട', 'ഠ', 'ഡ', 'ഢ', 'ണ', 'ത', 'ഥ', 'ദ', 'ധ', 'ന', 'പ', 'ഫ', 'ബ', 'ഭ', 'മ', 'യ', 'ര', 'ല', 'വ', 'ശ', 'ഷ', 'സ', 'ഹ', 'ള', 'ഴ', 'റ', 'ാ', 'ി', 'ീ', 'ു', 'ൂ', 'ൃ', 'െ', 'േ', 'ൈ', 'ൊ', 'ോ', 'ൌ', '്']),
|
| 26 |
-
"mni": set(['ꯑ', '꯲', '꯳', '꯴', '꯵', '꯶', '꯷', '꯸', '꯹', '', '', '', '', '', '', 'ꯀ', 'ꯂ', 'ꯃ', 'ꯄ', 'ꯅ', 'ꯆ', 'ꯇ', 'ꯈ', 'ꯉ', 'ꯊ', 'ꯋ', 'ꯌ', 'ꯍ', 'ꯎ', 'ꯏ', 'ꯐ', 'ꯑ']),
|
| 27 |
-
"mr": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्', 'ळ']),
|
| 28 |
-
"ne": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
|
| 29 |
-
"or": set(['ଅ', 'ଆ', 'ଇ', 'ଈ', 'ଉ', 'ଊ', 'ଋ', 'ଏ', 'ଐ', 'ଓ', 'ଔ', 'କ', 'ଖ', 'ଗ', 'ଘ', 'ଙ', 'ଚ', 'ଛ', 'ଜ', 'ଝ', 'ଞ', 'ଟ', 'ଠ', 'ଡ', 'ଢ', 'ଣ', 'ତ', 'ଥ', 'ଦ', 'ଧ', 'ନ', 'ପ', 'ଫ', 'ବ', 'ଭ', 'ମ', 'ଯ', 'ର', 'ଲ', 'ଳ', 'ବ', 'ଶ', 'ଷ', 'ସ', 'ହ', 'କ୍ଷ', 'ା', 'ି', 'ୀ', 'ୁ', 'ୂ', 'ୃ', 'େ', 'ୈ', 'ୋ', 'ୌ', '୍']),
|
| 30 |
-
"pa": set(['ਅ', 'ਆ', 'ਇ', 'ਈ', 'ਉ', 'ਊ', 'ਏ', 'ਐ', 'ਓ', 'ਔ', 'ਕ', 'ਖ', 'ਗ', 'ਘ', 'ਙ', 'ਚ', 'ਛ', 'ਜ', 'ਝ', 'ਞ', 'ਟ', 'ਠ', 'ਡ', 'ਢ', 'ਣ', 'ਤ', 'ਥ', 'ਦ', 'ਧ', 'ਨ', 'ਪ', 'ਫ', 'ਬ', 'ਭ', 'ਮ', 'ਯ', 'ਰ', 'ਲ', 'ਵ', 'ਸ਼', 'ਸ', 'ਹ', 'ਖ਼', 'ਗ਼', 'ਜ਼', 'ੜ', 'ਫ਼', 'ਲ਼', 'ਿ', 'ੀ', 'ੁ', 'ੂ', 'ੇ', 'ੈ', 'ੋ', 'ੌ', '੍']),
|
| 31 |
-
"sa": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
|
| 32 |
-
"sat": set(['ᱚ', 'ᱛ', 'ᱚ', 'ᱜ', 'ᱚ', 'ᱝ', 'ᱞ', 'ᱟ', 'ᱠ', 'ᱥ', 'ᱮ', 'ᱫ', 'ఇ', 'ᱤ', 'ᱩ', 'ੂ', 'େ', 'ୈ', 'ᱪ', 'ᱡ', 'ᱭ']),
|
| 33 |
-
"sd": set(['ا', 'آ', 'ب', 'ڀ', 'ت', 'ٽ', 'ث', 'پ', 'ج', 'ڄ', 'جھ', 'چ', 'ح', 'خ', 'ڌ', 'د', 'ڏ', 'ڊ', 'ذ', 'ر', 'ز', 'ڙ', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ڦ', 'ق', 'ڪ', 'ک', 'گ', 'ڳ', 'ڱ', 'ل', 'م', 'ن', 'و', 'ھ', 'ء', 'ي']),
|
| 34 |
-
"ta": set(['அ', 'ஆ', 'இ', 'ஈ', 'உ', 'ஊ', 'எ', 'ஏ', 'ஐ', 'ஒ', 'ஓ', 'ஔ', 'க', 'ங', 'ச', 'ஞ', 'ட', 'ண', 'த', 'ந', 'ன', 'ப', 'ம', 'ய', 'ர', 'ல', 'வ', 'ழ', 'ள', 'ஷ', 'ஸ', 'ஹ']),
|
| 35 |
-
"te": set(['అ', 'ఆ', 'ఇ', 'ఈ', 'ఉ', 'ఊ', 'ఋ', 'ఎ', 'ఏ', 'ఐ', 'ఒ', 'ఓ', 'ఔ', 'క', 'ఖ', 'గ', 'ఘ', 'ఙ', 'చ', 'ఛ', 'జ', 'ఝ', 'ఞ', 'ట', 'ఠ', 'డ', 'ఢ', 'ణ', 'త', 'థ', 'ద', 'ధ', 'న', 'ప', 'ఫ', 'బ', 'భ', 'మ', 'య', 'ర', 'ల', 'వ', 'శ', 'ష', 'స', 'హ', 'ళ', 'క్ష', 'ఱ', 'ా', 'ి', 'ీ', 'ు', 'ూ', 'ృ', 'ె', 'ే', 'ై', 'ొ', 'ో', 'ౌ', '్']),
|
| 36 |
-
"ur": set(['ا', 'آ', 'ب', 'پ', 'ت', 'ٹ', 'ث', 'ج', 'چ', 'ح', 'خ', 'د', 'ڈ', 'ذ', 'ر', 'ڑ', 'ز', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن', 'و', 'ھ', 'ء', 'ی', 'ی', 'ے']),
|
| 37 |
-
}
|
| 38 |
|
| 39 |
-
#
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
"
|
| 59 |
-
"
|
| 60 |
-
"
|
| 61 |
-
"
|
| 62 |
-
"
|
|
|
|
| 63 |
}
|
| 64 |
|
| 65 |
-
#
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 68 |
|
| 69 |
-
# --- Model
|
| 70 |
-
print("Loading
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
return identified_code
|
| 96 |
|
|
|
|
|
|
|
| 97 |
@spaces.GPU
|
| 98 |
-
def
|
| 99 |
if not audio_path:
|
| 100 |
return "Please provide an audio file.", "", ""
|
| 101 |
|
| 102 |
try:
|
|
|
|
| 103 |
waveform, sr = torchaudio.load(audio_path)
|
| 104 |
-
|
| 105 |
-
|
| 106 |
except Exception as e:
|
| 107 |
return f"Error loading audio: {e}", "", ""
|
| 108 |
|
| 109 |
try:
|
|
|
|
|
|
|
| 110 |
with torch.no_grad():
|
| 111 |
-
|
| 112 |
|
| 113 |
-
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
-
detected_lang_str = f"Detected Language: {
|
| 120 |
|
|
|
|
| 121 |
with torch.no_grad():
|
| 122 |
-
|
| 123 |
-
|
|
|
|
| 124 |
|
| 125 |
except Exception as e:
|
| 126 |
-
return f"Error during
|
| 127 |
|
| 128 |
return detected_lang_str, transcription_ctc.strip(), transcription_rnnt.strip()
|
| 129 |
|
| 130 |
-
#
|
| 131 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 132 |
gr.Markdown(f"## {DESCRIPTION}")
|
| 133 |
-
gr.Markdown("Upload or record audio in any of the 22 supported Indian languages. The app will automatically detect the language and provide the transcription
|
| 134 |
|
| 135 |
with gr.Row():
|
| 136 |
with gr.Column(scale=1):
|
|
@@ -145,7 +139,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 145 |
ctc_output = gr.Textbox(lines=3, label="CTC Output")
|
| 146 |
|
| 147 |
transcribe_btn.click(
|
| 148 |
-
fn=
|
| 149 |
inputs=[audio],
|
| 150 |
outputs=[detected_lang_output, ctc_output, rnnt_output],
|
| 151 |
api_name="transcribe"
|
|
|
|
| 1 |
+
from __future__ import annotationsfrom __future__ import annotations
|
| 2 |
import torch
|
| 3 |
import torchaudio
|
| 4 |
import gradio as gr
|
| 5 |
import spaces
|
| 6 |
+
from transformers import AutoModel, AutoProcessor, Wav2Vec2ForCTC
|
| 7 |
import re
|
| 8 |
|
| 9 |
+
DESCRIPTION = "IndicConformer ASR with Automatic Language Identification"
|
| 10 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
+
# --- ASR Model (The one we used before) ---
|
| 13 |
+
print("Loading ASR model (IndicConformer)...")
|
| 14 |
+
asr_model_id = "ai4bharat/indic-conformer-600m-multilingual"
|
| 15 |
+
asr_model = AutoModel.from_pretrained(asr_model_id, trust_remote_code=True).to(device)
|
| 16 |
+
asr_model.eval()
|
| 17 |
+
print("✅ ASR Model loaded.")
|
| 18 |
+
|
| 19 |
+
# --- Language Identification (LID) Model ---
|
| 20 |
+
print("\nLoading Language ID model (MMS-LID)...")
|
| 21 |
+
lid_model_id = "facebook/mms-lid"
|
| 22 |
+
lid_processor = AutoProcessor.from_pretrained(lid_model_id)
|
| 23 |
+
lid_model = AutoModel.from_pretrained(lid_model_id).to(device)
|
| 24 |
+
lid_model.eval()
|
| 25 |
+
print("✅ Language ID Model loaded.")
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# --- Language Mappings ---
|
| 29 |
+
# Maps the LID model's output code to the ASR model's code
|
| 30 |
+
LID_TO_ASR_LANG_MAP = {
|
| 31 |
+
"asm_Beng": "as", "ben_Beng": "bn", "brx_Deva": "br", "doi_Deva": "doi",
|
| 32 |
+
"guj_Gujr": "gu", "hin_Deva": "hi", "kan_Knda": "kn", "kas_Arab": "ks",
|
| 33 |
+
"kas_Deva": "ks", "gom_Deva": "kok", "mai_Deva": "mai", "mal_Mlym": "ml",
|
| 34 |
+
"mni_Beng": "mni", "mar_Deva": "mr", "nep_Deva": "ne", "ory_Orya": "or",
|
| 35 |
+
"pan_Guru": "pa", "san_Deva": "sa", "sat_Olck": "sat", "snd_Arab": "sd",
|
| 36 |
+
"tam_Taml": "ta", "tel_Telu": "te", "urd_Arab": "ur"
|
| 37 |
}
|
| 38 |
|
| 39 |
+
# Maps the ASR model's code back to a full name for display
|
| 40 |
+
ASR_CODE_TO_NAME = { "as": "Assamese", "bn": "Bengali", "br": "Bodo", "doi": "Dogri", "gu": "Gujarati", "hi": "Hindi", "kn": "Kannada", "ks": "Kashmiri", "kok": "Konkani", "mai": "Maithili", "ml": "Malayalam", "mni": "Manipuri", "mr": "Marathi", "ne": "Nepali", "or": "Odia", "pa": "Punjabi", "sa": "Sanskrit", "sat": "Santali", "sd": "Sindhi", "ta": "Tamil", "te": "Telugu", "ur": "Urdu"}
|
| 41 |
+
import torch
|
| 42 |
+
import torchaudio
|
| 43 |
+
import gradio as gr
|
| 44 |
+
import spaces
|
| 45 |
+
from transformers import AutoModel, AutoProcessor, Wav2Vec2ForCTC
|
| 46 |
+
import re
|
| 47 |
+
|
| 48 |
+
DESCRIPTION = "IndicConformer ASR with Automatic Language Identification"
|
| 49 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 50 |
|
| 51 |
+
# --- ASR Model (The one we used before) ---
|
| 52 |
+
print("Loading ASR model (IndicConformer)...")
|
| 53 |
+
asr_model_id = "ai4bharat/indic-conformer-600m-multilingual"
|
| 54 |
+
asr_model = AutoModel.from_pretrained(asr_model_id, trust_remote_code=True).to(device)
|
| 55 |
+
asr_model.eval()
|
| 56 |
+
print("✅ ASR Model loaded.")
|
| 57 |
+
|
| 58 |
+
# --- Language Identification (LID) Model ---
|
| 59 |
+
print("\nLoading Language ID model (MMS-LID)...")
|
| 60 |
+
lid_model_id = "facebook/mms-lid"
|
| 61 |
+
lid_processor = AutoProcessor.from_pretrained(lid_model_id)
|
| 62 |
+
lid_model = AutoModel.from_pretrained(lid_model_id).to(device)
|
| 63 |
+
lid_model.eval()
|
| 64 |
+
print("✅ Language ID Model loaded.")
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# --- Language Mappings ---
|
| 68 |
+
# Maps the LID model's output code to the ASR model's code
|
| 69 |
+
LID_TO_ASR_LANG_MAP = {
|
| 70 |
+
"asm_Beng": "as", "ben_Beng": "bn", "brx_Deva": "br", "doi_Deva": "doi",
|
| 71 |
+
"guj_Gujr": "gu", "hin_Deva": "hi", "kan_Knda": "kn", "kas_Arab": "ks",
|
| 72 |
+
"kas_Deva": "ks", "gom_Deva": "kok", "mai_Deva": "mai", "mal_Mlym": "ml",
|
| 73 |
+
"mni_Beng": "mni", "mar_Deva": "mr", "nep_Deva": "ne", "ory_Orya": "or",
|
| 74 |
+
"pan_Guru": "pa", "san_Deva": "sa", "sat_Olck": "sat", "snd_Arab": "sd",
|
| 75 |
+
"tam_Taml": "ta", "tel_Telu": "te", "urd_Arab": "ur"
|
| 76 |
+
}
|
|
|
|
| 77 |
|
| 78 |
+
# Maps the ASR model's code back to a full name for display
|
| 79 |
+
ASR_CODE_TO_NAME = { "as": "Assamese", "bn": "Bengali", "br": "Bodo", "doi": "Dogri", "gu": "Gujarati", "hi": "Hindi", "kn": "Kannada", "ks": "Kashmiri", "kok": "Konkani", "mai": "Maithili", "ml": "Malayalam", "mni": "Manipuri", "mr": "Marathi", "ne": "Nepali", "or": "Odia", "pa": "Punjabi", "sa": "Sanskrit", "sat": "Santali", "sd": "Sindhi", "ta": "Tamil", "te": "Telugu", "ur": "Urdu"}
|
| 80 |
@spaces.GPU
def transcribe_audio_with_lid(audio_path):
    """Detect the spoken language of an audio file and transcribe it.

    Args:
        audio_path: Path to the uploaded/recorded audio file, or a falsy
            value when no audio was provided.

    Returns:
        A 3-tuple of strings: (detected-language message, CTC
        transcription, RNNT transcription). On error the first element
        carries the error message and the other two are empty or "N/A".
    """
    if not audio_path:
        return "Please provide an audio file.", "", ""

    try:
        # Load and preprocess the audio.
        waveform, sr = torchaudio.load(audio_path)
        # Downmix multi-channel recordings to mono so the .squeeze()
        # below yields a 1-D signal regardless of channel count.
        if waveform.size(0) > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        # Resample to 16 kHz for both models.
        waveform_16k = torchaudio.functional.resample(waveform, sr, 16000)
    except Exception as e:
        return f"Error loading audio: {e}", "", ""

    try:
        # 1. Language identification.
        inputs = lid_processor(waveform_16k.squeeze(), sampling_rate=16000, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = lid_model(**inputs)

        # Top predicted class index -> LID code such as "hin_Deva".
        # NOTE(review): relies on the LID model exposing classification
        # logits and id2label — confirm the checkpoint loads with a
        # classification head rather than a bare encoder.
        predicted_lid_id = outputs.logits.argmax(-1).item()
        detected_lid_code = lid_model.config.id2label[predicted_lid_id]

        # 2. Map the LID code onto the ASR model's language code.
        asr_lang_code = LID_TO_ASR_LANG_MAP.get(detected_lid_code)
        if not asr_lang_code:
            detected_lang_str = f"Detected '{detected_lid_code}', which is not supported by the ASR model."
            return detected_lang_str, "N/A", "N/A"

        detected_lang_str = f"Detected Language: {ASR_CODE_TO_NAME.get(asr_lang_code, 'Unknown')}"

        # 3. Transcribe with both decoders using the detected language.
        with torch.no_grad():
            transcription_ctc = asr_model(waveform_16k.to(device), asr_lang_code, "ctc")
            transcription_rnnt = asr_model(waveform_16k.to(device), asr_lang_code, "rnnt")
    except Exception as e:
        return f"Error during processing: {str(e)}", "", ""

    return detected_lang_str, transcription_ctc.strip(), transcription_rnnt.strip()
| 123 |
|
| 124 |
+
# Gradio UI (no major changes needed here)
|
| 125 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 126 |
gr.Markdown(f"## {DESCRIPTION}")
|
| 127 |
+
gr.Markdown("Upload or record audio in any of the 22 supported Indian languages. The app will automatically detect the language and provide the transcription.")
|
| 128 |
|
| 129 |
with gr.Row():
|
| 130 |
with gr.Column(scale=1):
|
|
|
|
| 139 |
ctc_output = gr.Textbox(lines=3, label="CTC Output")
|
| 140 |
|
| 141 |
transcribe_btn.click(
|
| 142 |
+
fn=transcribe_audio_with_lid,
|
| 143 |
inputs=[audio],
|
| 144 |
outputs=[detected_lang_output, ctc_output, rnnt_output],
|
| 145 |
api_name="transcribe"
|