Noumida committed on
Commit de7eff6 · verified · 1 Parent(s): 6a11e69

Update app.py

Files changed (1)
  1. app.py +127 -35
app.py CHANGED
@@ -1,65 +1,157 @@
  from __future__ import annotations
  import torch
  import torchaudio
  import gradio as gr
  import spaces
  from transformers import AutoModel

- DESCRIPTION = "IndicConformer-600M Multilingual ASR (CTC + RNNT)"

- LANGUAGE_NAME_TO_CODE = {
-     "Assamese": "as", "Bengali": "bn", "Bodo": "br", "Dogri": "doi",
-     "Gujarati": "gu", "Hindi": "hi", "Kannada": "kn", "Kashmiri": "ks",
-     "Konkani": "kok", "Maithili": "mai", "Malayalam": "ml", "Manipuri": "mni",
-     "Marathi": "mr", "Nepali": "ne", "Odia": "or", "Punjabi": "pa",
-     "Sanskrit": "sa", "Santali": "sat", "Sindhi": "sd", "Tamil": "ta",
-     "Telugu": "te", "Urdu": "ur"
  }

  device = "cuda" if torch.cuda.is_available() else "cpu"

- # Load Indic Conformer model (assumes custom forward handles decoding strategy)
  model = AutoModel.from_pretrained("ai4bharat/indic-conformer-600m-multilingual", trust_remote_code=True).to(device)
  model.eval()

  @spaces.GPU
- def transcribe_ctc_and_rnnt(audio_path, language_name):
-     lang_code = LANGUAGE_NAME_TO_CODE[language_name]

      # Load and preprocess audio
-     waveform, sr = torchaudio.load(audio_path)
-     waveform = waveform.mean(dim=0, keepdim=True) if waveform.shape[0] > 1 else waveform
-     waveform = torchaudio.functional.resample(waveform, sr, 16000).to(device)

      try:
-         # Assume model's forward method takes waveform, language code, and decoding type
          with torch.no_grad():
-             transcription_ctc = model(waveform, lang_code, "ctc")
-             transcription_rnnt = model(waveform, lang_code, "rnnt")
      except Exception as e:
-         return f"Error: {str(e)}", ""

-     return transcription_ctc.strip(), transcription_rnnt.strip()

  # Gradio UI
- with gr.Blocks() as demo:
      gr.Markdown(f"## {DESCRIPTION}")
      with gr.Row():
-         with gr.Column():
              audio = gr.Audio(label="Upload or Record Audio", type="filepath")
-             lang = gr.Dropdown(
-                 label="Select Language",
-                 choices=list(LANGUAGE_NAME_TO_CODE.keys()),
-                 value="Hindi"
-             )
-             transcribe_btn = gr.Button("Transcribe (CTC + RNNT)")
-         with gr.Column():
-             gr.Markdown("### CTC Transcription")
-             ctc_output = gr.Textbox(lines=3)
-             gr.Markdown("### RNNT Transcription")
-             rnnt_output = gr.Textbox(lines=3)
-
-     transcribe_btn.click(fn=transcribe_ctc_and_rnnt, inputs=[audio, lang], outputs=[ctc_output, rnnt_output], api_name="transcribe")

  if __name__ == "__main__":
-     demo.queue().launch()
+ Of course. I'll update the code to perform automatic language identification based on the transcription's characters and common words before providing the final, high-quality transcription.
+
+ This new version will:
+
+ 1. **Remove the language dropdown**, as the language will be detected automatically.
+ 2. Perform a quick, initial transcription using Hindi as a "pivot" language.
+ 3. Analyze the resulting text against a **custom dictionary** of unique characters and common words for all 22 supported languages.
+ 4. Perform the final, more accurate transcription using the detected language code.
+
+ -----
+
+ ### **Updated Code with Automatic Language Identification**
+
+ Here is the complete, updated code. You can replace your existing script with this one.
+
+ ```python
  from __future__ import annotations
  import torch
  import torchaudio
  import gradio as gr
  import spaces
  from transformers import AutoModel
+ import re

+ DESCRIPTION = "IndicConformer-600M Multilingual ASR (CTC + RNNT) with Auto Language ID"

+ # --- Language Identification Data ---
+ # A dictionary containing unique character sets and common words for each language.
+ # This data is used by our custom language identification logic.
+ LANGUAGE_DATA = {
+     "as": {"chars": set("অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযৰলৱশষসহৎংঃঽািীুূৃেৈোৌ্"), "words": set(["আৰু", "হয়", "এটা", "কৰি", "ওপৰত"])},
+     "bn": {"chars": set("অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহৎংঃঽািীুূৃেৈোৌ্ড়ঢ়য়"), "words": set(["এবং", "একটি", "করুন", "জন্য", "সঙ্গে"])},
+     "br": {"chars": set("अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह़ािीुूृेैोौ्"), "words": set(["आरो", "एसे", "मोनसे", "माव", "आव"])},
+     "doi": {"chars": set("अआइईउऊएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसहािीुूेैोौ्"), "words": set(["ते", "दे", "ऐ", "इक", "ओह्"])},
+     "gu": {"chars": set("અઆઇઈઉઊઋએઐઓઔકખગઘઙચછજઝઞટઠડઢણતથદધનપફબભમયરલવશષસહ઼ાિીુૂૃેૈોૌ્"), "words": set(["અને", "એક", "માટે", "છે", "સાથે"])},
+     "hi": {"chars": set("अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसहािीुूृेैोौ्"), "words": set(["और", "है", "एक", "में", "के"])},
+     "kn": {"chars": set("ಅಆಇಈಉಊಋಎಏಐಒಓಔಕಖಗಘಙಚಛಜಝಞಟಠಡಢಣತಥದಧನಪಫಬಭಮಯರಲವಶಷಸಹಳಱಾಿೀುೂೃೆೇೈೊೋೌ್"), "words": set(["ಮತ್ತು", "ಒಂದು", "ಹೇಗೆ", "ನಾನು", "ಇದೆ"])},
+     "ks": {"chars": set("اآبپتٹثجچحخدڈذرڑزژسشصضطظعغفقکگلمنوھءییے"), "words": set(["تہٕ", "چھُ", "اکھ", "منز", "کیتھ"])},
+     "kok": {"chars": set("अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसहािीुूृेैोौ्"), "words": set(["आनी", "एक", "कर", "खातीर", "कडेन"])},
+     "mai": {"chars": set("अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसहािीुूृेैोौ्"), "words": set(["आ", "एक", "हम", "अछि", "क"])},
+     "ml": {"chars": set("അആഇഈഉഊഋഎഏഐഒഓഔകഖഗഘങചഛജഝഞടഠഡഢണതഥദധനപഫബഭമയരലവശഷസഹളഴറാിീുൂൃെേൈൊോൌ്"), "words": set(["ഒരു", "ആണ്", "എങ്ങനെ", "ഞാൻ", "ഇതു"])},
+     "mni": {"chars": set("ꯑ꯲꯳꯴꯵꯶꯷꯸꯹꯺꯻꯼꯽꯾꯿ꯀꯂꯃꯄꯅꯆꯇꯈꯉꯊꯋꯌꯍꯎꯏꯐꯑ"), "words": set(["ꯗꯥ", "ꯑꯃꯥ", "ꯀꯔꯤ", "ꯑꯩꯅꯥ", "ꯑꯁꯤ"])},
+     "mr": {"chars": set("अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसहािीुूृेैोौ्ळ"), "words": set(["आणि", "एक", "आहे", "मी", "तू"])},
+     "ne": {"chars": set("अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसहािीुूृेैोौ्"), "words": set(["र", "एक", "हो", "म", "तिमी"])},
+     "or": {"chars": set("ଅଆଇଈଉଊଋଏଐଓଔକଖଗଘଙଚଛଜଝଞଟଠଡଢଣତଥଦଧନପଫବଭମଯରଲଳବଶଷସହକ୍ଷାିୀୁୂୃେୈୋୌ୍"), "words": set(["ଏବଂ", "ଗୋଟିଏ", "କରନ୍ତୁ", "ପାଇଁ", "ସହିତ"])},
+     "pa": {"chars": set("ਅਆਇਈਉਊਏਐਓਔਕਖਗਘਙਚਛਜਝਞਟਠਡਢਣਤਥਦਧਨਪਫਬਭਮਯਰਲਵਸ਼ਸਹਖ਼ਗ਼ਜ਼ੜਫ਼ਲ਼ਿੀੁੂੇੈੋੌ੍"), "words": set(["ਅਤੇ", "ਇੱਕ", "ਹੈ", "ਵਿੱਚ", "ਨੂੰ"])},
+     "sa": {"chars": set("अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसहािीुूृेैोौ्"), "words": set(["च", "एकः", "अस्ति", "अहम्", "त्वम्"])},
+     "sat": {"chars": set("ᱚᱛᱜᱝᱞᱟᱠᱡᱢᱣᱤᱥᱦᱧᱨᱩᱪᱫᱬᱭᱮᱯᱰᱱᱲᱳᱴᱵᱶᱷ"), "words": set(["ᱟᱨ", "ᱫᱚ"])},
+     "sd": {"chars": set("اآبڀتٽثپجڄجھچحخڌدڏڊذرزڙژسشصضطظعغفڦقڪکگڳڱلمنوھءي"), "words": set(["۽", "هڪ", "آهي", "۾", "کي"])},
+     "ta": {"chars": set("அஆஇஈஉஊஎஏஐஒஓஔகஙசஞடணதநனபமயரலவழளஷஸஹாிீுூெேைொோௌ்"), "words": set(["மற்றும்", "ஒரு", "வேண்டும்", "நான்", "இது"])},
+     "te": {"chars": set("అఆఇఈఉఊఋఎఏఐఒఓఔకఖగఘఙచఛజఝఞటఠడఢణతథదధనపఫబభమయరలవశషసహళక్షఱాిీుూృెేైొోౌ్"), "words": set(["మరియు", "ఒక", "ఉంది", "నేను", "ఇది"])},
+     "ur": {"chars": set("اآبپتٹثجچحخدڈذرڑزژسشصضطظعغفقکگلمنوھءییے"), "words": set(["اور", "ہے", "ایک", "میں", "کے"])},
  }

+ LANGUAGE_CODE_TO_NAME = {
+     "as": "Assamese", "bn": "Bengali", "br": "Bodo", "doi": "Dogri", "gu": "Gujarati", "hi": "Hindi",
+     "kn": "Kannada", "ks": "Kashmiri", "kok": "Konkani", "mai": "Maithili", "ml": "Malayalam", "mni": "Manipuri",
+     "mr": "Marathi", "ne": "Nepali", "or": "Odia", "pa": "Punjabi", "sa": "Sanskrit", "sat": "Santali",
+     "sd": "Sindhi", "ta": "Tamil", "te": "Telugu", "ur": "Urdu",
+ }
  device = "cuda" if torch.cuda.is_available() else "cpu"

+ # Load Indic Conformer model
+ print("Loading IndicConformer model...")
  model = AutoModel.from_pretrained("ai4bharat/indic-conformer-600m-multilingual", trust_remote_code=True).to(device)
  model.eval()
+ print("✅ Model loaded successfully.")
+
+
+ def identify_language(text: str) -> str | None:
+     """Identifies the language of a given text based on character sets and common words."""
+     if not text.strip():
+         return None
+
+     scores = {lang: 0 for lang in LANGUAGE_DATA}
+     text_chars = set(text)
+     # Use regex to split words, handling various scripts
+     text_words = set(re.split(r'[\s,.:;!?]+', text))
+
+     for lang_code, data in LANGUAGE_DATA.items():
+         char_score = len(text_chars.intersection(data["chars"]))
+         word_score = len(text_words.intersection(data["words"]))
+
+         # Give more weight to character matches as they are a stronger signal of the script
+         scores[lang_code] = (char_score * 2) + word_score
+
+     # Identify the language with the highest score
+     # Return None if the highest score is very low, indicating a poor match
+     max_score = max(scores.values())
+     if max_score < 3:  # Heuristic threshold to prevent misidentification on noise
+         return None
+
+     identified_code = max(scores, key=scores.get)
+     return identified_code
+

  @spaces.GPU
+ def transcribe_and_identify(audio_path):
+     if not audio_path:
+         return "Please provide an audio file.", "", ""

      # Load and preprocess audio
+     try:
+         waveform, sr = torchaudio.load(audio_path)
+         waveform = waveform.mean(dim=0, keepdim=True) if waveform.shape[0] > 1 else waveform
+         waveform = torchaudio.functional.resample(waveform, sr, 16000).to(device)
+     except Exception as e:
+         return f"Error loading audio: {e}", "", ""

      try:
+         # 1. Perform a fast, initial transcription using a pivot language (Hindi)
          with torch.no_grad():
+             initial_transcription = model(waveform, "hi", "ctc")
+
+         # 2. Identify the language from the initial transcription
+         identified_lang_code = identify_language(initial_transcription)
+
+         if not identified_lang_code:
+             detected_lang_str = "Language not detected or unsupported."
+             return detected_lang_str, initial_transcription + " (pivot)", "Could not perform final transcription."
+
+         detected_lang_str = f"Detected Language: {LANGUAGE_CODE_TO_NAME.get(identified_lang_code, 'Unknown')}"
+
+         # 3. Perform the final, high-quality transcription using the identified language
+         with torch.no_grad():
+             transcription_ctc = model(waveform, identified_lang_code, "ctc")
+             transcription_rnnt = model(waveform, identified_lang_code, "rnnt")
+
      except Exception as e:
+         return f"Error during transcription: {str(e)}", "", ""
+
+     return detected_lang_str, transcription_ctc.strip(), transcription_rnnt.strip()
 

  # Gradio UI
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
      gr.Markdown(f"## {DESCRIPTION}")
+     gr.Markdown("Upload or record audio in any of the 22 supported Indian languages. The app will automatically detect the language and provide the transcription using both CTC and RNNT decoding.")
+
      with gr.Row():
+         with gr.Column(scale=1):
              audio = gr.Audio(label="Upload or Record Audio", type="filepath")
+             transcribe_btn = gr.Button("Transcribe", variant="primary")
+
+         with gr.Column(scale=2):
+             detected_lang_output = gr.Label(label="Language Detection Result")
+             gr.Markdown("### RNNT Transcription (More Accurate)")
+             rnnt_output = gr.Textbox(lines=3, label="RNNT Output")
+             gr.Markdown("### CTC Transcription (Faster)")
+             ctc_output = gr.Textbox(lines=3, label="CTC Output")
+
+     transcribe_btn.click(
+         fn=transcribe_and_identify,
+         inputs=[audio],
+         outputs=[detected_lang_output, ctc_output, rnnt_output],
+         api_name="transcribe"
+     )

  if __name__ == "__main__":
+     demo.queue().launch(share=True)
+
+ ```
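
For reference, here is a minimal, self-contained sketch of the scoring heuristic that `identify_language` applies in the updated script, reduced to a two-language subset. The `SAMPLE_LANGUAGE_DATA` name, the trimmed character sets, and the sample sentences are illustrative only and are not part of the commit.

```python
from __future__ import annotations
import re

# Illustrative two-language subset of the LANGUAGE_DATA structure used in the commit.
SAMPLE_LANGUAGE_DATA = {
    "hi": {"chars": set("अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसहािीुूृेैोौ्"),
           "words": {"और", "है", "एक", "में", "के"}},
    "ta": {"chars": set("அஆஇஈஉஊஎஏஐஒஓஔகஙசஞடணதநனபமயரலவழள"),
           "words": {"மற்றும்", "ஒரு", "நான்", "இது"}},
}

def identify_language(text: str) -> str | None:
    """Score each language by shared characters (weight 2) and shared common words (weight 1)."""
    if not text.strip():
        return None
    text_chars = set(text)
    text_words = set(re.split(r"[\s,.:;!?]+", text))
    scores = {
        code: 2 * len(text_chars & data["chars"]) + len(text_words & data["words"])
        for code, data in SAMPLE_LANGUAGE_DATA.items()
    }
    best = max(scores, key=scores.get)
    return best if scores[best] >= 3 else None  # low scores are treated as "not detected"

print(identify_language("यह एक परीक्षण है"))  # Devanagari characters plus the word "है" -> "hi"
print(identify_language("இது ஒரு சோதனை"))  # Tamil characters plus the word "ஒரு" -> "ta"
print(identify_language("hello world"))  # no script or word match -> None
```

Character matches are weighted twice as heavily as word matches because the script itself is usually the stronger signal; the threshold of 3 mirrors the guard against noisy pivot transcriptions in the committed code.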