Noumida committed on
Commit bafb16f · verified · 1 Parent(s): 797ee59

Update app.py

Files changed (1)
  1. app.py +93 -99
app.py CHANGED
@@ -1,136 +1,130 @@
1
- from __future__ import annotations
2
  import torch
3
  import torchaudio
4
  import gradio as gr
5
  import spaces
6
- from transformers import AutoModel
7
  import re
8
 
9
- DESCRIPTION = "IndicConformer-600M Multilingual ASR (CTC + RNNT) with Auto Language ID"
10
-
11
- # --- Data Dictionaries ---
12
-
13
- # Dictionary for character sets, now with improved formatting for readability.
14
- LANGUAGE_CHARSETS = {
15
- "as": set(['অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ', 'ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 'ঞ', 'ট', 'ঠ', 'ড', 'ঢ', 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন', 'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'ৰ', 'ল', 'ৱ', 'শ', 'ষ', 'স', 'হ', 'ৎ', 'ং', 'ঃ', 'ঽ', 'া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ', '্']),
16
- "bn": set(['অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ', 'ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 'ঞ', 'ট', 'ঠ', 'ড', 'ঢ', 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন', 'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'র', 'ল', 'শ', 'ষ', 'স', 'হ', 'ৎ', 'ং', 'ঃ', 'ঽ', 'া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ', '্', 'ড়', 'ঢ়', 'য়']),
17
- "br": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', '़', 'ा', 'ि', 'ी', 'ु', 'ূ', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
18
- "doi": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ج', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'े', 'ै', 'ो', 'ौ', '्']),
19
- "gu": set(['અ', 'આ', 'ઇ', 'ઈ', 'ઉ', 'ઊ', 'ઋ', 'એ', 'ઐ', 'ઓ', 'ઔ', 'ક', 'ખ', 'ગ', 'ઘ', 'ઙ', 'ચ', 'છ', 'જ', 'ઝ', 'ઞ', 'ટ', 'ઠ', 'ડ', 'ઢ', 'ણ', 'ત', 'થ', 'દ', 'ધ', 'ન', 'પ', 'ફ', 'બ', 'ભ', 'મ', 'ય', 'ર', 'લ', 'ળ', 'વ', 'શ', 'ષ', 'સ', 'હ', '઼', 'ા', 'િ', 'ી', 'ુ', 'ૂ', 'ૃ', 'ે', 'ૈ', 'ો', 'ૌ', '્']),
20
- "hi": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
21
- "kn": set(['ಅ', 'ಆ', 'ಇ', 'ಈ', 'ಉ', 'ಊ', 'ಋ', 'ಎ', 'ಏ', 'ಐ', 'ಒ', 'ಓ', 'ಔ', 'ಕ', 'ಖ', 'ಗ', 'ಘ', 'ಙ', 'ಚ', 'ಛ', 'ಜ', 'ಝ', 'ಞ', 'ಟ', 'ಠ', 'ಡ', 'ಢ', 'ಣ', 'ತ', 'ಥ', 'ದ', 'ಧ', 'ನ', 'ಪ', 'ಫ', 'ಬ', 'ಭ', 'ಮ', 'ಯ', 'ರ', 'ಲ', 'ವ', 'ಶ', 'ಷ', 'ಸ', 'ಹ', 'ಳ', 'ಱ', 'ಾ', 'ಿ', 'ೀ', 'ು', 'ೂ', 'ೃ', 'ೆ', 'ೇ', 'ೈ', 'ೊ', 'ೋ', 'ೌ', '್']),
22
- "ks": set(['ا', 'آ', 'ب', 'پ', 'ت', 'ٹ', 'ث', 'ج', 'چ', 'ح', 'خ', 'د', 'ڈ', 'ذ', 'ر', 'ڑ', 'ز', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن', 'و', 'ھ', 'ء', 'ی', 'ی', 'ے']),
23
- "kok": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
24
- "mai": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
25
- "ml": set(['അ', 'ആ', 'ഇ', 'ഈ', 'ഉ', 'ഊ', 'ഋ', 'എ', 'ഏ', 'ഐ', 'ഒ', 'ഓ', 'ഔ', 'ക', 'ഖ', 'ഗ', 'ഘ', 'ങ', 'ച', 'ഛ', 'ജ', 'ഝ', 'ഞ', 'ട', 'ഠ', 'ഡ', 'ഢ', 'ണ', 'ത', 'ഥ', 'ദ', 'ധ', 'ന', 'പ', 'ഫ', 'ബ', 'ഭ', 'മ', 'യ', 'ര', 'ല', 'വ', 'ശ', 'ഷ', 'സ', 'ഹ', 'ള', 'ഴ', 'റ', 'ാ', 'ി', 'ീ', 'ു', 'ൂ', 'ൃ', 'െ', 'േ', 'ൈ', 'ൊ', 'ോ', 'ൌ', '്']),
26
- "mni": set(['ꯑ', '꯲', '꯳', '꯴', '꯵', '꯶', '꯷', '꯸', '꯹', '꯺', '꯻', '꯼', '꯽', '꯾', '꯿', 'ꯀ', 'ꯂ', 'ꯃ', 'ꯄ', 'ꯅ', 'ꯆ', 'ꯇ', 'ꯈ', 'ꯉ', 'ꯊ', 'ꯋ', 'ꯌ', 'ꯍ', 'ꯎ', 'ꯏ', 'ꯐ', 'ꯑ']),
27
- "mr": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्', 'ळ']),
28
- "ne": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
29
- "or": set(['ଅ', 'ଆ', 'ଇ', 'ଈ', 'ଉ', 'ଊ', 'ଋ', 'ଏ', 'ଐ', 'ଓ', 'ଔ', 'କ', 'ଖ', 'ଗ', 'ଘ', 'ଙ', 'ଚ', 'ଛ', 'ଜ', 'ଝ', 'ଞ', 'ଟ', 'ଠ', 'ଡ', 'ଢ', 'ଣ', 'ତ', 'ଥ', 'ଦ', 'ଧ', 'ନ', 'ପ', 'ଫ', 'ବ', 'ଭ', 'ମ', 'ଯ', 'ର', 'ଲ', 'ଳ', 'ବ', 'ଶ', 'ଷ', 'ସ', 'ହ', 'କ୍ଷ', 'ା', 'ି', 'ୀ', 'ୁ', 'ୂ', 'ୃ', 'େ', 'ୈ', 'ୋ', 'ୌ', '୍']),
30
- "pa": set(['ਅ', 'ਆ', 'ਇ', 'ਈ', 'ਉ', 'ਊ', 'ਏ', 'ਐ', 'ਓ', 'ਔ', 'ਕ', 'ਖ', 'ਗ', 'ਘ', 'ਙ', 'ਚ', 'ਛ', 'ਜ', 'ਝ', 'ਞ', 'ਟ', 'ਠ', 'ਡ', 'ਢ', 'ਣ', 'ਤ', 'ਥ', 'ਦ', 'ਧ', 'ਨ', 'ਪ', 'ਫ', 'ਬ', 'ਭ', 'ਮ', 'ਯ', 'ਰ', 'ਲ', 'ਵ', 'ਸ਼', 'ਸ', 'ਹ', 'ਖ਼', 'ਗ਼', 'ਜ਼', 'ੜ', 'ਫ਼', 'ਲ਼', 'ਿ', 'ੀ', 'ੁ', 'ੂ', 'ੇ', 'ੈ', 'ੋ', 'ੌ', '੍']),
31
- "sa": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
32
- "sat": set(['ᱚ', 'ᱛ', 'ᱚ', 'ᱜ', 'ᱚ', 'ᱝ', 'ᱞ', 'ᱟ', 'ᱠ', 'ᱥ', 'ᱮ', 'ᱫ', 'ఇ', 'ᱤ', 'ᱩ', 'ੂ', 'େ', 'ୈ', 'ᱪ', 'ᱡ', 'ᱭ']),
33
- "sd": set(['ا', 'آ', 'ب', 'ڀ', 'ت', 'ٽ', 'ث', 'پ', 'ج', 'ڄ', 'جھ', 'چ', 'ح', 'خ', 'ڌ', 'د', 'ڏ', 'ڊ', 'ذ', 'ر', 'ز', 'ڙ', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ڦ', 'ق', 'ڪ', 'ک', 'گ', 'ڳ', 'ڱ', 'ل', 'م', 'ن', 'و', 'ھ', 'ء', 'ي']),
34
- "ta": set(['அ', 'ஆ', 'இ', 'ஈ', 'உ', 'ஊ', 'எ', 'ஏ', 'ஐ', 'ஒ', 'ஓ', 'ஔ', 'க', 'ங', 'ச', 'ஞ', 'ட', 'ண', 'த', 'ந', 'ன', 'ப', 'ம', 'ய', 'ர', 'ல', 'வ', 'ழ', 'ள', 'ஷ', 'ஸ', 'ஹ']),
35
- "te": set(['అ', 'ఆ', 'ఇ', 'ఈ', 'ఉ', 'ఊ', 'ఋ', 'ఎ', 'ఏ', 'ఐ', 'ఒ', 'ఓ', 'ఔ', 'క', 'ఖ', 'గ', 'ఘ', 'ఙ', 'చ', 'ఛ', 'జ', 'ఝ', 'ఞ', 'ట', 'ఠ', 'డ', 'ఢ', 'ణ', 'త', 'థ', 'ద', 'ధ', 'న', 'ప', 'ఫ', 'బ', 'భ', 'మ', 'య', 'ర', 'ల', 'వ', 'శ', 'ష', 'స', 'హ', 'ళ', 'క్ష', 'ఱ', 'ా', 'ి', 'ీ', 'ు', 'ూ', 'ృ', 'ె', 'ే', 'ై', 'ొ', 'ో', 'ౌ', '్']),
36
- "ur": set(['ا', 'آ', 'ب', 'پ', 'ت', 'ٹ', 'ث', 'ج', 'چ', 'ح', 'خ', 'د', 'ڈ', 'ذ', 'ر', 'ڑ', 'ز', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن', 'و', 'ھ', 'ء', 'ی', 'ی', 'ے']),
37
- }
38
 
39
- # Dictionary for common words of each language
40
- LANGUAGE_COMMON_WORDS = {
41
- "as": set(["আৰু", "হয়", "এটা", "কৰা", "ওপৰত", "যে"]),
42
- "bn": set(["এবং", "একটি", "করুন", "জন্য", "সঙ্গে", "হচ্ছে"]),
43
- "br": set(["आरो", "एसे", "मोनसे", "माव", "दं", "जा"]),
44
- "doi": set(["ते", "दे", "ऐ", "इक", "ओह्", "कर"]),
45
- "gu": set(["અને", "એક", "માટે", "છે", "સાથે", "કરવું"]),
46
- "hi": set(["और", "है", "एक", "में", "के", "लिए"]),
47
- "kn": set(["ಮತ್ತು", "ಒಂದು", "ಹೇಗೆ", "ನಾನು", "ಇದೆ", "ಆ"]),
48
- "ks": set([" تہٕ", "چھُ", "اکھ", "منز", "کیتھ", "छु", "छ"]),
49
- "kok": set(["आनी", "एक", "कर", "खातीर", "कडेन", "आसा"]),
50
- "mai": set(["आ", "एक", "हम", "अछि", "क'", "छै"]),
51
- "ml": set(["ഒരു", "കൂടാതെ", "എങ്ങനെ", "ഞാൻ", "ഇത്", "ആണ്"]),
52
- "mni": set(["ꯗꯥ", "ꯑꯃꯥ", "ꯀꯔꯤ", "ꯑꯩꯅꯥ", "ꯑꯁꯤ", "ꯂꯩ"]),
53
- "mr": set(["आणि", "एक", "आहे", "मी", "तू", "जे"]),
54
- "ne": set(["र", "एक", "हो", "म", "तिमी", "छ"]),
55
- "or": set(["ଏବଂ", "ଗୋଟିଏ", "କରନ୍ତୁ", "ପାଇଁ", "ସହିତ", "ଅଛି"]),
56
- "pa": set(["ਅਤੇ", "ਇੱਕ", "ਹੈ", "ਵਿੱਚ", "ਨੂੰ", "ਦਾ"]),
57
- "sa": set(["च", "एकः", "अस्ति", "अहम्", "त्वम्", "सः"]),
58
- "sat": set(["ᱟᱨ", "ᱫᱚ", "ᱢᱤᱫ", "ಒಂದು", "ಮತ್ತು", ""]),
59
- "sd": set(["۽", "هڪ", "آهي", "۾", "کي", "جو"]),
60
- "ta": set(["மற்றும்", "ஒரு", "வேண்டும்", "நான்", "இது", "ஆகும்"]),
61
- "te": set(["మరియు", "ఒక", "కావాలి", "నేను", "ఇది", "ఉంది"]),
62
- "ur": set(["اور", "ہے", "ایک", "میں", "کے", "لیے"]),
 
63
  }
64
 
65
- # Mapping from language code to its full name for display purposes
66
- LANGUAGE_CODE_TO_NAME = { "as": "Assamese", "bn": "Bengali", "br": "Bodo", "doi": "Dogri", "gu": "Gujarati", "hi": "Hindi", "kn": "Kannada", "ks": "Kashmiri", "kok": "Konkani", "mai": "Maithili", "ml": "Malayalam", "mni": "Manipuri", "mr": "Marathi", "ne": "Nepali", "or": "Odia", "pa": "Punjabi", "sa": "Sanskrit", "sat": "Santali", "sd": "Sindhi", "ta": "Tamil", "te": "Telugu", "ur": "Urdu"}
67
  device = "cuda" if torch.cuda.is_available() else "cpu"
68
 
69
- # --- Model Loading ---
70
- print("Loading IndicConformer model...")
71
- model = AutoModel.from_pretrained("ai4bharat/indic-conformer-600m-multilingual", trust_remote_code=True).to(device)
72
- model.eval()
73
- print("✅ Model loaded successfully.")
74
-
75
- # --- Core Logic ---
76
- def identify_language(text: str) -> str | None:
77
- """Identifies the language of a given text based on character sets and common words."""
78
- if not text.strip():
79
- return None
80
-
81
- scores = {lang: 0 for lang in LANGUAGE_CHARSETS.keys()}
82
- text_chars = set(text)
83
- text_words = set(re.split(r'[\s,.:;!?]+', text))
84
-
85
- for lang_code in scores.keys():
86
- char_score = len(text_chars.intersection(LANGUAGE_CHARSETS.get(lang_code, set())))
87
- word_score = len(text_words.intersection(LANGUAGE_COMMON_WORDS.get(lang_code, set())))
88
- scores[lang_code] = (char_score * 2) + word_score
89
-
90
- max_score = max(scores.values())
91
- if max_score < 3:
92
- return None
93
-
94
- identified_code = max(scores, key=scores.get)
95
- return identified_code
96
 
 
 
97
  @spaces.GPU
98
- def transcribe_and_identify(audio_path):
99
  if not audio_path:
100
  return "Please provide an audio file.", "", ""
101
 
102
  try:
 
103
  waveform, sr = torchaudio.load(audio_path)
104
- waveform = waveform.mean(dim=0, keepdim=True) if waveform.shape[0] > 1 else waveform
105
- waveform = torchaudio.functional.resample(waveform, sr, 16000).to(device)
106
  except Exception as e:
107
  return f"Error loading audio: {e}", "", ""
108
 
109
  try:
 
 
110
  with torch.no_grad():
111
- initial_transcription = model(waveform, "hi", "ctc")
112
 
113
- identified_lang_code = identify_language(initial_transcription)
 
 
 
114
 
115
- if not identified_lang_code:
116
- detected_lang_str = "Language not detected or unsupported."
117
- return detected_lang_str, initial_transcription + " (pivot)", "Could not perform final transcription."
 
 
 
118
 
119
- detected_lang_str = f"Detected Language: {LANGUAGE_CODE_TO_NAME.get(identified_lang_code, 'Unknown')}"
120
 
 
121
  with torch.no_grad():
122
- transcription_ctc = model(waveform, identified_lang_code, "ctc")
123
- transcription_rnnt = model(waveform, identified_lang_code, "rnnt")
 
124
 
125
  except Exception as e:
126
- return f"Error during transcription: {str(e)}", "", ""
127
 
128
  return detected_lang_str, transcription_ctc.strip(), transcription_rnnt.strip()
129
 
130
- # --- Gradio UI ---
131
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
132
  gr.Markdown(f"## {DESCRIPTION}")
133
- gr.Markdown("Upload or record audio in any of the 22 supported Indian languages. The app will automatically detect the language and provide the transcription using both CTC and RNNT decoding.")
134
 
135
  with gr.Row():
136
  with gr.Column(scale=1):
@@ -145,7 +139,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
145
  ctc_output = gr.Textbox(lines=3, label="CTC Output")
146
 
147
  transcribe_btn.click(
148
- fn=transcribe_and_identify,
149
  inputs=[audio],
150
  outputs=[detected_lang_output, ctc_output, rnnt_output],
151
  api_name="transcribe"
 
1
+ from __future__ import annotations
2
  import torch
3
  import torchaudio
4
  import gradio as gr
5
  import spaces
6
+ from transformers import AutoModel, AutoProcessor, Wav2Vec2ForCTC
7
  import re
8
 
9
+ DESCRIPTION = "IndicConformer ASR with Automatic Language Identification"
10
+ device = "cuda" if torch.cuda.is_available() else "cpu"
11
 
12
+ # --- ASR Model (IndicConformer) ---
13
+ print("Loading ASR model (IndicConformer)...")
14
+ asr_model_id = "ai4bharat/indic-conformer-600m-multilingual"
15
+ asr_model = AutoModel.from_pretrained(asr_model_id, trust_remote_code=True).to(device)
16
+ asr_model.eval()
17
+ print(" ASR Model loaded.")
18
+
19
+ # --- Language Identification (LID) Model ---
20
+ print("\nLoading Language ID model (MMS-LID)...")
21
+ lid_model_id = "facebook/mms-lid"
22
+ lid_processor = AutoProcessor.from_pretrained(lid_model_id)
23
+ lid_model = AutoModel.from_pretrained(lid_model_id).to(device)
24
+ lid_model.eval()
25
+ print(" Language ID Model loaded.")
26
+
27
+
28
+ # --- Language Mappings ---
29
+ # Maps the LID model's output code to the ASR model's code
30
+ LID_TO_ASR_LANG_MAP = {
31
+ "asm_Beng": "as", "ben_Beng": "bn", "brx_Deva": "br", "doi_Deva": "doi",
32
+ "guj_Gujr": "gu", "hin_Deva": "hi", "kan_Knda": "kn", "kas_Arab": "ks",
33
+ "kas_Deva": "ks", "gom_Deva": "kok", "mai_Deva": "mai", "mal_Mlym": "ml",
34
+ "mni_Beng": "mni", "mar_Deva": "mr", "nep_Deva": "ne", "ory_Orya": "or",
35
+ "pan_Guru": "pa", "san_Deva": "sa", "sat_Olck": "sat", "snd_Arab": "sd",
36
+ "tam_Taml": "ta", "tel_Telu": "te", "urd_Arab": "ur"
37
  }
38
 
39
+ # Maps the ASR model's code back to a full name for display
40
+ ASR_CODE_TO_NAME = { "as": "Assamese", "bn": "Bengali", "br": "Bodo", "doi": "Dogri", "gu": "Gujarati", "hi": "Hindi", "kn": "Kannada", "ks": "Kashmiri", "kok": "Konkani", "mai": "Maithili", "ml": "Malayalam", "mni": "Manipuri", "mr": "Marathi", "ne": "Nepali", "or": "Odia", "pa": "Punjabi", "sa": "Sanskrit", "sat": "Santali", "sd": "Sindhi", "ta": "Tamil", "te": "Telugu", "ur": "Urdu"}
80
  @spaces.GPU
81
+ def transcribe_audio_with_lid(audio_path):
82
  if not audio_path:
83
  return "Please provide an audio file.", "", ""
84
 
85
  try:
86
+ # Load and preprocess audio
87
  waveform, sr = torchaudio.load(audio_path)
88
+ # Resample for both models
89
+ waveform_16k = torchaudio.functional.resample(waveform, sr, 16000)
90
  except Exception as e:
91
  return f"Error loading audio: {e}", "", ""
92
 
93
  try:
94
+ # 1. --- Language Identification ---
95
+ inputs = lid_processor(waveform_16k.squeeze(), sampling_rate=16000, return_tensors="pt").to(device)
96
  with torch.no_grad():
97
+ outputs = lid_model(**inputs)
98
 
99
+ # Get the top predicted language ID from the LID model
100
+ predicted_lid_id = outputs.logits.argmax(-1).item()
101
+ # The model.config.id2label gives us the language code like "hin_Deva"
102
+ detected_lid_code = lid_model.config.id2label[predicted_lid_id]
103
 
104
+ # 2. --- Map to ASR Language Code ---
105
+ asr_lang_code = LID_TO_ASR_LANG_MAP.get(detected_lid_code)
106
+
107
+ if not asr_lang_code:
108
+ detected_lang_str = f"Detected '{detected_lid_code}', which is not supported by the ASR model."
109
+ return detected_lang_str, "N/A", "N/A"
110
 
111
+ detected_lang_str = f"Detected Language: {ASR_CODE_TO_NAME.get(asr_lang_code, 'Unknown')}"
112
 
113
+ # 3. --- Transcription using the detected language ---
114
  with torch.no_grad():
115
+ # Use the ASR model with the correctly identified language code
116
+ transcription_ctc = asr_model(waveform_16k.to(device), asr_lang_code, "ctc")
117
+ transcription_rnnt = asr_model(waveform_16k.to(device), asr_lang_code, "rnnt")
118
 
119
  except Exception as e:
120
+ return f"Error during processing: {str(e)}", "", ""
121
 
122
  return detected_lang_str, transcription_ctc.strip(), transcription_rnnt.strip()
123
 
124
+ # --- Gradio UI ---
125
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
126
  gr.Markdown(f"## {DESCRIPTION}")
127
+ gr.Markdown("Upload or record audio in any of the 22 supported Indian languages. The app will automatically detect the language and provide the transcription.")
128
 
129
  with gr.Row():
130
  with gr.Column(scale=1):
 
139
  ctc_output = gr.Textbox(lines=3, label="CTC Output")
140
 
141
  transcribe_btn.click(
142
+ fn=transcribe_audio_with_lid,
143
  inputs=[audio],
144
  outputs=[detected_lang_output, ctc_output, rnnt_output],
145
  api_name="transcribe"