Update app.py
Browse files
app.py
CHANGED
|
@@ -1,136 +1,130 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
import torch
|
| 3 |
import torchaudio
|
| 4 |
import gradio as gr
|
| 5 |
import spaces
|
| 6 |
-
from transformers import AutoModel
|
| 7 |
import re
|
| 8 |
|
| 9 |
-
DESCRIPTION = "IndicConformer
|
| 10 |
-
|
| 11 |
-
# --- Data Dictionaries ---
|
| 12 |
-
|
| 13 |
-
# Dictionary for character sets, now with improved formatting for readability.
|
| 14 |
-
LANGUAGE_CHARSETS = {
|
| 15 |
-
"as": set(['অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ', 'ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 'ঞ', 'ট', 'ঠ', 'ড', 'ঢ', 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন', 'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'ৰ', 'ল', 'ৱ', 'শ', 'ষ', 'স', 'হ', 'ৎ', 'ং', 'ঃ', 'ঽ', 'া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ', '্']),
|
| 16 |
-
"bn": set(['অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ', 'ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 'ঞ', 'ট', 'ঠ', 'ড', 'ঢ', 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন', 'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'র', 'ল', 'শ', 'ষ', 'স', 'হ', 'ৎ', 'ং', 'ঃ', 'ঽ', 'া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ', '্', 'ড়', 'ঢ়', 'য়']),
|
| 17 |
-
"br": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', '़', 'ा', 'ि', 'ी', 'ु', 'ূ', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
|
| 18 |
-
"doi": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ج', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'े', 'ै', 'ो', 'ौ', '्']),
|
| 19 |
-
"gu": set(['અ', 'આ', 'ઇ', 'ઈ', 'ઉ', 'ઊ', 'ઋ', 'એ', 'ઐ', 'ઓ', 'ઔ', 'ક', 'ખ', 'ગ', 'ઘ', 'ઙ', 'ચ', 'છ', 'જ', 'ઝ', 'ઞ', 'ટ', 'ઠ', 'ડ', 'ઢ', 'ણ', 'ત', 'થ', 'દ', 'ધ', 'ન', 'પ', 'ફ', 'બ', 'ભ', 'મ', 'ય', 'ર', 'લ', 'ળ', 'વ', 'શ', 'ષ', 'સ', 'હ', '઼', 'ા', 'િ', 'ી', 'ુ', 'ૂ', 'ૃ', 'ે', 'ૈ', 'ો', 'ૌ', '્']),
|
| 20 |
-
"hi": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
|
| 21 |
-
"kn": set(['ಅ', 'ಆ', 'ಇ', 'ಈ', 'ಉ', 'ಊ', 'ಋ', 'ಎ', 'ಏ', 'ಐ', 'ಒ', 'ಓ', 'ಔ', 'ಕ', 'ಖ', 'ಗ', 'ಘ', 'ಙ', 'ಚ', 'ಛ', 'ಜ', 'ಝ', 'ಞ', 'ಟ', 'ಠ', 'ಡ', 'ಢ', 'ಣ', 'ತ', 'ಥ', 'ದ', 'ಧ', 'ನ', 'ಪ', 'ಫ', 'ಬ', 'ಭ', 'ಮ', 'ಯ', 'ರ', 'ಲ', 'ವ', 'ಶ', 'ಷ', 'ಸ', 'ಹ', 'ಳ', 'ಱ', 'ಾ', 'ಿ', 'ೀ', 'ು', 'ೂ', 'ೃ', 'ೆ', 'ೇ', 'ೈ', 'ೊ', 'ೋ', 'ೌ', '್']),
|
| 22 |
-
"ks": set(['ا', 'آ', 'ب', 'پ', 'ت', 'ٹ', 'ث', 'ج', 'چ', 'ح', 'خ', 'د', 'ڈ', 'ذ', 'ر', 'ڑ', 'ز', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن', 'و', 'ھ', 'ء', 'ی', 'ی', 'ے']),
|
| 23 |
-
"kok": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
|
| 24 |
-
"mai": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
|
| 25 |
-
"ml": set(['അ', 'ആ', 'ഇ', 'ഈ', 'ഉ', 'ഊ', 'ഋ', 'എ', 'ഏ', 'ഐ', 'ഒ', 'ഓ', 'ഔ', 'ക', 'ഖ', 'ഗ', 'ഘ', 'ങ', 'ച', 'ഛ', 'ജ', 'ഝ', 'ഞ', 'ട', 'ഠ', 'ഡ', 'ഢ', 'ണ', 'ത', 'ഥ', 'ദ', 'ധ', 'ന', 'പ', 'ഫ', 'ബ', 'ഭ', 'മ', 'യ', 'ര', 'ല', 'വ', 'ശ', 'ഷ', 'സ', 'ഹ', 'ള', 'ഴ', 'റ', 'ാ', 'ി', 'ീ', 'ു', 'ൂ', 'ൃ', 'െ', 'േ', 'ൈ', 'ൊ', 'ോ', 'ൌ', '്']),
|
| 26 |
-
"mni": set(['ꯑ', '꯲', '꯳', '꯴', '꯵', '꯶', '꯷', '꯸', '꯹', '', '', '', '', '', '', 'ꯀ', 'ꯂ', 'ꯃ', 'ꯄ', 'ꯅ', 'ꯆ', 'ꯇ', 'ꯈ', 'ꯉ', 'ꯊ', 'ꯋ', 'ꯌ', 'ꯍ', 'ꯎ', 'ꯏ', 'ꯐ', 'ꯑ']),
|
| 27 |
-
"mr": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्', 'ळ']),
|
| 28 |
-
"ne": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
|
| 29 |
-
"or": set(['ଅ', 'ଆ', 'ଇ', 'ଈ', 'ଉ', 'ଊ', 'ଋ', 'ଏ', 'ଐ', 'ଓ', 'ଔ', 'କ', 'ଖ', 'ଗ', 'ଘ', 'ଙ', 'ଚ', 'ଛ', 'ଜ', 'ଝ', 'ଞ', 'ଟ', 'ଠ', 'ଡ', 'ଢ', 'ଣ', 'ତ', 'ଥ', 'ଦ', 'ଧ', 'ନ', 'ପ', 'ଫ', 'ବ', 'ଭ', 'ମ', 'ଯ', 'ର', 'ଲ', 'ଳ', 'ବ', 'ଶ', 'ଷ', 'ସ', 'ହ', 'କ୍ଷ', 'ା', 'ି', 'ୀ', 'ୁ', 'ୂ', 'ୃ', 'େ', 'ୈ', 'ୋ', 'ୌ', '୍']),
|
| 30 |
-
"pa": set(['ਅ', 'ਆ', 'ਇ', 'ਈ', 'ਉ', 'ਊ', 'ਏ', 'ਐ', 'ਓ', 'ਔ', 'ਕ', 'ਖ', 'ਗ', 'ਘ', 'ਙ', 'ਚ', 'ਛ', 'ਜ', 'ਝ', 'ਞ', 'ਟ', 'ਠ', 'ਡ', 'ਢ', 'ਣ', 'ਤ', 'ਥ', 'ਦ', 'ਧ', 'ਨ', 'ਪ', 'ਫ', 'ਬ', 'ਭ', 'ਮ', 'ਯ', 'ਰ', 'ਲ', 'ਵ', 'ਸ਼', 'ਸ', 'ਹ', 'ਖ਼', 'ਗ਼', 'ਜ਼', 'ੜ', 'ਫ਼', 'ਲ਼', 'ਿ', 'ੀ', 'ੁ', 'ੂ', 'ੇ', 'ੈ', 'ੋ', 'ੌ', '੍']),
|
| 31 |
-
"sa": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
|
| 32 |
-
"sat": set(['ᱚ', 'ᱛ', 'ᱚ', 'ᱜ', 'ᱚ', 'ᱝ', 'ᱞ', 'ᱟ', 'ᱠ', 'ᱥ', 'ᱮ', 'ᱫ', 'ఇ', 'ᱤ', 'ᱩ', 'ੂ', 'େ', 'ୈ', 'ᱪ', 'ᱡ', 'ᱭ']),
|
| 33 |
-
"sd": set(['ا', 'آ', 'ب', 'ڀ', 'ت', 'ٽ', 'ث', 'پ', 'ج', 'ڄ', 'جھ', 'چ', 'ح', 'خ', 'ڌ', 'د', 'ڏ', 'ڊ', 'ذ', 'ر', 'ز', 'ڙ', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ڦ', 'ق', 'ڪ', 'ک', 'گ', 'ڳ', 'ڱ', 'ل', 'م', 'ن', 'و', 'ھ', 'ء', 'ي']),
|
| 34 |
-
"ta": set(['அ', 'ஆ', 'இ', 'ஈ', 'உ', 'ஊ', 'எ', 'ஏ', 'ஐ', 'ஒ', 'ஓ', 'ஔ', 'க', 'ங', 'ச', 'ஞ', 'ட', 'ண', 'த', 'ந', 'ன', 'ப', 'ம', 'ய', 'ர', 'ல', 'வ', 'ழ', 'ள', 'ஷ', 'ஸ', 'ஹ']),
|
| 35 |
-
"te": set(['అ', 'ఆ', 'ఇ', 'ఈ', 'ఉ', 'ఊ', 'ఋ', 'ఎ', 'ఏ', 'ఐ', 'ఒ', 'ఓ', 'ఔ', 'క', 'ఖ', 'గ', 'ఘ', 'ఙ', 'చ', 'ఛ', 'జ', 'ఝ', 'ఞ', 'ట', 'ఠ', 'డ', 'ఢ', 'ణ', 'త', 'థ', 'ద', 'ధ', 'న', 'ప', 'ఫ', 'బ', 'భ', 'మ', 'య', 'ర', 'ల', 'వ', 'శ', 'ష', 'స', 'హ', 'ళ', 'క్ష', 'ఱ', 'ా', 'ి', 'ీ', 'ు', 'ూ', 'ృ', 'ె', 'ే', 'ై', 'ొ', 'ో', 'ౌ', '్']),
|
| 36 |
-
"ur": set(['ا', 'آ', 'ب', 'پ', 'ت', 'ٹ', 'ث', 'ج', 'چ', 'ح', 'خ', 'د', 'ڈ', 'ذ', 'ر', 'ڑ', 'ز', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن', 'و', 'ھ', 'ء', 'ی', 'ی', 'ے']),
|
| 37 |
-
}
|
| 38 |
|
| 39 |
-
#
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
"
|
| 59 |
-
"
|
| 60 |
-
"
|
| 61 |
-
"
|
| 62 |
-
"
|
|
|
|
| 63 |
}
|
| 64 |
|
| 65 |
-
#
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 68 |
|
| 69 |
-
# --- Model
|
| 70 |
-
print("Loading
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
return identified_code
|
| 96 |
|
|
|
|
|
|
|
| 97 |
@spaces.GPU
|
| 98 |
-
def
|
| 99 |
if not audio_path:
|
| 100 |
return "Please provide an audio file.", "", ""
|
| 101 |
|
| 102 |
try:
|
|
|
|
| 103 |
waveform, sr = torchaudio.load(audio_path)
|
| 104 |
-
|
| 105 |
-
|
| 106 |
except Exception as e:
|
| 107 |
return f"Error loading audio: {e}", "", ""
|
| 108 |
|
| 109 |
try:
|
|
|
|
|
|
|
| 110 |
with torch.no_grad():
|
| 111 |
-
|
| 112 |
|
| 113 |
-
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
-
detected_lang_str = f"Detected Language: {
|
| 120 |
|
|
|
|
| 121 |
with torch.no_grad():
|
| 122 |
-
|
| 123 |
-
|
|
|
|
| 124 |
|
| 125 |
except Exception as e:
|
| 126 |
-
return f"Error during
|
| 127 |
|
| 128 |
return detected_lang_str, transcription_ctc.strip(), transcription_rnnt.strip()
|
| 129 |
|
| 130 |
-
#
|
| 131 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 132 |
gr.Markdown(f"## {DESCRIPTION}")
|
| 133 |
-
gr.Markdown("Upload or record audio in any of the 22 supported Indian languages. The app will automatically detect the language and provide the transcription
|
| 134 |
|
| 135 |
with gr.Row():
|
| 136 |
with gr.Column(scale=1):
|
|
@@ -145,7 +139,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 145 |
ctc_output = gr.Textbox(lines=3, label="CTC Output")
|
| 146 |
|
| 147 |
transcribe_btn.click(
|
| 148 |
-
fn=
|
| 149 |
inputs=[audio],
|
| 150 |
outputs=[detected_lang_output, ctc_output, rnnt_output],
|
| 151 |
api_name="transcribe"
|
|
|
|
| 1 |
+
from __future__ import annotationsfrom __future__ import annotations
|
| 2 |
import torch
|
| 3 |
import torchaudio
|
| 4 |
import gradio as gr
|
| 5 |
import spaces
|
| 6 |
+
from transformers import AutoModel, AutoProcessor, Wav2Vec2ForCTC
|
| 7 |
import re
|
| 8 |
|
| 9 |
+
DESCRIPTION = "IndicConformer ASR with Automatic Language Identification"
|
| 10 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
+
# --- ASR Model (The one we used before) ---
|
| 13 |
+
print("Loading ASR model (IndicConformer)...")
|
| 14 |
+
asr_model_id = "ai4bharat/indic-conformer-600m-multilingual"
|
| 15 |
+
asr_model = AutoModel.from_pretrained(asr_model_id, trust_remote_code=True).to(device)
|
| 16 |
+
asr_model.eval()
|
| 17 |
+
print("✅ ASR Model loaded.")
|
| 18 |
+
|
| 19 |
+
# --- Language Identification (LID) Model ---
|
| 20 |
+
print("\nLoading Language ID model (MMS-LID)...")
|
| 21 |
+
lid_model_id = "facebook/mms-lid"
|
| 22 |
+
lid_processor = AutoProcessor.from_pretrained(lid_model_id)
|
| 23 |
+
lid_model = AutoModel.from_pretrained(lid_model_id).to(device)
|
| 24 |
+
lid_model.eval()
|
| 25 |
+
print("✅ Language ID Model loaded.")
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# --- Language Mappings ---
|
| 29 |
+
# Maps the LID model's output code to the ASR model's code
|
| 30 |
+
LID_TO_ASR_LANG_MAP = {
|
| 31 |
+
"asm_Beng": "as", "ben_Beng": "bn", "brx_Deva": "br", "doi_Deva": "doi",
|
| 32 |
+
"guj_Gujr": "gu", "hin_Deva": "hi", "kan_Knda": "kn", "kas_Arab": "ks",
|
| 33 |
+
"kas_Deva": "ks", "gom_Deva": "kok", "mai_Deva": "mai", "mal_Mlym": "ml",
|
| 34 |
+
"mni_Beng": "mni", "mar_Deva": "mr", "nep_Deva": "ne", "ory_Orya": "or",
|
| 35 |
+
"pan_Guru": "pa", "san_Deva": "sa", "sat_Olck": "sat", "snd_Arab": "sd",
|
| 36 |
+
"tam_Taml": "ta", "tel_Telu": "te", "urd_Arab": "ur"
|
| 37 |
}
|
| 38 |
|
| 39 |
+
# Maps the ASR model's code back to a full name for display
|
| 40 |
+
ASR_CODE_TO_NAME = { "as": "Assamese", "bn": "Bengali", "br": "Bodo", "doi": "Dogri", "gu": "Gujarati", "hi": "Hindi", "kn": "Kannada", "ks": "Kashmiri", "kok": "Konkani", "mai": "Maithili", "ml": "Malayalam", "mni": "Manipuri", "mr": "Marathi", "ne": "Nepali", "or": "Odia", "pa": "Punjabi", "sa": "Sanskrit", "sat": "Santali", "sd": "Sindhi", "ta": "Tamil", "te": "Telugu", "ur": "Urdu"}
|
| 41 |
+
import torch
|
| 42 |
+
import torchaudio
|
| 43 |
+
import gradio as gr
|
| 44 |
+
import spaces
|
| 45 |
+
from transformers import AutoModel, AutoProcessor, Wav2Vec2ForCTC
|
| 46 |
+
import re
|
| 47 |
+
|
| 48 |
+
DESCRIPTION = "IndicConformer ASR with Automatic Language Identification"
|
| 49 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 50 |
|
| 51 |
+
# --- ASR Model (The one we used before) ---
|
| 52 |
+
print("Loading ASR model (IndicConformer)...")
|
| 53 |
+
asr_model_id = "ai4bharat/indic-conformer-600m-multilingual"
|
| 54 |
+
asr_model = AutoModel.from_pretrained(asr_model_id, trust_remote_code=True).to(device)
|
| 55 |
+
asr_model.eval()
|
| 56 |
+
print("✅ ASR Model loaded.")
|
| 57 |
+
|
| 58 |
+
# --- Language Identification (LID) Model ---
|
| 59 |
+
print("\nLoading Language ID model (MMS-LID)...")
|
| 60 |
+
lid_model_id = "facebook/mms-lid"
|
| 61 |
+
lid_processor = AutoProcessor.from_pretrained(lid_model_id)
|
| 62 |
+
lid_model = AutoModel.from_pretrained(lid_model_id).to(device)
|
| 63 |
+
lid_model.eval()
|
| 64 |
+
print("✅ Language ID Model loaded.")
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# --- Language Mappings ---
|
| 68 |
+
# Maps the LID model's output code to the ASR model's code
|
| 69 |
+
LID_TO_ASR_LANG_MAP = {
|
| 70 |
+
"asm_Beng": "as", "ben_Beng": "bn", "brx_Deva": "br", "doi_Deva": "doi",
|
| 71 |
+
"guj_Gujr": "gu", "hin_Deva": "hi", "kan_Knda": "kn", "kas_Arab": "ks",
|
| 72 |
+
"kas_Deva": "ks", "gom_Deva": "kok", "mai_Deva": "mai", "mal_Mlym": "ml",
|
| 73 |
+
"mni_Beng": "mni", "mar_Deva": "mr", "nep_Deva": "ne", "ory_Orya": "or",
|
| 74 |
+
"pan_Guru": "pa", "san_Deva": "sa", "sat_Olck": "sat", "snd_Arab": "sd",
|
| 75 |
+
"tam_Taml": "ta", "tel_Telu": "te", "urd_Arab": "ur"
|
| 76 |
+
}
|
|
|
|
| 77 |
|
| 78 |
+
# Maps the ASR model's code back to a full name for display
|
| 79 |
+
ASR_CODE_TO_NAME = { "as": "Assamese", "bn": "Bengali", "br": "Bodo", "doi": "Dogri", "gu": "Gujarati", "hi": "Hindi", "kn": "Kannada", "ks": "Kashmiri", "kok": "Konkani", "mai": "Maithili", "ml": "Malayalam", "mni": "Manipuri", "mr": "Marathi", "ne": "Nepali", "or": "Odia", "pa": "Punjabi", "sa": "Sanskrit", "sat": "Santali", "sd": "Sindhi", "ta": "Tamil", "te": "Telugu", "ur": "Urdu"}
|
| 80 |
@spaces.GPU
def transcribe_audio_with_lid(audio_path):
    """Detect the spoken language of an audio file and transcribe it.

    Args:
        audio_path: Path to the uploaded/recorded audio file, or a falsy
            value when no audio was provided.

    Returns:
        A 3-tuple of strings: (detected-language message, CTC
        transcription, RNNT transcription). On error the first element
        carries the error message and the other two are empty or "N/A".
    """
    if not audio_path:
        return "Please provide an audio file.", "", ""

    try:
        # Load and preprocess the audio.
        waveform, sr = torchaudio.load(audio_path)
        # Downmix multi-channel recordings to mono so the .squeeze()
        # below yields a 1-D signal regardless of channel count.
        if waveform.size(0) > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        # Resample to 16 kHz for both models.
        waveform_16k = torchaudio.functional.resample(waveform, sr, 16000)
    except Exception as e:
        return f"Error loading audio: {e}", "", ""

    try:
        # 1. Language identification.
        inputs = lid_processor(waveform_16k.squeeze(), sampling_rate=16000, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = lid_model(**inputs)

        # Top predicted class index -> LID code such as "hin_Deva".
        # NOTE(review): relies on the LID model exposing classification
        # logits and id2label — confirm the checkpoint loads with a
        # classification head rather than a bare encoder.
        predicted_lid_id = outputs.logits.argmax(-1).item()
        detected_lid_code = lid_model.config.id2label[predicted_lid_id]

        # 2. Map the LID code onto the ASR model's language code.
        asr_lang_code = LID_TO_ASR_LANG_MAP.get(detected_lid_code)
        if not asr_lang_code:
            detected_lang_str = f"Detected '{detected_lid_code}', which is not supported by the ASR model."
            return detected_lang_str, "N/A", "N/A"

        detected_lang_str = f"Detected Language: {ASR_CODE_TO_NAME.get(asr_lang_code, 'Unknown')}"

        # 3. Transcribe with both decoders using the detected language.
        with torch.no_grad():
            transcription_ctc = asr_model(waveform_16k.to(device), asr_lang_code, "ctc")
            transcription_rnnt = asr_model(waveform_16k.to(device), asr_lang_code, "rnnt")
    except Exception as e:
        return f"Error during processing: {str(e)}", "", ""

    return detected_lang_str, transcription_ctc.strip(), transcription_rnnt.strip()
| 123 |
|
| 124 |
+
# Gradio UI (no major changes needed here)
|
| 125 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 126 |
gr.Markdown(f"## {DESCRIPTION}")
|
| 127 |
+
gr.Markdown("Upload or record audio in any of the 22 supported Indian languages. The app will automatically detect the language and provide the transcription.")
|
| 128 |
|
| 129 |
with gr.Row():
|
| 130 |
with gr.Column(scale=1):
|
|
|
|
| 139 |
ctc_output = gr.Textbox(lines=3, label="CTC Output")
|
| 140 |
|
| 141 |
transcribe_btn.click(
|
| 142 |
+
fn=transcribe_audio_with_lid,
|
| 143 |
inputs=[audio],
|
| 144 |
outputs=[detected_lang_output, ctc_output, rnnt_output],
|
| 145 |
api_name="transcribe"
|