Spaces:

SagarVelamuri
/

TranslationSpace

Sleeping

App Files Files Community

SagarVelamuri committed on Oct 13

Commit

fffb78c

verified ·

1 Parent(s): a0b4a31

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -124

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os, re, types, traceback, torch, gradio as gr
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from IndicTransToolkit import IndicProcessor
 # --------------------- Device ---------------------
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -13,64 +14,18 @@ TE_CODE  = "tel_Telu"
 ip = IndicProcessor(inference=True)
-# --------------------- Regex / Helpers ---------------------
-TAG_REGEX = re.compile(
-    r"(?:_src\S+)|(?:tgt\S+)|"
-    r"(?:>>\s*\S+\s*<<)|"
-    r"\b(?:eng_Latn|hin_Deva|hin_deva|tel_Telu|tel_telu)\b|"
-    r"<ID\d*>"
-)
-def strip_lang_tags(text: str) -> str:
-    s = TAG_REGEX.sub(" ", text)
-    return re.sub(r"\s{2,}", " ", s).strip()
-def ensure_hindi_danda(s: str) -> str:
-    s = re.sub(r"\.\s*$", "।", s)
-    if not re.search(r"[।?!…]\s*$", s) and re.search(r"[\u0900-\u097F]\s*$", s):
-        s += "।"
-    return s
-# Sentence splitting (pysbd or fallback)
-try:
-    import pysbd
-    _SEGMENTER = pysbd.Segmenter(language="en", clean=True)
-except Exception:
-    _SEGMENTER = None
-_LEGAL_JOIN_RE = re.compile(r'\b([A-Za-z]{1,6})\.\s*$')
-_NEXT_CONT_RE  = re.compile(r'^\s*(?:[\(\[\{]|\d|[a-z])')
-def _merge_legal_abbrev_breaks(sents):
-    merged, i = [], 0
-    while i < len(sents):
-        cur = sents[i].strip()
-        while i + 1 < len(sents):
-            nxt = sents[i + 1].lstrip()
-            if _LEGAL_JOIN_RE.search(cur) and _NEXT_CONT_RE.match(nxt):
-                cur = f"{cur} {nxt}"
-                i += 1
-            else:
-                break
-        merged.append(cur)
-        i += 1
-    return [s for s in merged if s]
-def split_into_sentences(text: str):
-    if _SEGMENTER is not None:
-        return _merge_legal_abbrev_breaks(_SEGMENTER.segment(text))
-    PLACEHOLDER = "\uE000"
-    protected = re.sub(
-        r'\b([A-Za-z]{1,6})\.(?=\s*(?:[\(\[\{]|\d|[a-z]))',
-        r'\1' + PLACEHOLDER, text.strip()
-    )
-    protected = re.sub(
-        r'\b([A-Za-z]{1,5})\.(?=\s+[A-Z])',
-        r'\1' + PLACEHOLDER, protected
-    )
-    parts = re.split(r'(?<=[.?!])\s+', protected)
-    return _merge_legal_abbrev_breaks([p.replace(PLACEHOLDER, '.') for p in parts if p.strip()])
 # --------------------- Model Loader ---------------------
 MODELS = {
@@ -92,10 +47,10 @@ def load_model(model_name: str):
     )
     mdl = AutoModelForSeq2SeqLM.from_pretrained(
         model_name, trust_remote_code=True,
-        low_cpu_mem_usage=True, dtype=dtype, token = token
     ).to(device).eval()
-    # Fix vocab (some HF models have mismatched config.vocab_size)
     try:
         mdl.config.vocab_size = mdl.get_output_embeddings().weight.shape[0]
     except Exception:
@@ -104,100 +59,63 @@ def load_model(model_name: str):
     _model_cache[model_name] = (tok, mdl)
     return tok, mdl
-def build_bad_words_ids_from_vocab(tok):
-    vocab = tok.get_vocab()
-    candidates = [
-        "eng_Latn","hin_Deva","hin_deva","tel_Telu","tel_telu",
-        "_srceng_Latn","tgthin_Deva","tgt_tel_Telu",
-        ">>hin_Deva<<",">>tel_Telu<<",
-    ] + [f"<ID{i}>" for i in range(10)]
-    out = []
-    for c in candidates:
-        if c in vocab:
-            out.append([vocab[c]])
-            continue
-        sp_c = "▁" + c
-        if sp_c in vocab:
-            out.append([vocab[sp_c]])
-    return out
 # --------------------- Streaming Translation ---------------------
-BATCH_SIZE = 6
 @torch.inference_mode()
 def translate_dual_stream(text, model_choice, num_beams, max_new):
-    """
-    Generator that yields (hindi_accumulated_text, telugu_accumulated_text)
-    after each processed batch so the UI updates progressively.
-    """
     if not text or not text.strip():
         yield "", ""
         return
-    # Prepare once
     tok, mdl = load_model(MODELS[model_choice])
-    BAD_WORDS_IDS = build_bad_words_ids_from_vocab(tok)
     sentences = split_into_sentences(text)
     hi_acc, te_acc = [], []
-    # Clear outputs immediately for a snappy feel
     yield "", ""
-    for i in range(0, len(sentences), BATCH_SIZE):
-        batch = sentences[i:i + BATCH_SIZE]
-        # --- Hindi batch ---
         try:
-            proc_hi = ip.preprocess_batch(batch, src_lang=SRC_CODE, tgt_lang=HI_CODE)
-            enc_hi  = tok(
-                proc_hi, padding=True, truncation=True, max_length=256, return_tensors="pt"
-            ).to(device)
-            out_hi  = mdl.generate(
                 **enc_hi,
-                max_length=max_new,              # keep semantics same as your original
                 num_beams=int(num_beams),
                 early_stopping=True,
                 no_repeat_ngram_size=3,
-                use_cache=False,
-                bad_words_ids=BAD_WORDS_IDS if BAD_WORDS_IDS else None
             )
-            dec_hi  = tok.batch_decode(out_hi, skip_special_tokens=True)
-            dec_hi  = [strip_lang_tags(t) for t in dec_hi]
             post_hi = ip.postprocess_batch(dec_hi, lang=HI_CODE)
-            post_hi = [ensure_hindi_danda(x) for x in post_hi]
-            hi_acc.extend(p.strip() for p in post_hi)
         except Exception as e:
-            hi_acc.append(f"⚠️ Hindi failed (batch {i//BATCH_SIZE+1}): {e}")
-        # --- Telugu batch ---
         try:
-            proc_te = ip.preprocess_batch(batch, src_lang=SRC_CODE, tgt_lang=TE_CODE)
-            enc_te  = tok(
-                proc_te, padding=True, truncation=True, max_length=256, return_tensors="pt"
-            ).to(device)
-            out_te  = mdl.generate(
                 **enc_te,
-                max_length=max_new,
                 num_beams=int(num_beams),
                 early_stopping=True,
                 no_repeat_ngram_size=3,
-                use_cache=False,
-                bad_words_ids=BAD_WORDS_IDS if BAD_WORDS_IDS else None
             )
-            dec_te  = tok.batch_decode(out_te, skip_special_tokens=True)
-            dec_te  = [strip_lang_tags(t) for t in dec_te]
             post_te = ip.postprocess_batch(dec_te, lang=TE_CODE)
-            te_acc.extend(p.strip() for p in post_te)
         except Exception as e:
-            te_acc.append(f"⚠️ Telugu failed (batch {i//BATCH_SIZE+1}): {e}")
-        # Stream the accumulators so far
         yield (" ".join(hi_acc), " ".join(te_acc))
 # --------------------- Dark Theme ---------------------
 THEME = gr.themes.Soft(
     primary_hue="blue", neutral_hue="slate"
@@ -267,13 +185,13 @@ button { border-radius:8px !important; font-weight:600 !important; }
 with gr.Blocks(theme=THEME, css=CUSTOM_CSS, title="EN → HI/TE Translator") as demo:
     with gr.Group(elem_id="hdr"):
         gr.Markdown("<h1>English → Hindi & Telugu Translator</h1>")
-        gr.Markdown("<p>IndicTrans2 with batch sentence decomposition</p>")
     model_choice = gr.Dropdown(
         label="Choose Model",
         choices=list(MODELS.keys()),
         value="Default (Public)",
-        elem_id="model_dd"          # <-- for targeted styling
     )
     with gr.Row():
@@ -296,10 +214,10 @@ with gr.Blocks(theme=THEME, css=CUSTOM_CSS, title="EN → HI/TE Translator") as
         with gr.Column(scale=1):
             with gr.Group(elem_classes="panel"):
                 gr.Markdown("<h2>Settings</h2>")
-                num_beams   = gr.Slider(1, 8, value=4, step=1, label="Beam Search", elem_id="model_dd")
-                max_new     = gr.Slider(32, 512, value=128, step=16, label="Max New Tokens", elem_id="model_dd")
-    # Use streaming generator
     translate_btn.click(
         translate_dual_stream,
         inputs=[src, model_choice, num_beams, max_new],

 import os, re, types, traceback, torch, gradio as gr
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from IndicTransToolkit import IndicProcessor
+import spacy
 # --------------------- Device ---------------------
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 ip = IndicProcessor(inference=True)
+# --------------------- Sentence Splitting (spaCy) ---------------------
+nlp = spacy.load("en_core_web_sm")
+def split_into_sentences(text):
+    """Split English text into sentences using spaCy."""
+    doc = nlp(text.strip())
+    return [sent.text.strip() for sent in doc.sents if sent.text.strip()]
+# --------------------- Cleanup Helper ---------------------
+def clean_translation(text):
+    """Remove unresolved placeholder tags such as <ID1>, <ID2>."""
+    return re.sub(r"<ID\d+>", "", text).strip()
 # --------------------- Model Loader ---------------------
 MODELS = {
     )
     mdl = AutoModelForSeq2SeqLM.from_pretrained(
         model_name, trust_remote_code=True,
+        low_cpu_mem_usage=True, dtype=dtype, token=token
     ).to(device).eval()
+    # Fix vocab mismatch if any
     try:
         mdl.config.vocab_size = mdl.get_output_embeddings().weight.shape[0]
     except Exception:
     _model_cache[model_name] = (tok, mdl)
     return tok, mdl
 # --------------------- Streaming Translation ---------------------
 def _translate_sentence(tok, mdl, sentence, tgt_lang, num_beams, max_new):
     """Translate one sentence into ``tgt_lang`` and return the cleaned text.

     Runs preprocess -> tokenize -> generate -> decode -> postprocess for a
     single-sentence batch and strips leftover placeholder tags.
     """
     batch = ip.preprocess_batch([sentence], src_lang=SRC_CODE, tgt_lang=tgt_lang)
     enc = tok(batch, max_length=256, truncation=True, padding=True, return_tensors="pt").to(device)
     out = mdl.generate(
         **enc,
         max_length=max_new,
         num_beams=num_beams,
         do_sample=False,
         early_stopping=True,
         no_repeat_ngram_size=3,
         # NOTE(review): caching is disabled exactly as before; presumably a
         # compatibility workaround for the remote model code - confirm.
         use_cache=False
     )
     dec = tok.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)
     post = ip.postprocess_batch(dec, lang=tgt_lang)
     return clean_translation(post[0])
 @torch.inference_mode()
 def translate_dual_stream(text, model_choice, num_beams, max_new):
     """Generator that yields progressive Hindi & Telugu translations one sentence at a time.

     Args:
         text: English source text; blank or ``None`` yields one empty pair.
         model_choice: Key into ``MODELS`` selecting the checkpoint to load.
         num_beams: Beam count; cast to ``int`` before generation.
         max_new: Value passed to ``generate`` as ``max_length`` (cast to ``int``).

     Yields:
         ``(hindi_text, telugu_text)`` tuples holding the space-joined
         translations accumulated so far. A failure for one language on one
         sentence is recorded as a warning string in that language's output
         and does not stop the stream.
     """
     if not text or not text.strip():
         yield "", ""
         return
     tok, mdl = load_model(MODELS[model_choice])
     sentences = split_into_sentences(text)
     hi_acc, te_acc = [], []
     # Loop-invariant casts: the sliders may deliver floats.
     beams = int(num_beams)
     max_len = int(max_new)
     # Yield empty for immediate UI update
     yield "", ""
     for i, sentence in enumerate(sentences, 1):
         # Same pipeline for both targets; only the language code, the
         # accumulator and the label in the error message differ.
         for label, tgt_lang, acc in (("Hindi", HI_CODE, hi_acc), ("Telugu", TE_CODE, te_acc)):
             try:
                 acc.append(_translate_sentence(tok, mdl, sentence, tgt_lang, beams, max_len))
             except Exception as e:
                 # Deliberate best-effort: surface the failure in the output
                 # and keep translating the remaining sentences.
                 acc.append(f"⚠️ {label} failed (sentence {i}): {e}")
         # Stream progressive output
         yield (" ".join(hi_acc), " ".join(te_acc))
 # --------------------- Dark Theme ---------------------
 THEME = gr.themes.Soft(
     primary_hue="blue", neutral_hue="slate"
 with gr.Blocks(theme=THEME, css=CUSTOM_CSS, title="EN → HI/TE Translator") as demo:
     with gr.Group(elem_id="hdr"):
         gr.Markdown("<h1>English → Hindi & Telugu Translator</h1>")
+        gr.Markdown("<p>IndicTrans2 with simplified preprocessing and sentence-wise translation</p>")
     model_choice = gr.Dropdown(
         label="Choose Model",
         choices=list(MODELS.keys()),
         value="Default (Public)",
+        elem_id="model_dd"
     )
     with gr.Row():
         with gr.Column(scale=1):
             with gr.Group(elem_classes="panel"):
                 gr.Markdown("<h2>Settings</h2>")
+                num_beams = gr.Slider(1, 8, value=4, step=1, label="Beam Search", elem_id="model_dd")
+                max_new   = gr.Slider(32, 512, value=128, step=16, label="Max New Tokens", elem_id="model_dd")
+    # Stream generator connection
     translate_btn.click(
         translate_dual_stream,
         inputs=[src, model_choice, num_beams, max_new],