Spaces:

MedSwin
/

MedAI_Processing

Sleeping

App Files Files Community

LiamKhoaLe committed on Oct 3

Commit

1d46eb9

1 Parent(s): cfa5d44

Upd vietnamese transl

Browse files

Files changed (12) hide show

Dockerfile +3 -0
README.md +12 -12
app.py +45 -9
requirements.txt +2 -0
trans_test.py +78 -0
utils/processor.py +30 -18
utils/rag.py +23 -8
vi/README.md +95 -0
vi/__init__.py +10 -0
vi/download.py +89 -0
vi/processing.py +95 -0
vi/translator.py +266 -0

Dockerfile CHANGED Viewed

@@ -16,6 +16,9 @@ RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt
 # Copy the application
 COPY --chown=user . .
 # Hugging Face cache setup
 ENV HF_HOME="$HOME/.cache/huggingface"
 ENV SENTENCE_TRANSFORMERS_HOME="$HOME/.cache/huggingface/sentence-transformers"

 # Copy the application
 COPY --chown=user . .
+# Download Vietnamese translation model
+RUN python vi/download.py
 # Hugging Face cache setup
 ENV HF_HOME="$HOME/.cache/huggingface"
 ENV SENTENCE_TRANSFORMERS_HOME="$HOME/.cache/huggingface/sentence-transformers"

README.md CHANGED Viewed

@@ -1,32 +1,32 @@
 ---
-title: MedAI Processing
 emoji: ⚕️
-colorFrom: indigo
-colorTo: blue
 sdk: docker
 pinned: false
 license: apache-2.0
-short_description: Process and centralise medical doc for llm finetuning
 ---
 ## Quick Access:
-[HF Space](https://huggingface.co/spaces/BinKhoaLe1812/MedAI_Processing)
-[MedDialog-100k](https://huggingface.co/datasets/BinKhoaLe1812/MedDialog-EN-100k)
-[MedDialog-100k](https://huggingface.co/datasets/BinKhoaLe1812/MedDialog-EN-10k)
-[PubMedQA-Labelled](https://huggingface.co/datasets/BinKhoaLe1812/PubMedQA-L)
-[PubMedQA-Unlabelled](https://huggingface.co/datasets/BinKhoaLe1812/PubMedQA-U)
-[PubMedQA-Mapper](https://huggingface.co/datasets/BinKhoaLe1812/PubMedQA-MAP)
 ## CURL Request Instruction
-[Request Doc](https://huggingface.co/spaces/MedAI-COS30018/MedAI_Processing/blob/main/REQUEST.md)
 ## License
-[Apache-2.0 LICENSE](https://huggingface.co/spaces/MedAI-COS30018/MedAI_Processing/blob/main/LICENSE.txt)

 ---
+title: MedVietAI Processing
 emoji: ⚕️
+colorFrom: green
+colorTo: pink
 sdk: docker
 pinned: false
 license: apache-2.0
+short_description: Data processing with en-vi translation. Derived from 500k mi
 ---
 ## Quick Access:
+[HF Space](https://huggingface.co/spaces/MedVietAI/processing)
+[MedDialog-100k](https://huggingface.co/datasets/MedAI-COS30018/MedDialog-EN-100k)
+[MedDialog-10k](https://huggingface.co/datasets/MedAI-COS30018/MedDialog-EN-10k)
+[PubMedQA-Labelled](https://huggingface.co/datasets/MedAI-COS30018/PubMedQA-L)
+[PubMedQA-Unlabelled](https://huggingface.co/datasets/MedAI-COS30018/PubMedQA-U)
+[PubMedQA-Mapper](https://huggingface.co/datasets/MedAI-COS30018/PubMedQA-MAP)
 ## CURL Request Instruction
+[Request Doc](https://huggingface.co/spaces/MedVietAI/processing/blob/main/REQUEST.md)
 ## License
+[Apache-2.0 LICENSE](https://huggingface.co/spaces/MedVietAI/processing/blob/main/LICENSE.txt)

app.py CHANGED Viewed

@@ -18,6 +18,7 @@ from utils.drive_saver import DriveSaver
 from utils.llm import Paraphraser
 from utils.schema import CentralisedWriter
 from utils.token import get_credentials, exchange_code, build_auth_url
 # ────────── Log ───────────
 logger = logging.getLogger("app")
@@ -53,6 +54,9 @@ paraphraser = Paraphraser(
     gemini_model_hard=os.getenv("GEMINI_MODEL_HARD", "gemini-2.5-flash"),
 )
 app = FastAPI(title="Medical Dataset Augmenter", version="1.1.0")
 STATE_LOCK = threading.Lock()
@@ -85,6 +89,7 @@ class ProcessParams(BaseModel):
     sample_limit: Optional[int] = None    # Set data sampling if needed
     seed: int = 42
     rag_processing: bool = False          # Enable RAG-specific processing
 def set_state(**kwargs):
     with STATE_LOCK:
@@ -122,6 +127,14 @@ def root():
       <div class="section">
         <h2>⚡ Quick Actions</h2>
         <p>Click a button below to start processing a dataset with default augmentation parameters.</p>
         <button onclick="startJob('healthcaremagic')">▶ProcAugment HealthCareMagic (100k)</button><br>
         <button onclick="startJob('icliniq')">▶ProcAugment iCliniq (10k-derived)</button><br>
         <button onclick="startJob('pubmedqa_l')">▶ProcAugment PubMedQA (Labelled)</button><br>
@@ -155,10 +168,10 @@ def root():
       <script>
         async function startJob(dataset) {{
           const log = document.getElementById("log");
-          const ragToggle = document.getElementById("ragToggle");
-          const isRagMode = ragToggle.checked;
-          log.innerHTML = "⏳ Starting " + (isRagMode ? "RAG " : "") + "job for <b>" + dataset + "</b>...";
           try {{
             const resp = await fetch("/process/" + dataset, {{
               method: "POST",
@@ -177,7 +190,8 @@ def root():
                 }},
                 sample_limit: null,          // Sample down (currently disabled)
                 seed: 42,
-                rag_processing: isRagMode
               }})
             }});
             const data = await resp.json();
@@ -193,14 +207,18 @@ def root():
         async function startRagJob(dataset) {{
           const log = document.getElementById("log");
-          log.innerHTML = "⏳ Starting RAG processing for <b>" + dataset + "</b>...";
           try {{
             const resp = await fetch("/rag/" + dataset, {{
               method: "POST",
               headers: {{ "Content-Type": "application/json" }},
               body: JSON.stringify({{
                 sample_limit: null,
-                seed: 42
               }})
             }});
             const data = await resp.json();
@@ -366,6 +384,18 @@ def _run_job(dataset_key: str, params: ProcessParams):
         # Writer
         writer = CentralisedWriter(jsonl_path=jsonl_path, csv_path=csv_path)
         if params.rag_processing:
             # RAG processing mode
             set_state(message="RAG processing", progress=0.1)
@@ -376,20 +406,26 @@ def _run_job(dataset_key: str, params: ProcessParams):
                 nvidia_model=os.getenv("NVIDIA_MODEL", "meta/llama-3.1-8b-instruct"),
                 sample_limit=params.sample_limit,
                 seed=params.seed,
-                progress_cb=lambda p, msg=None: set_state(progress=p, message=msg or STATE["message"])
             )
         else:
             # Standard SFT processing mode
             set_state(message="SFT processing", progress=0.1)
             count, stats = process_file_into_sft(
                 dataset_key=dataset_key,
                 input_path=local_path,
                 writer=writer,
                 paraphraser=paraphraser,
-                augment_opts=params.augment.dict(),
                 sample_limit=params.sample_limit,
                 seed=params.seed,
-                progress_cb=lambda p, msg=None: set_state(progress=p, message=msg or STATE["message"])
             )
         logger.info(f"[JOB] Processed dataset={dataset_key} rows={count} stats={stats}")
         writer.close()

 from utils.llm import Paraphraser
 from utils.schema import CentralisedWriter
 from utils.token import get_credentials, exchange_code, build_auth_url
+from vi.translator import VietnameseTranslator
 # ────────── Log ───────────
 logger = logging.getLogger("app")
     gemini_model_hard=os.getenv("GEMINI_MODEL_HARD", "gemini-2.5-flash"),
 )
+# Vietnamese translator
+vietnamese_translator = VietnameseTranslator()
 app = FastAPI(title="Medical Dataset Augmenter", version="1.1.0")
 STATE_LOCK = threading.Lock()
     sample_limit: Optional[int] = None    # Set data sampling if needed
     seed: int = 42
     rag_processing: bool = False          # Enable RAG-specific processing
+    vietnamese_translation: bool = False  # Enable Vietnamese translation
 def set_state(**kwargs):
     with STATE_LOCK:
       <div class="section">
         <h2>⚡ Quick Actions</h2>
         <p>Click a button below to start processing a dataset with default augmentation parameters.</p>
+        <div style="margin-bottom: 15px; padding: 10px; background: #f8f9fa; border-radius: 5px; border-left: 4px solid #2d89ef;">
+          <label style="display: flex; align-items: center; cursor: pointer;">
+            <input type="checkbox" id="vietnameseTranslation" style="margin-right: 8px; transform: scale(1.2);">
+            <strong>🇻🇳 Vietnamese Translation</strong> - Translate all content to Vietnamese before processing
+          </label>
+        </div>
         <button onclick="startJob('healthcaremagic')">▶ProcAugment HealthCareMagic (100k)</button><br>
         <button onclick="startJob('icliniq')">▶ProcAugment iCliniq (10k-derived)</button><br>
         <button onclick="startJob('pubmedqa_l')">▶ProcAugment PubMedQA (Labelled)</button><br>
       <script>
         async function startJob(dataset) {{
           const log = document.getElementById("log");
+          const vietnameseToggle = document.getElementById("vietnameseTranslation");
+          const isVietnameseMode = vietnameseToggle.checked;
+          log.innerHTML = "⏳ Starting job for <b>" + dataset + "</b>" + (isVietnameseMode ? " with Vietnamese translation" : "") + "...";
           try {{
             const resp = await fetch("/process/" + dataset, {{
               method: "POST",
                 }},
                 sample_limit: null,          // Sample down (currently disabled)
                 seed: 42,
+                rag_processing: false,
+                vietnamese_translation: isVietnameseMode
               }})
             }});
             const data = await resp.json();
         async function startRagJob(dataset) {{
           const log = document.getElementById("log");
+          const vietnameseToggle = document.getElementById("vietnameseTranslation");
+          const isVietnameseMode = vietnameseToggle.checked;
+          log.innerHTML = "⏳ Starting RAG processing for <b>" + dataset + "</b>" + (isVietnameseMode ? " with Vietnamese translation" : "") + "...";
           try {{
             const resp = await fetch("/rag/" + dataset, {{
               method: "POST",
               headers: {{ "Content-Type": "application/json" }},
               body: JSON.stringify({{
                 sample_limit: null,
+                seed: 42,
+                vietnamese_translation: isVietnameseMode
               }})
             }});
             const data = await resp.json();
         # Writer
         writer = CentralisedWriter(jsonl_path=jsonl_path, csv_path=csv_path)
+        # Load translator if Vietnamese translation is requested
+        translator = None
+        if params.vietnamese_translation:
+            set_state(message="Loading Vietnamese translator", progress=0.05)
+            try:
+                vietnamese_translator.load_model()
+                translator = vietnamese_translator
+                logger.info("✅ Vietnamese translator loaded successfully")
+            except Exception as e:
+                logger.error(f"❌ Failed to load Vietnamese translator: {e}")
+                set_state(message=f"Warning: Vietnamese translation failed - {e}", progress=0.1)
         if params.rag_processing:
             # RAG processing mode
             set_state(message="RAG processing", progress=0.1)
                 nvidia_model=os.getenv("NVIDIA_MODEL", "meta/llama-3.1-8b-instruct"),
                 sample_limit=params.sample_limit,
                 seed=params.seed,
+                progress_cb=lambda p, msg=None: set_state(progress=p, message=msg or STATE["message"]),
+                translator=translator
             )
         else:
             # Standard SFT processing mode
             set_state(message="SFT processing", progress=0.1)
+            # Add Vietnamese translation flag to augment options
+            augment_opts = params.augment.dict()
+            augment_opts["vietnamese_translation"] = params.vietnamese_translation
             count, stats = process_file_into_sft(
                 dataset_key=dataset_key,
                 input_path=local_path,
                 writer=writer,
                 paraphraser=paraphraser,
+                augment_opts=augment_opts,
                 sample_limit=params.sample_limit,
                 seed=params.seed,
+                progress_cb=lambda p, msg=None: set_state(progress=p, message=msg or STATE["message"]),
+                translator=translator
             )
         logger.info(f"[JOB] Processed dataset={dataset_key} rows={count} stats={stats}")
         writer.close()

requirements.txt CHANGED Viewed

@@ -11,3 +11,5 @@ google-auth-oauthlib
 orjson
 ftfy
 langid

 orjson
 ftfy
 langid
+transformers
+torch

trans_test.py ADDED Viewed

	@@ -0,0 +1,78 @@

+#!/usr/bin/env python3
+"""
+Test script for Vietnamese translation functionality
+"""
+import os
+import sys
+import logging
+from dotenv import load_dotenv
+# Add the current directory to Python path
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from vi.translator import VietnameseTranslator
+# Setup logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+def test_translation():
+    """Smoke-test the Vietnamese translator end to end.
+
+    Loads the translation model, then exercises single-text, batch, and
+    dictionary translation, logging every original/translated pair.
+
+    Returns:
+        bool: True if every step ran without raising, False otherwise.
+    """
+    load_dotenv()
+    # Initialize translator
+    translator = VietnameseTranslator()
+    try:
+        # Load the model
+        logger.info("Loading translation model...")
+        translator.load_model()
+        logger.info("✅ Model loaded successfully")
+        # Test single text translation
+        test_text = "Hello, how are you today? I hope you are feeling well."
+        logger.info(f"Original text: {test_text}")
+        translated = translator.translate_text(test_text)
+        logger.info(f"Translated text: {translated}")
+        # Test batch translation
+        test_texts = [
+            "What are the symptoms of diabetes?",
+            "How do I treat a headache?",
+            "What is the recommended dosage for this medication?",
+        ]
+        logger.info("Testing batch translation...")
+        batch_translated = translator.translate_batch(test_texts)
+        # Distinct loop names so the single-text result above is not shadowed;
+        # start=1 gives human-friendly numbering without manual arithmetic.
+        for idx, (source_text, target_text) in enumerate(zip(test_texts, batch_translated), start=1):
+            logger.info(f"Batch {idx}:")
+            logger.info(f"  Original: {source_text}")
+            logger.info(f"  Translated: {target_text}")
+        # Test dictionary translation
+        test_dict = {
+            "instruction": "Answer the medical question",
+            "input": "What are the side effects of aspirin?",
+            "output": "Common side effects include stomach irritation and bleeding.",
+        }
+        logger.info("Testing dictionary translation...")
+        dict_translated = translator.translate_dict(test_dict, ["instruction", "input", "output"])
+        logger.info("Dictionary translation result:")
+        for key, value in dict_translated.items():
+            logger.info(f"  {key}: {value}")
+        logger.info("🎉 All translation tests completed successfully!")
+        return True
+    except Exception as e:
+        # Top-level boundary of a standalone script: report failure through the
+        # return value, but keep the traceback in the log for diagnosis.
+        logger.exception("❌ Translation test failed: %s", e)
+        return False
+if __name__ == "__main__":
+    # Process exit status mirrors the test outcome: 0 on success, 1 on failure.
+    exit_code = 0 if test_translation() else 1
+    sys.exit(exit_code)

utils/processor.py CHANGED Viewed

@@ -7,6 +7,7 @@ from typing import Callable, Optional, Dict, Tuple
 from utils.schema import sft_row
 from utils import augment as A
 # Logger
 logger = logging.getLogger("processor")
@@ -40,7 +41,8 @@ def process_file_into_sft(
     augment_opts: Dict,
     sample_limit: Optional[int],
     seed: int,
-    progress_cb: Optional[Callable[[float, str], None]]
 ) -> Tuple[int, Dict]:
     random.seed(seed)
     stats = {
@@ -68,13 +70,13 @@ def process_file_into_sft(
     if key in ("healthcaremagic", "icliniq"):
         count = _proc_med_dialog(source=key, path=input_path, writer=writer,
                                  paraphraser=paraphraser, opts=augment_opts,
-                                 sample_limit=sample_limit, stats=stats, cb=progress_cb, dedupe_seen=dedupe_seen)
     elif key == "pubmedqa_l":
-        count = _proc_pubmedqa_l(input_path, writer, paraphraser, augment_opts, sample_limit, stats, progress_cb, dedupe_seen=dedupe_seen)
     elif key == "pubmedqa_u":
-        count = _proc_pubmedqa_u(input_path, writer, paraphraser, augment_opts, sample_limit, stats, progress_cb, dedupe_seen=dedupe_seen)
     elif key == "pubmedqa_map":
-        count = _proc_pubmedqa_map(input_path, writer, paraphraser, augment_opts, sample_limit, stats, progress_cb, dedupe_seen=dedupe_seen)
     else:
         raise ValueError(f"Unknown dataset: {dataset_key}")
     logger.info(f"[PROC] End dataset={dataset_key} stats={stats}")
@@ -135,7 +137,7 @@ def _apply_aug(instr: str, user: str, out: str, source: str, opts: Dict, paraphr
     return instr, user, out, applied
-def _commit_row(writer, source, rid, task, instr, user, out, opts, stats, aug_applied, extra_meta=None, dedupe_seen=None):
     # Dedup entry
     if dedupe_seen is not None:
         fp = A.fingerprint(instr, user, out)
@@ -149,13 +151,23 @@ def _commit_row(writer, source, rid, task, instr, user, out, opts, stats, aug_ap
         meta.update(extra_meta)
     row = sft_row(instr, user, out, source=source, rid=rid, task=task, meta=meta)
     writer.write(row)
     stats["written"] += 1
     return True
 # ——————————— dataset processors ———————————
-def _proc_med_dialog(source, path, writer, paraphraser, opts, sample_limit, stats, cb, dedupe_seen=None):
     count = 0
     written = 0
     for i, obj in enumerate(_iter_json_or_jsonl(path), start=1):
@@ -184,12 +196,12 @@ def _proc_med_dialog(source, path, writer, paraphraser, opts, sample_limit, stat
                 applied.append("consistency_flag")
             # 2) If expansion is enabled, add augmented copies
-            _commit_row(writer, source, rid, "medical_dialogue", instr, user, out, opts, stats, ["base"] + applied, dedupe_seen=dedupe_seen)
             # Add augmented copies if expand
             if opts.get("expand", True):
                 for (u_aug, o_aug, aug_tags) in _build_variants(user, out, paraphraser, opts, stats):
                     rid_aug = f"{rid}-aug{random.randint(1000,9999)}"
-                    _commit_row(writer, source, rid_aug, "medical_dialogue", instr, u_aug, o_aug, opts, stats, aug_tags, dedupe_seen=dedupe_seen)
             # Increment count only on success
             count += 1
@@ -205,7 +217,7 @@ def _proc_med_dialog(source, path, writer, paraphraser, opts, sample_limit, stat
     logger.info(f"[PROC] {source} done count={count} written={stats['written']} dedup_skipped={stats['dedup_skipped']}")
     return count
-def _proc_pubmedqa_l(path, writer, paraphraser, opts, sample_limit, stats, cb, dedupe_seen=None):
     with open(path, "r", encoding="utf-8") as f:
         data = json.load(f)
     count = 0
@@ -236,12 +248,12 @@ def _proc_pubmedqa_l(path, writer, paraphraser, opts, sample_limit, stats, cb, d
             instr, user, out, applied = _apply_aug(instr, user, out, "pubmedqa_l", opts, paraphraser, stats)
             _commit_row(writer, "pubmedqa_l", rid, "biomedical_qa", instr, user, out, opts, stats, applied,
-                        extra_meta={"year": v.get("YEAR"), "meshes": v.get("MESHES"), "labels": v.get("LABELS")}, dedupe_seen=dedupe_seen)
             if opts.get("expand", True):
                 for (u_aug, o_aug, aug_tags) in _build_variants(user, out, paraphraser, opts, stats):
                     rid_aug = f"{rid}-aug{random.randint(1000,9999)}"
                     _commit_row(writer, "pubmedqa_l", rid_aug, "biomedical_qa",
-                                instr, u_aug, o_aug, opts, stats, aug_tags, dedupe_seen=dedupe_seen)
             # Increment count only on success
             count += 1
@@ -257,7 +269,7 @@ def _proc_pubmedqa_l(path, writer, paraphraser, opts, sample_limit, stats, cb, d
     logger.info(f"[PROC] pubmedqa_l done count={count} written={stats['written']} dedup_skipped={stats['dedup_skipped']}")
     return count
-def _proc_pubmedqa_u(path, writer, paraphraser, opts, sample_limit, stats, cb, dedupe_seen=None):
     with open(path, "r", encoding="utf-8") as f:
         data = json.load(f)
     count = 0
@@ -290,12 +302,12 @@ def _proc_pubmedqa_u(path, writer, paraphraser, opts, sample_limit, stats, cb, d
                     out = guess.strip()
             instr, user, out, applied = _apply_aug(instr, user, out, "pubmedqa_u", opts, paraphraser, stats)
-            _commit_row(writer, "pubmedqa_u", str(k), "biomedical_qa_unlabeled", instr, user, out, opts, stats, applied, dedupe_seen=dedupe_seen)
             if opts.get("expand", True):
                 for (u_aug, o_aug, aug_tags) in _build_variants(user, out, paraphraser, opts, stats):
                     rid_aug = f"{rid}-aug{random.randint(1000,9999)}"
                     _commit_row(writer, "pubmedqa_u", rid_aug, "biomedical_qa",
-                                instr, u_aug, o_aug, opts, stats, aug_tags, dedupe_seen=dedupe_seen)
             # Increment count only on success
             count += 1
@@ -311,7 +323,7 @@ def _proc_pubmedqa_u(path, writer, paraphraser, opts, sample_limit, stats, cb, d
     logger.info(f"[PROC] pubmedqa_u done count={count} written={stats['written']} dedup_skipped={stats['dedup_skipped']}")
     return count
-def _proc_pubmedqa_map(path, writer, paraphraser, opts, sample_limit, stats, cb, dedupe_seen=None):
     with open(path, "r", encoding="utf-8") as f:
         obj = json.load(f)
@@ -383,14 +395,14 @@ def _proc_pubmedqa_map(path, writer, paraphraser, opts, sample_limit, stats, cb,
             # Process the item
             instr, user, out, applied = _apply_aug(instr, user, out, "pubmedqa_map", opts, paraphraser, stats)
-            _commit_row(writer, "pubmedqa_map", rid, "biomedical_qa", instr, user, out, opts, stats, applied, dedupe_seen=dedupe_seen)
             # Handle expansion if enabled
             if opts.get("expand", True):
                 for (u_aug, o_aug, aug_tags) in _build_variants(user, out, paraphraser, opts, stats):
                     rid_aug = f"{rid}-aug{random.randint(1000,9999)}"
                     _commit_row(writer, "pubmedqa_map", rid_aug, "biomedical_qa",
-                                instr, u_aug, o_aug, opts, stats, aug_tags, dedupe_seen=dedupe_seen)
             # Increment count only on success
             count += 1

 from utils.schema import sft_row
 from utils import augment as A
+from vi.processing import translate_sft_row, should_translate, log_translation_stats
 # Logger
 logger = logging.getLogger("processor")
     augment_opts: Dict,
     sample_limit: Optional[int],
     seed: int,
+    progress_cb: Optional[Callable[[float, str], None]],
+    translator=None
 ) -> Tuple[int, Dict]:
     random.seed(seed)
     stats = {
     if key in ("healthcaremagic", "icliniq"):
         count = _proc_med_dialog(source=key, path=input_path, writer=writer,
                                  paraphraser=paraphraser, opts=augment_opts,
+                                 sample_limit=sample_limit, stats=stats, cb=progress_cb, dedupe_seen=dedupe_seen, translator=translator)
     elif key == "pubmedqa_l":
+        count = _proc_pubmedqa_l(input_path, writer, paraphraser, augment_opts, sample_limit, stats, progress_cb, dedupe_seen=dedupe_seen, translator=translator)
     elif key == "pubmedqa_u":
+        count = _proc_pubmedqa_u(input_path, writer, paraphraser, augment_opts, sample_limit, stats, progress_cb, dedupe_seen=dedupe_seen, translator=translator)
     elif key == "pubmedqa_map":
+        count = _proc_pubmedqa_map(input_path, writer, paraphraser, augment_opts, sample_limit, stats, progress_cb, dedupe_seen=dedupe_seen, translator=translator)
     else:
         raise ValueError(f"Unknown dataset: {dataset_key}")
     logger.info(f"[PROC] End dataset={dataset_key} stats={stats}")
     return instr, user, out, applied
+def _commit_row(writer, source, rid, task, instr, user, out, opts, stats, aug_applied, extra_meta=None, dedupe_seen=None, translator=None):
     # Dedup entry
     if dedupe_seen is not None:
         fp = A.fingerprint(instr, user, out)
         meta.update(extra_meta)
     row = sft_row(instr, user, out, source=source, rid=rid, task=task, meta=meta)
+    # Apply Vietnamese translation if requested
+    if should_translate(opts.get("vietnamese_translation", False), translator):
+        try:
+            row = translate_sft_row(row, translator)
+            meta["vietnamese_translated"] = True
+            row["meta"] = meta
+        except Exception as e:
+            logger.error(f"Failed to translate SFT row: {e}")
     writer.write(row)
     stats["written"] += 1
     return True
 # ——————————— dataset processors ———————————
+def _proc_med_dialog(source, path, writer, paraphraser, opts, sample_limit, stats, cb, dedupe_seen=None, translator=None):
     count = 0
     written = 0
     for i, obj in enumerate(_iter_json_or_jsonl(path), start=1):
                 applied.append("consistency_flag")
             # 2) If expansion is enabled, add augmented copies
+            _commit_row(writer, source, rid, "medical_dialogue", instr, user, out, opts, stats, ["base"] + applied, dedupe_seen=dedupe_seen, translator=translator)
             # Add augmented copies if expand
             if opts.get("expand", True):
                 for (u_aug, o_aug, aug_tags) in _build_variants(user, out, paraphraser, opts, stats):
                     rid_aug = f"{rid}-aug{random.randint(1000,9999)}"
+                    _commit_row(writer, source, rid_aug, "medical_dialogue", instr, u_aug, o_aug, opts, stats, aug_tags, dedupe_seen=dedupe_seen, translator=translator)
             # Increment count only on success
             count += 1
     logger.info(f"[PROC] {source} done count={count} written={stats['written']} dedup_skipped={stats['dedup_skipped']}")
     return count
+def _proc_pubmedqa_l(path, writer, paraphraser, opts, sample_limit, stats, cb, dedupe_seen=None, translator=None):
     with open(path, "r", encoding="utf-8") as f:
         data = json.load(f)
     count = 0
             instr, user, out, applied = _apply_aug(instr, user, out, "pubmedqa_l", opts, paraphraser, stats)
             _commit_row(writer, "pubmedqa_l", rid, "biomedical_qa", instr, user, out, opts, stats, applied,
+                        extra_meta={"year": v.get("YEAR"), "meshes": v.get("MESHES"), "labels": v.get("LABELS")}, dedupe_seen=dedupe_seen, translator=translator)
             if opts.get("expand", True):
                 for (u_aug, o_aug, aug_tags) in _build_variants(user, out, paraphraser, opts, stats):
                     rid_aug = f"{rid}-aug{random.randint(1000,9999)}"
                     _commit_row(writer, "pubmedqa_l", rid_aug, "biomedical_qa",
+                                instr, u_aug, o_aug, opts, stats, aug_tags, dedupe_seen=dedupe_seen, translator=translator)
             # Increment count only on success
             count += 1
     logger.info(f"[PROC] pubmedqa_l done count={count} written={stats['written']} dedup_skipped={stats['dedup_skipped']}")
     return count
+def _proc_pubmedqa_u(path, writer, paraphraser, opts, sample_limit, stats, cb, dedupe_seen=None, translator=None):
     with open(path, "r", encoding="utf-8") as f:
         data = json.load(f)
     count = 0
                     out = guess.strip()
             instr, user, out, applied = _apply_aug(instr, user, out, "pubmedqa_u", opts, paraphraser, stats)
+            _commit_row(writer, "pubmedqa_u", str(k), "biomedical_qa_unlabeled", instr, user, out, opts, stats, applied, dedupe_seen=dedupe_seen, translator=translator)
             if opts.get("expand", True):
                 for (u_aug, o_aug, aug_tags) in _build_variants(user, out, paraphraser, opts, stats):
                     rid_aug = f"{rid}-aug{random.randint(1000,9999)}"
                     _commit_row(writer, "pubmedqa_u", rid_aug, "biomedical_qa",
+                                instr, u_aug, o_aug, opts, stats, aug_tags, dedupe_seen=dedupe_seen, translator=translator)
             # Increment count only on success
             count += 1
     logger.info(f"[PROC] pubmedqa_u done count={count} written={stats['written']} dedup_skipped={stats['dedup_skipped']}")
     return count
+def _proc_pubmedqa_map(path, writer, paraphraser, opts, sample_limit, stats, cb, dedupe_seen=None, translator=None):
     with open(path, "r", encoding="utf-8") as f:
         obj = json.load(f)
             # Process the item
             instr, user, out, applied = _apply_aug(instr, user, out, "pubmedqa_map", opts, paraphraser, stats)
+            _commit_row(writer, "pubmedqa_map", rid, "biomedical_qa", instr, user, out, opts, stats, applied, dedupe_seen=dedupe_seen, translator=translator)
             # Handle expansion if enabled
             if opts.get("expand", True):
                 for (u_aug, o_aug, aug_tags) in _build_variants(user, out, paraphraser, opts, stats):
                     rid_aug = f"{rid}-aug{random.randint(1000,9999)}"
                     _commit_row(writer, "pubmedqa_map", rid_aug, "biomedical_qa",
+                                instr, u_aug, o_aug, opts, stats, aug_tags, dedupe_seen=dedupe_seen, translator=translator)
             # Increment count only on success
             count += 1

utils/rag.py CHANGED Viewed

@@ -7,6 +7,7 @@ from typing import Dict, List, Tuple, Optional, Callable
 from utils.schema import sft_row
 from utils.llm import NvidiaClient, KeyRotator
 # Logger
 logger = logging.getLogger("rag_processor")
@@ -165,7 +166,7 @@ class RAGProcessor:
         return ""
     def process_medical_dialog(self, source: str, path: str, writer, sample_limit: Optional[int],
-                             stats: Dict, progress_cb: Optional[Callable], dedupe_seen: set = None) -> int:
         """Process medical dialogue datasets into RAG format"""
         count = 0
         written = 0
@@ -199,7 +200,7 @@ class RAGProcessor:
                 # Commit the RAG-formatted row
                 if self._commit_rag_row(writer, source, rid, "rag_medical_qa",
                                       rag_instruction, rag_user, answer,
-                                      stats, dedupe_seen=dedupe_seen):
                     written += 1
                 count += 1
@@ -220,7 +221,7 @@ class RAGProcessor:
         return count
     def process_pubmedqa(self, source: str, path: str, writer, sample_limit: Optional[int],
-                        stats: Dict, progress_cb: Optional[Callable], dedupe_seen: set = None) -> int:
         """Process PubMedQA datasets into RAG format"""
         with open(path, "r", encoding="utf-8") as f:
             data = json.load(f)
@@ -265,7 +266,7 @@ class RAGProcessor:
                 # Commit the RAG-formatted row
                 if self._commit_rag_row(writer, source, rid, "rag_biomedical_qa",
                                       rag_instruction, rag_user, answer,
-                                      stats, dedupe_seen=dedupe_seen):
                     written += 1
                 count += 1
@@ -287,7 +288,7 @@ class RAGProcessor:
     def _commit_rag_row(self, writer, source: str, rid: str, task: str,
                        instruction: str, user_input: str, output: str,
-                       stats: Dict, dedupe_seen: set = None) -> bool:
         """Commit a RAG-formatted row to the writer"""
         # Simple deduplication based on content hash
         if dedupe_seen is not None:
@@ -299,6 +300,16 @@ class RAGProcessor:
         meta = {"rag_processing": True, "format": "qca"}
         row = sft_row(instruction, user_input, output, source=source, rid=rid, task=task, meta=meta)
         writer.write(row)
         stats["written"] = stats.get("written", 0) + 1
         return True
@@ -310,7 +321,8 @@ def process_file_into_rag(
     nvidia_model: str,
     sample_limit: Optional[int],
     seed: int,
-    progress_cb: Optional[Callable[[float, str], None]]
 ) -> Tuple[int, Dict]:
     """Main entry point for RAG processing"""
     random.seed(seed)
@@ -326,17 +338,20 @@ def process_file_into_rag(
     dedupe_seen = set()
     key = dataset_key.lower()
     if key in ("healthcaremagic", "icliniq"):
         count = rag_processor.process_medical_dialog(
             source=key, path=input_path, writer=writer,
             sample_limit=sample_limit, stats=stats,
-            progress_cb=progress_cb, dedupe_seen=dedupe_seen
         )
     elif key in ("pubmedqa_l", "pubmedqa_u", "pubmedqa_map"):
         count = rag_processor.process_pubmedqa(
             source=key, path=input_path, writer=writer,
             sample_limit=sample_limit, stats=stats,
-            progress_cb=progress_cb, dedupe_seen=dedupe_seen
         )
     else:
         raise ValueError(f"Unknown dataset for RAG processing: {dataset_key}")

 from utils.schema import sft_row
 from utils.llm import NvidiaClient, KeyRotator
+from vi.processing import translate_rag_row, should_translate, log_translation_stats
 # Logger
 logger = logging.getLogger("rag_processor")
         return ""
     def process_medical_dialog(self, source: str, path: str, writer, sample_limit: Optional[int],
+                             stats: Dict, progress_cb: Optional[Callable], dedupe_seen: set = None, translator=None, opts=None) -> int:
         """Process medical dialogue datasets into RAG format"""
         count = 0
         written = 0
                 # Commit the RAG-formatted row
                 if self._commit_rag_row(writer, source, rid, "rag_medical_qa",
                                       rag_instruction, rag_user, answer,
+                                      stats, dedupe_seen=dedupe_seen, translator=translator, opts=opts):
                     written += 1
                 count += 1
         return count
     def process_pubmedqa(self, source: str, path: str, writer, sample_limit: Optional[int],
+                        stats: Dict, progress_cb: Optional[Callable], dedupe_seen: set = None, translator=None, opts=None) -> int:
         """Process PubMedQA datasets into RAG format"""
         with open(path, "r", encoding="utf-8") as f:
             data = json.load(f)
                 # Commit the RAG-formatted row
                 if self._commit_rag_row(writer, source, rid, "rag_biomedical_qa",
                                       rag_instruction, rag_user, answer,
+                                      stats, dedupe_seen=dedupe_seen, translator=translator, opts=opts):
                     written += 1
                 count += 1
     def _commit_rag_row(self, writer, source: str, rid: str, task: str,
                        instruction: str, user_input: str, output: str,
+                       stats: Dict, dedupe_seen: set = None, translator=None, opts=None) -> bool:
         """Commit a RAG-formatted row to the writer"""
         # Simple deduplication based on content hash
         if dedupe_seen is not None:
         meta = {"rag_processing": True, "format": "qca"}
         row = sft_row(instruction, user_input, output, source=source, rid=rid, task=task, meta=meta)
+        # Apply Vietnamese translation if requested
+        if should_translate(opts.get("vietnamese_translation", False) if opts else False, translator):
+            try:
+                row = translate_rag_row(row, translator)
+                meta["vietnamese_translated"] = True
+                row["meta"] = meta
+            except Exception as e:
+                logger.error(f"Failed to translate RAG row: {e}")
         writer.write(row)
         stats["written"] = stats.get("written", 0) + 1
         return True
     nvidia_model: str,
     sample_limit: Optional[int],
     seed: int,
+    progress_cb: Optional[Callable[[float, str], None]],
+    translator=None
 ) -> Tuple[int, Dict]:
     """Main entry point for RAG processing"""
     random.seed(seed)
     dedupe_seen = set()
     key = dataset_key.lower()
+    # Create opts with Vietnamese translation flag
+    opts = {"vietnamese_translation": translator is not None}
     if key in ("healthcaremagic", "icliniq"):
         count = rag_processor.process_medical_dialog(
             source=key, path=input_path, writer=writer,
             sample_limit=sample_limit, stats=stats,
+            progress_cb=progress_cb, dedupe_seen=dedupe_seen, translator=translator, opts=opts
         )
     elif key in ("pubmedqa_l", "pubmedqa_u", "pubmedqa_map"):
         count = rag_processor.process_pubmedqa(
             source=key, path=input_path, writer=writer,
             sample_limit=sample_limit, stats=stats,
+            progress_cb=progress_cb, dedupe_seen=dedupe_seen, translator=translator, opts=opts
         )
     else:
         raise ValueError(f"Unknown dataset for RAG processing: {dataset_key}")

vi/README.md ADDED Viewed

	@@ -0,0 +1,95 @@

+# Vietnamese Translation Module
+This module provides Vietnamese translation functionality for the MedAI Processing application using the Helsinki-NLP/opus-mt-en-vi model.
+## Features
+- **English to Vietnamese Translation**: Translates English text to Vietnamese using the Helsinki-NLP/opus-mt-en-vi model
+- **Batch Processing**: Efficiently translates multiple texts at once
+- **Dictionary Translation**: Translates specific fields in data dictionaries
+- **Integration**: Seamlessly integrates with both SFT and RAG processing workflows
+- **Error Handling**: Graceful fallback to original text if translation fails
+- **Logging**: Comprehensive logging for debugging and monitoring
+## Configuration
+Add the following environment variable to your `.env` file:
+```bash
+EN_VI=Helsinki-NLP/opus-mt-en-vi
+```
+## Usage
+### Basic Translation
+```python
+from vi.translator import VietnameseTranslator
+# Initialize translator
+translator = VietnameseTranslator()
+# Load the model
+translator.load_model()
+# Translate single text
+translated = translator.translate_text("Hello, how are you?")
+# Translate batch of texts
+texts = ["Text 1", "Text 2", "Text 3"]
+translated_batch = translator.translate_batch(texts)
+```
+### Dictionary Translation
+```python
+# Translate specific fields in a dictionary
+data = {
+    "instruction": "Answer the question",
+    "input": "What is diabetes?",
+    "output": "Diabetes is a metabolic disorder..."
+}
+translated_data = translator.translate_dict(data, ["instruction", "input", "output"])
+```
+## Integration
+The translation functionality is automatically integrated into the processing workflows:
+1. **UI Toggle**: Users can enable Vietnamese translation via the checkbox in the web interface
+2. **SFT Processing**: All text fields in SFT format are translated when enabled
+3. **RAG Processing**: All text fields in RAG format are translated when enabled
+4. **Metadata**: Translated rows are marked with `vietnamese_translated: true` in metadata
+## Model Information
+- **Model**: Helsinki-NLP/opus-mt-en-vi
+- **Source Language**: English
+- **Target Language**: Vietnamese
+- **BLEU Score**: 37.2
+- **chrF Score**: 0.542
+- **License**: Apache 2.0
+## Testing
+Run the test script to verify translation functionality:
+```bash
+python trans_test.py
+```
+## Files
+- `translator.py`: Main translation class
+- `download.py`: Model download script for Docker
+- `processing.py`: Utility functions for processing integration
+- `__init__.py`: Module initialization
+- `README.md`: This documentation
+## Notes
+- The model is automatically downloaded during Docker build
+- The device is selected automatically (CUDA if available, then Apple MPS, otherwise CPU) unless one is passed explicitly to `VietnameseTranslator`
+- The model requires the target language token `>>vie<<` for proper translation
+- All translation operations include comprehensive error handling and logging

vi/__init__.py ADDED Viewed

	@@ -0,0 +1,10 @@

+"""
+Vietnamese Translation Module
+This module provides utilities for translating English text to Vietnamese
+using the Helsinki-NLP/opus-mt-en-vi model from Hugging Face.
+"""
+from .translator import VietnameseTranslator
+__all__ = ['VietnameseTranslator']

vi/download.py ADDED Viewed

	@@ -0,0 +1,89 @@

+"""
+Model Download Script for Vietnamese Translation
+This script downloads the Helsinki-NLP/opus-mt-en-vi model
+and saves it to the Hugging Face cache directory.
+"""
+import os
+import sys
+import logging
+from pathlib import Path
+from transformers import MarianMTModel, MarianTokenizer
+# Setup logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+def download_model(model_name: str = "Helsinki-NLP/opus-mt-en-vi", cache_dir: str = None):
+    """
+    Download the translation model and tokenizer.
+    Args:
+        model_name: Hugging Face model name
+        cache_dir: Cache directory for the model. If None, uses HF_HOME env var
+    """
+    if cache_dir is None:
+        cache_dir = os.getenv("HF_HOME", os.path.expanduser("~/.cache/huggingface"))
+    logger.info(f"Downloading model: {model_name}")
+    logger.info(f"Cache directory: {cache_dir}")
+    try:
+        # Ensure cache directory exists
+        os.makedirs(cache_dir, exist_ok=True)
+        # Download tokenizer
+        logger.info("Downloading tokenizer...")
+        tokenizer = MarianTokenizer.from_pretrained(
+            model_name,
+            cache_dir=cache_dir
+        )
+        logger.info("✅ Tokenizer downloaded successfully")
+        # Download model
+        logger.info("Downloading model...")
+        model = MarianMTModel.from_pretrained(
+            model_name,
+            cache_dir=cache_dir
+        )
+        logger.info("✅ Model downloaded successfully")
+        # Test the model
+        logger.info("Testing model...")
+        test_text = "Hello, how are you?"
+        inputs = tokenizer(f">>vie<< {test_text}", return_tensors="pt")
+        with model.eval():
+            outputs = model.generate(**inputs, max_length=50, num_beams=4)
+        translated = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        logger.info(f"Test translation: '{test_text}' -> '{translated}'")
+        logger.info("🎉 Model download and test completed successfully!")
+        return True
+    except Exception as e:
+        logger.error(f"❌ Failed to download model: {e}")
+        return False
+def main():
+    """Main function to download the model."""
+    # Get model name from environment variable or use default
+    model_name = os.getenv("EN_VI", "Helsinki-NLP/opus-mt-en-vi")
+    logger.info("Starting model download process...")
+    logger.info(f"Model: {model_name}")
+    success = download_model(model_name)
+    if success:
+        logger.info("Model download completed successfully!")
+        sys.exit(0)
+    else:
+        logger.error("Model download failed!")
+        sys.exit(1)
+# Run the download only when this file is executed as a script, not when it is imported
+if __name__ == "__main__":
+    main()

vi/processing.py ADDED Viewed

	@@ -0,0 +1,95 @@

+"""
+Processing utilities for Vietnamese translation integration
+"""
+import logging
+from typing import Dict, Any, List, Optional, Callable
+logger = logging.getLogger(__name__)
+def translate_sft_row(row: Dict[str, Any], translator, text_fields: List[str] = None) -> Dict[str, Any]:
+    """
+    Translate specific text fields in an SFT row from English to Vietnamese.
+    Args:
+        row: SFT row dictionary
+        translator: VietnameseTranslator instance
+        text_fields: List of field names to translate. If None, uses default fields.
+    Returns:
+        Translated SFT row dictionary
+    """
+    if not translator or not translator.is_loaded():
+        logger.warning("Translator not available, skipping translation")
+        return row
+    if text_fields is None:
+        # Default fields to translate in SFT format
+        text_fields = ["instruction", "input", "output"]
+    try:
+        translated_row = translator.translate_dict(row, text_fields)
+        logger.debug(f"Translated SFT row with fields: {text_fields}")
+        return translated_row
+    except Exception as e:
+        logger.error(f"Failed to translate SFT row: {e}")
+        return row
+def translate_rag_row(row: Dict[str, Any], translator, text_fields: List[str] = None) -> Dict[str, Any]:
+    """
+    Translate specific text fields in a RAG row from English to Vietnamese.
+    Args:
+        row: RAG row dictionary
+        translator: VietnameseTranslator instance
+        text_fields: List of field names to translate. If None, uses default fields.
+    Returns:
+        Translated RAG row dictionary
+    """
+    if not translator or not translator.is_loaded():
+        logger.warning("Translator not available, skipping translation")
+        return row
+    if text_fields is None:
+        # Default fields to translate in RAG format
+        text_fields = ["instruction", "input", "output"]
+    try:
+        translated_row = translator.translate_dict(row, text_fields)
+        logger.debug(f"Translated RAG row with fields: {text_fields}")
+        return translated_row
+    except Exception as e:
+        logger.error(f"Failed to translate RAG row: {e}")
+        return row
+def should_translate(vietnamese_translation: bool, translator) -> bool:
+    """
+    Check if translation should be performed.
+    Args:
+        vietnamese_translation: Flag from user input
+        translator: VietnameseTranslator instance
+    Returns:
+        True if translation should be performed
+    """
+    if not vietnamese_translation:
+        return False
+    if not translator or not translator.is_loaded():
+        logger.warning("Vietnamese translation requested but translator not available")
+        return False
+    return True
+def log_translation_stats(stats: Dict[str, Any], translated_count: int) -> None:
+    """
+    Record and log translation statistics.
+    Args:
+        stats: Statistics dictionary to update in place; its
+            "vietnamese_translated" entry is overwritten (not incremented)
+        translated_count: Number of items translated
+    """
+    stats["vietnamese_translated"] = translated_count
+    logger.info(f"Vietnamese translation completed: {translated_count} items translated")

vi/translator.py ADDED Viewed

	@@ -0,0 +1,266 @@

+"""
+Vietnamese Translator using Helsinki-NLP/opus-mt-en-vi model
+"""
+import os
+import logging
+from typing import List, Dict, Any, Optional, Union
+from transformers import MarianMTModel, MarianTokenizer
+import torch
+logger = logging.getLogger(__name__)
+class VietnameseTranslator:
+    """
+    Vietnamese translator using Helsinki-NLP/opus-mt-en-vi model.
+    This class handles translation from English to Vietnamese using the
+    MarianMT model from Hugging Face Transformers.
+    """
+    def __init__(self, model_name: Optional[str] = None, device: Optional[str] = None):
+        """
+        Initialize the Vietnamese translator.
+        Args:
+            model_name: Hugging Face model name. Defaults to EN_VI env var or Helsinki-NLP/opus-mt-en-vi
+            device: Device to run the model on ('cpu', 'cuda', 'auto'). Defaults to 'auto'
+        """
+        self.model_name = model_name or os.getenv("EN_VI", "Helsinki-NLP/opus-mt-en-vi")
+        self.device = self._get_device(device)
+        self.model = None
+        self.tokenizer = None
+        self._is_loaded = False
+        logger.info(f"VietnameseTranslator initialized with model: {self.model_name}")
+        logger.info(f"Using device: {self.device}")
+    def _get_device(self, device: Optional[str]) -> str:
+        """Determine the best device to use for the model."""
+        if device:
+            return device
+        if torch.cuda.is_available():
+            return "cuda"
+        elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+            return "mps"
+        else:
+            return "cpu"
+    def load_model(self) -> None:
+        """Load the translation model and tokenizer."""
+        if self._is_loaded:
+            logger.debug("Model already loaded, skipping...")
+            return
+        try:
+            logger.info(f"Loading translation model: {self.model_name}")
+            logger.info(f"Loading on device: {self.device}")
+            # Load tokenizer
+            self.tokenizer = MarianTokenizer.from_pretrained(
+                self.model_name,
+                cache_dir=os.getenv("HF_HOME", os.path.expanduser("~/.cache/huggingface"))
+            )
+            # Load model
+            self.model = MarianMTModel.from_pretrained(
+                self.model_name,
+                cache_dir=os.getenv("HF_HOME", os.path.expanduser("~/.cache/huggingface"))
+            )
+            # Move model to device
+            self.model = self.model.to(self.device)
+            self.model.eval()
+            self._is_loaded = True
+            logger.info("✅ Translation model loaded successfully")
+        except Exception as e:
+            logger.error(f"❌ Failed to load translation model: {e}")
+            raise RuntimeError(f"Failed to load translation model: {e}")
+    def translate_text(self, text: str) -> str:
+        """
+        Translate a single text from English to Vietnamese.
+        Args:
+            text: English text to translate
+        Returns:
+            Translated Vietnamese text
+        """
+        if not self._is_loaded:
+            self.load_model()
+        if not text or not text.strip():
+            return text
+        try:
+            # Prepare input with target language token
+            # The model requires a target language token in the format >>id<<
+            input_text = f">>vie<< {text.strip()}"
+            # Tokenize
+            inputs = self.tokenizer(
+                input_text,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=512
+            ).to(self.device)
+            # Translate
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    max_length=512,
+                    num_beams=4,
+                    early_stopping=True,
+                    do_sample=False
+                )
+            # Decode
+            translated = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            logger.debug(f"Translated: '{text[:50]}...' -> '{translated[:50]}...'")
+            return translated.strip()
+        except Exception as e:
+            logger.error(f"Translation failed for text: '{text[:100]}...' - Error: {e}")
+            # Return original text if translation fails
+            return text
+    def translate_batch(self, texts: List[str], batch_size: int = 8) -> List[str]:
+        """
+        Translate a batch of texts from English to Vietnamese.
+        Args:
+            texts: List of English texts to translate
+            batch_size: Number of texts to process in each batch
+        Returns:
+            List of translated Vietnamese texts
+        """
+        if not self._is_loaded:
+            self.load_model()
+        if not texts:
+            return []
+        results = []
+        try:
+            for i in range(0, len(texts), batch_size):
+                batch = texts[i:i + batch_size]
+                logger.debug(f"Processing batch {i//batch_size + 1}/{(len(texts) + batch_size - 1)//batch_size}")
+                # Prepare batch with target language tokens
+                batch_inputs = [f">>vie<< {text.strip()}" for text in batch]
+                # Tokenize batch
+                inputs = self.tokenizer(
+                    batch_inputs,
+                    return_tensors="pt",
+                    padding=True,
+                    truncation=True,
+                    max_length=512
+                ).to(self.device)
+                # Translate batch
+                with torch.no_grad():
+                    outputs = self.model.generate(
+                        **inputs,
+                        max_length=512,
+                        num_beams=4,
+                        early_stopping=True,
+                        do_sample=False
+                    )
+                # Decode batch
+                batch_translations = [
+                    self.tokenizer.decode(output, skip_special_tokens=True).strip()
+                    for output in outputs
+                ]
+                results.extend(batch_translations)
+        except Exception as e:
+            logger.error(f"Batch translation failed: {e}")
+            # Return original texts if translation fails
+            results = texts
+        logger.info(f"Translated {len(texts)} texts successfully")
+        return results
+    def translate_dict(self, data: Dict[str, Any], text_fields: List[str]) -> Dict[str, Any]:
+        """
+        Translate specific text fields in a dictionary from English to Vietnamese.
+        Args:
+            data: Dictionary containing the data
+            text_fields: List of field names to translate
+        Returns:
+            Dictionary with translated text fields
+        """
+        if not self._is_loaded:
+            self.load_model()
+        result = data.copy()
+        for field in text_fields:
+            if field in data and isinstance(data[field], str) and data[field].strip():
+                try:
+                    result[field] = self.translate_text(data[field])
+                    logger.debug(f"Translated field '{field}': '{data[field][:50]}...' -> '{result[field][:50]}...'")
+                except Exception as e:
+                    logger.error(f"Failed to translate field '{field}': {e}")
+                    # Keep original text if translation fails
+                    result[field] = data[field]
+        return result
+    def translate_list_of_dicts(self, data_list: List[Dict[str, Any]], text_fields: List[str]) -> List[Dict[str, Any]]:
+        """
+        Translate specific text fields in a list of dictionaries.
+        Args:
+            data_list: List of dictionaries containing the data
+            text_fields: List of field names to translate in each dictionary
+        Returns:
+            List of dictionaries with translated text fields
+        """
+        if not data_list:
+            return []
+        logger.info(f"Translating {len(data_list)} items with fields: {text_fields}")
+        results = []
+        for i, data in enumerate(data_list):
+            try:
+                translated_data = self.translate_dict(data, text_fields)
+                results.append(translated_data)
+                if (i + 1) % 100 == 0:
+                    logger.info(f"Translated {i + 1}/{len(data_list)} items")
+            except Exception as e:
+                logger.error(f"Failed to translate item {i}: {e}")
+                results.append(data)  # Keep original data if translation fails
+        logger.info(f"Completed translation of {len(data_list)} items")
+        return results
+    def is_loaded(self) -> bool:
+        """Check if the model is loaded."""
+        return self._is_loaded
+    def get_model_info(self) -> Dict[str, str]:
+        """Get information about the loaded model."""
+        return {
+            "model_name": self.model_name,
+            "device": self.device,
+            "is_loaded": self._is_loaded
+        }