Spaces:

MedSwin
/

MedAI_Processing

Sleeping

App Files Files Community

LiamKhoaLe committed on Oct 4

Commit

4056c2c

1 Parent(s): 99c49c6

Upd caching + trans SFT saver

Browse files

Files changed (5) hide show

scritps/cache_test.py +118 -0
trans_test.py → scritps/trans_test.py +0 -0
utils/datasets.py +11 -1
vi/processing.py +50 -20
vi/translator.py +6 -2

scritps/cache_test.py ADDED Viewed

	@@ -0,0 +1,118 @@

+#!/usr/bin/env python3
+"""
+Test script to verify the fixes for HF permissions and Vietnamese translation
+"""
+import os
+import sys
+import logging
+from pathlib import Path
+# Add the project root to Python path
+project_root = Path(__file__).parent
+sys.path.insert(0, str(project_root))
+from vi.translator import VietnameseTranslator
+from vi.processing import translate_sft_row, _validate_vi_translation
+from utils.schema import sft_row
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+def test_vietnamese_translation():
+    """Test Vietnamese translation functionality"""
+    logger.info("Testing Vietnamese translation...")
+    # Create a sample SFT row
+    sample_row = sft_row(
+        instruction="Answer the patient's question like a clinician. Be concise and safe.",
+        user_input="What are the symptoms of diabetes?",
+        output="Common symptoms of diabetes include increased thirst, frequent urination, unexplained weight loss, fatigue, and blurred vision. If you experience these symptoms, please consult a healthcare provider.",
+        source="test",
+        rid="test_001",
+        task="medical_dialogue"
+    )
+    logger.info(f"Original SFT row: {sample_row}")
+    # Test translation validation
+    test_cases = [
+        ("Hello world", "Xin chào thế giới", True),  # Valid Vietnamese
+        ("Hello world", "Hello world", False),  # Same as original (not translated)
+        ("Hello world", "translation error", False),  # Contains error keyword
+        ("Hello world", "Hi", False),  # Too short
+        ("Hello world", "", False),  # Empty
+    ]
+    logger.info("Testing translation validation...")
+    for original, translated, expected in test_cases:
+        result = _validate_vi_translation(original, translated)
+        status = "✅" if result == expected else "❌"
+        logger.info(f"{status} {original} -> {translated}: {result} (expected {expected})")
+    # Test with translator (if available)
+    try:
+        translator = VietnameseTranslator()
+        logger.info("Vietnamese translator initialized successfully")
+        # Try to load the model
+        try:
+            translator.load_model()
+            logger.info("✅ Translation model loaded successfully")
+        except Exception as e:
+            logger.warning(f"Could not load translation model: {e}")
+            logger.info("This is expected if the model is not downloaded yet")
+            return
+        # Test translation
+        translated_row = translate_sft_row(sample_row, translator)
+        logger.info(f"Translated SFT row: {translated_row}")
+        # Check if translation was applied
+        original_sft = sample_row["sft"]
+        translated_sft = translated_row["sft"]
+        for field in ["instruction", "input", "output"]:
+            original_text = original_sft[field]
+            translated_text = translated_sft[field]
+            if original_text != translated_text:
+                logger.info(f"✅ Field '{field}' was translated")
+                logger.info(f"  Original: {original_text[:100]}...")
+                logger.info(f"  Translated: {translated_text[:100]}...")
+            else:
+                logger.info(f"⚠️ Field '{field}' was not translated (may be due to validation failure)")
+    except Exception as e:
+        logger.warning(f"Could not test with actual translator: {e}")
+        logger.info("This is expected if the model is not downloaded yet")
+def test_hf_cache_setup():
+    """Test Hugging Face cache directory setup"""
+    logger.info("Testing HF cache setup...")
+    # Test cache directory creation
+    cache_dir = os.path.abspath("cache/huggingface")
+    os.makedirs(cache_dir, exist_ok=True)
+    if os.path.exists(cache_dir) and os.access(cache_dir, os.R_OK | os.W_OK):
+        logger.info(f"✅ Cache directory {cache_dir} is accessible")
+    else:
+        logger.error(f"❌ Cache directory {cache_dir} is not accessible")
+    # Test HF_HOME environment variable
+    os.environ["HF_HOME"] = cache_dir
+    hf_home = os.getenv("HF_HOME")
+    if hf_home == cache_dir:
+        logger.info(f"✅ HF_HOME environment variable set to {hf_home}")
+    else:
+        logger.error(f"❌ HF_HOME environment variable not set correctly")
+if __name__ == "__main__":
+    logger.info("Starting fix verification tests...")
+    test_hf_cache_setup()
+    test_vietnamese_translation()
+    logger.info("Tests completed!")

trans_test.py → scritps/trans_test.py RENAMED Viewed

File without changes

utils/datasets.py CHANGED Viewed

@@ -49,12 +49,22 @@ def hf_download_dataset(repo_id: str, filename: str, repo_type: str = "dataset")
     logger.info(
         f"[HF] Download {repo_id}/{filename} (type={repo_type}) token={'yes' if token else 'no'}"
     )
     path = hf_hub_download(
         repo_id=repo_id,
         filename=filename,
         repo_type=repo_type,
         token=token,
-        local_dir=os.path.abspath("cache/hf"),
         local_dir_use_symlinks=False
     )
     try:

     logger.info(
         f"[HF] Download {repo_id}/{filename} (type={repo_type}) token={'yes' if token else 'no'}"
     )
+    # Set cache directory with proper permissions
+    cache_dir = os.path.abspath("cache/hf")
+    os.makedirs(cache_dir, exist_ok=True)
+    # Set HF_HOME to avoid permission issues
+    hf_home = os.path.abspath("cache/huggingface")
+    os.makedirs(hf_home, exist_ok=True)
+    os.environ["HF_HOME"] = hf_home
     path = hf_hub_download(
         repo_id=repo_id,
         filename=filename,
         repo_type=repo_type,
         token=token,
+        local_dir=cache_dir,
         local_dir_use_symlinks=False
     )
     try:

vi/processing.py CHANGED Viewed

@@ -36,28 +36,39 @@ def _validate_vi_translation(original: str, translated: str) -> bool:
     if not translated or not isinstance(translated, str):
         return False
-    # Check if translation is too short or too different in length
     if len(translated.strip()) < 3:
         return False
-    # Check if translation contains too much English (should be mostly Vietnamese)
-    import re
-    english_chars = len(re.findall(r'[a-zA-Z]', translated))
-    total_chars = len(re.sub(r'\s', '', translated))
-    if total_chars > 0 and english_chars / total_chars > 0.7:
         return False
     # Check for common translation failure patterns
     failure_patterns = [
-        "translation", "error", "failed", "unable", "cannot",
-        "not available", "not found", "invalid", "error"
     ]
     translated_lower = translated.lower()
     for pattern in failure_patterns:
         if pattern in translated_lower:
             return False
-    return True
 def translate_sft_row(row: Dict[str, Any], translator, text_fields: List[str] = None) -> Dict[str, Any]:
     """
@@ -80,17 +91,36 @@ def translate_sft_row(row: Dict[str, Any], translator, text_fields: List[str] =
         text_fields = ["instruction", "input", "output"]
     try:
-        translated_row = translator.translate_dict(row, text_fields)
-        # Validate and sanitize translated fields
-        for f in text_fields:
-            if f in translated_row.get("sft", {}):
-                original = row.get("sft", {}).get(f, "")
-                translated = translated_row["sft"][f]
-                if _validate_vi_translation(original, translated):
-                    translated_row["sft"][f] = _vi_sanitize_text(translated)
-                else:
-                    logger.warning(f"Invalid Vietnamese translation for field {f}, keeping original")
-                    translated_row["sft"][f] = original
         logger.debug(f"Translated SFT row with fields: {text_fields}")
         return translated_row
     except Exception as e:

     if not translated or not isinstance(translated, str):
         return False
+    # Check if translation is too short
     if len(translated.strip()) < 3:
         return False
+    # If translation is identical to original, it's not a valid translation
+    if translated.strip() == original.strip():
         return False
     # Check for common translation failure patterns
     failure_patterns = [
+        "translation error", "translation failed", "unable to translate",
+        "cannot translate", "not available", "not found", "invalid translation"
     ]
     translated_lower = translated.lower()
     for pattern in failure_patterns:
         if pattern in translated_lower:
             return False
+    # Check if translation contains Vietnamese characters (basic check)
+    import re
+    vietnamese_chars = len(re.findall(r'[àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ]', translated, re.IGNORECASE))
+    total_chars = len(re.sub(r'\s', '', translated))
+    # If there are Vietnamese characters, it's likely a valid translation
+    if vietnamese_chars > 0:
+        return True
+    # If no Vietnamese characters but significantly different from original, accept it
+    # (some translations might not have Vietnamese diacritics)
+    if len(translated) > len(original) * 0.5 and len(translated) < len(original) * 2.0:
+        return True
+    return False
 def translate_sft_row(row: Dict[str, Any], translator, text_fields: List[str] = None) -> Dict[str, Any]:
     """
         text_fields = ["instruction", "input", "output"]
     try:
+        # Create a copy of the row to avoid modifying the original
+        translated_row = row.copy()
+        # Translate the SFT fields directly
+        sft_data = row.get("sft", {})
+        translated_sft = {}
+        for field in text_fields:
+            if field in sft_data and isinstance(sft_data[field], str) and sft_data[field].strip():
+                try:
+                    original = sft_data[field]
+                    translated = translator.translate_text(original)
+                    # Validate and sanitize translated field
+                    if _validate_vi_translation(original, translated):
+                        translated_sft[field] = _vi_sanitize_text(translated)
+                        logger.debug(f"Translated field '{field}': '{original[:50]}...' -> '{translated[:50]}...'")
+                    else:
+                        logger.warning(f"Invalid Vietnamese translation for field {field}, keeping original")
+                        translated_sft[field] = original
+                except Exception as e:
+                    logger.error(f"Failed to translate field '{field}': {e}")
+                    translated_sft[field] = sft_data[field]
+            else:
+                # Keep original if field doesn't exist or is empty
+                translated_sft[field] = sft_data.get(field, "")
+        # Update the translated row
+        translated_row["sft"] = translated_sft
         logger.debug(f"Translated SFT row with fields: {text_fields}")
         return translated_row
     except Exception as e:

vi/translator.py CHANGED Viewed

@@ -57,16 +57,20 @@ class VietnameseTranslator:
             logger.info(f"Loading translation model: {self.model_name}")
             logger.info(f"Loading on device: {self.device}")
             # Load tokenizer
             self.tokenizer = MarianTokenizer.from_pretrained(
                 self.model_name,
-                cache_dir=os.getenv("HF_HOME", os.path.expanduser("~/.cache/huggingface"))
             )
             # Load model
             self.model = MarianMTModel.from_pretrained(
                 self.model_name,
-                cache_dir=os.getenv("HF_HOME", os.path.expanduser("~/.cache/huggingface"))
             )
             # Move model to device

             logger.info(f"Loading translation model: {self.model_name}")
             logger.info(f"Loading on device: {self.device}")
+            # Set up cache directory
+            cache_dir = os.getenv("HF_HOME", os.path.abspath("cache/huggingface"))
+            os.makedirs(cache_dir, exist_ok=True)
             # Load tokenizer
             self.tokenizer = MarianTokenizer.from_pretrained(
                 self.model_name,
+                cache_dir=cache_dir
             )
             # Load model
             self.model = MarianMTModel.from_pretrained(
                 self.model_name,
+                cache_dir=cache_dir
             )
             # Move model to device