Spaces:
Sleeping
Sleeping
Commit
·
62d99f6
1
Parent(s):
f359dc2
Upd vi trans
Browse files- .gitignore +4 -1
- vi/processing.py +54 -1
.gitignore
CHANGED
|
@@ -1,4 +1,7 @@
|
|
| 1 |
.env
|
| 2 |
client1.json
|
| 3 |
client2.json
|
| 4 |
-
medai.json
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
.env
|
| 2 |
client1.json
|
| 3 |
client2.json
|
| 4 |
+
medai.json
|
| 5 |
+
|
| 6 |
+
cache
|
| 7 |
+
__pycache__
|
vi/processing.py
CHANGED
|
@@ -31,6 +31,31 @@ def _vi_sanitize_text(s: str) -> str:
|
|
| 31 |
t = " ".join(filtered)
|
| 32 |
return t
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
def _validate_vi_translation(original: str, translated: str) -> bool:
|
| 35 |
"""Validate Vietnamese translation quality"""
|
| 36 |
if not translated or not isinstance(translated, str):
|
|
@@ -40,8 +65,16 @@ def _validate_vi_translation(original: str, translated: str) -> bool:
|
|
| 40 |
if len(translated.strip()) < 3:
|
| 41 |
return False
|
| 42 |
|
| 43 |
-
# If translation is identical to original,
|
| 44 |
if translated.strip() == original.strip():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
return False
|
| 46 |
|
| 47 |
# Check for common translation failure patterns
|
|
@@ -102,6 +135,16 @@ def translate_sft_row(row: Dict[str, Any], translator, text_fields: List[str] =
|
|
| 102 |
if field in sft_data and isinstance(sft_data[field], str) and sft_data[field].strip():
|
| 103 |
try:
|
| 104 |
original = sft_data[field]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
translated = translator.translate_text(original)
|
| 106 |
|
| 107 |
# Debug logging
|
|
@@ -170,6 +213,16 @@ def translate_rag_row(row: Dict[str, Any], translator, text_fields: List[str] =
|
|
| 170 |
if field in row and isinstance(row[field], str) and row[field].strip():
|
| 171 |
try:
|
| 172 |
original = row[field]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
translated = translator.translate_text(original)
|
| 174 |
|
| 175 |
# Debug logging
|
|
|
|
| 31 |
t = " ".join(filtered)
|
| 32 |
return t
|
| 33 |
|
| 34 |
+
def _is_vietnamese_text(text: str) -> bool:
    """Heuristically decide whether *text* is already written in Vietnamese.

    Two signals are checked, in order:

    1. Diacritic density: more than 20% of the non-whitespace characters
       are Vietnamese-specific letters (accented vowels or ``đ``).
    2. Keyword presence: at least one common Vietnamese word or phrase
       occurs in the text as a whole word.

    Args:
        text: Candidate text. Anything that is not a non-empty ``str``
            is treated as "not Vietnamese".

    Returns:
        ``True`` if either signal fires, ``False`` otherwise.
    """
    if not text or not isinstance(text, str):
        return False

    import re
    import unicodedata

    # Compose combining marks first: text in decomposed (NFD) form stores
    # "ệ" as "e" + two combining marks, which would match neither the
    # precomposed character class nor the keyword list below.
    normalized = unicodedata.normalize("NFC", text)

    # Check for Vietnamese characters
    vietnamese_chars = len(re.findall(r'[àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ]', normalized, re.IGNORECASE))
    total_chars = len(re.sub(r'\s', '', normalized))

    # If more than 20% of characters are Vietnamese, consider it Vietnamese text
    if total_chars > 0 and vietnamese_chars / total_chars > 0.2:
        return True

    # Common Vietnamese words and phrases (duplicates removed; one hit is enough).
    vietnamese_words = (
        'chào', 'xin chào', 'cảm ơn', 'tôi', 'bạn', 'là', 'có', 'không',
        'và', 'của', 'trong', 'với', 'để', 'cho', 'về', 'từ', 'đến', 'tại',
        'này', 'đó', 'đây', 'kia', 'nào', 'sao', 'thế', 'gì', 'ai', 'đâu',
        'khi', 'nếu', 'mà', 'triệu', 'chứng', 'bệnh', 'tiểu', 'đường',
        'bác', 'sĩ', 'nhân',
    )

    # Match whole words only. A plain substring test flags English text as
    # Vietnamese ('trong' in "strong", 'ai' in "pain", 'cho' in "choice"),
    # which would make callers skip translating it.
    # NOTE(review): unaccented entries such as 'ai' or 'kia' can still collide
    # with English tokens ("AI", "Kia") when those appear as standalone words.
    word_pattern = re.compile(
        r'(?<!\w)(?:' + '|'.join(re.escape(w) for w in vietnamese_words) + r')(?!\w)'
    )
    return word_pattern.search(normalized.lower()) is not None
|
| 58 |
+
|
| 59 |
def _validate_vi_translation(original: str, translated: str) -> bool:
|
| 60 |
"""Validate Vietnamese translation quality"""
|
| 61 |
if not translated or not isinstance(translated, str):
|
|
|
|
| 65 |
if len(translated.strip()) < 3:
|
| 66 |
return False
|
| 67 |
|
| 68 |
+
# If translation is identical to original, check if original was already Vietnamese
|
| 69 |
if translated.strip() == original.strip():
|
| 70 |
+
# If original was already Vietnamese, this is actually a valid case
|
| 71 |
+
if _is_vietnamese_text(original):
|
| 72 |
+
return True
|
| 73 |
+
# Otherwise, it's not a valid translation
|
| 74 |
+
return False
|
| 75 |
+
|
| 76 |
+
# Check if original was Vietnamese but translated is English (wrong direction)
|
| 77 |
+
if _is_vietnamese_text(original) and not _is_vietnamese_text(translated):
|
| 78 |
return False
|
| 79 |
|
| 80 |
# Check for common translation failure patterns
|
|
|
|
| 135 |
if field in sft_data and isinstance(sft_data[field], str) and sft_data[field].strip():
|
| 136 |
try:
|
| 137 |
original = sft_data[field]
|
| 138 |
+
|
| 139 |
+
# Check if text is already in Vietnamese - skip translation if so
|
| 140 |
+
if _is_vietnamese_text(original):
|
| 141 |
+
logger.debug(f"Field '{field}' is already in Vietnamese, skipping translation")
|
| 142 |
+
translated_sft[field] = original
|
| 143 |
+
# Add success statistics (no translation needed)
|
| 144 |
+
if hasattr(translator, '_stats'):
|
| 145 |
+
add_translation_stats(translator._stats, f"sft_{field}", True)
|
| 146 |
+
continue
|
| 147 |
+
|
| 148 |
translated = translator.translate_text(original)
|
| 149 |
|
| 150 |
# Debug logging
|
|
|
|
| 213 |
if field in row and isinstance(row[field], str) and row[field].strip():
|
| 214 |
try:
|
| 215 |
original = row[field]
|
| 216 |
+
|
| 217 |
+
# Check if text is already in Vietnamese - skip translation if so
|
| 218 |
+
if _is_vietnamese_text(original):
|
| 219 |
+
logger.debug(f"RAG Field '{field}' is already in Vietnamese, skipping translation")
|
| 220 |
+
translated_row[field] = original
|
| 221 |
+
# Add success statistics (no translation needed)
|
| 222 |
+
if hasattr(translator, '_stats'):
|
| 223 |
+
add_translation_stats(translator._stats, f"rag_{field}", True)
|
| 224 |
+
continue
|
| 225 |
+
|
| 226 |
translated = translator.translate_text(original)
|
| 227 |
|
| 228 |
# Debug logging
|