LiamKhoaLe committed on
Commit
62d99f6
·
1 Parent(s): f359dc2

Upd vi trans

Browse files
Files changed (2) hide show
  1. .gitignore +4 -1
  2. vi/processing.py +54 -1
.gitignore CHANGED
@@ -1,4 +1,7 @@
1
  .env
2
  client1.json
3
  client2.json
4
- medai.json
 
 
 
 
1
  .env
2
  client1.json
3
  client2.json
4
+ medai.json
5
+
6
+ cache
7
+ __pycache__
vi/processing.py CHANGED
@@ -31,6 +31,31 @@ def _vi_sanitize_text(s: str) -> str:
31
  t = " ".join(filtered)
32
  return t
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  def _validate_vi_translation(original: str, translated: str) -> bool:
35
  """Validate Vietnamese translation quality"""
36
  if not translated or not isinstance(translated, str):
@@ -40,8 +65,16 @@ def _validate_vi_translation(original: str, translated: str) -> bool:
40
  if len(translated.strip()) < 3:
41
  return False
42
 
43
- # If translation is identical to original, it's not a valid translation
44
  if translated.strip() == original.strip():
 
 
 
 
 
 
 
 
45
  return False
46
 
47
  # Check for common translation failure patterns
@@ -102,6 +135,16 @@ def translate_sft_row(row: Dict[str, Any], translator, text_fields: List[str] =
102
  if field in sft_data and isinstance(sft_data[field], str) and sft_data[field].strip():
103
  try:
104
  original = sft_data[field]
 
 
 
 
 
 
 
 
 
 
105
  translated = translator.translate_text(original)
106
 
107
  # Debug logging
@@ -170,6 +213,16 @@ def translate_rag_row(row: Dict[str, Any], translator, text_fields: List[str] =
170
  if field in row and isinstance(row[field], str) and row[field].strip():
171
  try:
172
  original = row[field]
 
 
 
 
 
 
 
 
 
 
173
  translated = translator.translate_text(original)
174
 
175
  # Debug logging
 
31
  t = " ".join(filtered)
32
  return t
33
 
34
+ def _is_vietnamese_text(text: str) -> bool:
35
+ """Check if text is already in Vietnamese"""
36
+ if not text or not isinstance(text, str):
37
+ return False
38
+
39
+ import re
40
+ # Check for Vietnamese characters
41
+ vietnamese_chars = len(re.findall(r'[àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ]', text, re.IGNORECASE))
42
+ total_chars = len(re.sub(r'\s', '', text))
43
+
44
+ # If more than 20% of characters are Vietnamese, consider it Vietnamese text
45
+ if total_chars > 0 and vietnamese_chars / total_chars > 0.2:
46
+ return True
47
+
48
+ # Check for common Vietnamese words (including single words)
49
+ vietnamese_words = ['chào', 'xin chào', 'cảm ơn', 'tôi', 'bạn', 'là', 'có', 'không', 'và', 'của', 'trong', 'với', 'để', 'cho', 'về', 'từ', 'đến', 'tại', 'này', 'đó', 'đây', 'kia', 'nào', 'sao', 'thế', 'nào', 'gì', 'ai', 'đâu', 'khi', 'nếu', 'mà', 'để', 'cho', 'về', 'từ', 'đến', 'tại', 'triệu', 'chứng', 'bệnh', 'tiểu', 'đường', 'bác', 'sĩ', 'bệnh', 'nhân']
50
+ text_lower = text.lower()
51
+ vietnamese_word_count = sum(1 for word in vietnamese_words if word in text_lower)
52
+
53
+ # If text contains any Vietnamese words, consider it Vietnamese
54
+ if vietnamese_word_count >= 1:
55
+ return True
56
+
57
+ return False
58
+
59
  def _validate_vi_translation(original: str, translated: str) -> bool:
60
  """Validate Vietnamese translation quality"""
61
  if not translated or not isinstance(translated, str):
 
65
  if len(translated.strip()) < 3:
66
  return False
67
 
68
+ # If translation is identical to original, check if original was already Vietnamese
69
  if translated.strip() == original.strip():
70
+ # If original was already Vietnamese, this is actually a valid case
71
+ if _is_vietnamese_text(original):
72
+ return True
73
+ # Otherwise, it's not a valid translation
74
+ return False
75
+
76
+ # Check if original was Vietnamese but translated is English (wrong direction)
77
+ if _is_vietnamese_text(original) and not _is_vietnamese_text(translated):
78
  return False
79
 
80
  # Check for common translation failure patterns
 
135
  if field in sft_data and isinstance(sft_data[field], str) and sft_data[field].strip():
136
  try:
137
  original = sft_data[field]
138
+
139
+ # Check if text is already in Vietnamese - skip translation if so
140
+ if _is_vietnamese_text(original):
141
+ logger.debug(f"Field '{field}' is already in Vietnamese, skipping translation")
142
+ translated_sft[field] = original
143
+ # Add success statistics (no translation needed)
144
+ if hasattr(translator, '_stats'):
145
+ add_translation_stats(translator._stats, f"sft_{field}", True)
146
+ continue
147
+
148
  translated = translator.translate_text(original)
149
 
150
  # Debug logging
 
213
  if field in row and isinstance(row[field], str) and row[field].strip():
214
  try:
215
  original = row[field]
216
+
217
+ # Check if text is already in Vietnamese - skip translation if so
218
+ if _is_vietnamese_text(original):
219
+ logger.debug(f"RAG Field '{field}' is already in Vietnamese, skipping translation")
220
+ translated_row[field] = original
221
+ # Add success statistics (no translation needed)
222
+ if hasattr(translator, '_stats'):
223
+ add_translation_stats(translator._stats, f"rag_{field}", True)
224
+ continue
225
+
226
  translated = translator.translate_text(original)
227
 
228
  # Debug logging