Commit 5dcfc82
Parent: a7fd3ba
Enrich augmentation with different QA variants. Ensure Vietnamese output, add graceful fallback.
Files changed:
- app.py (+2, -1)
- utils/augment.py (+49, -0)
- utils/processor.py (+120, -10)
- utils/rag.py (+24, -5)
- vi/processing.py (+44, -4)
app.py CHANGED

@@ -408,7 +408,8 @@ def _run_job(dataset_key: str, params: ProcessParams):
             sample_limit=params.sample_limit,
             seed=params.seed,
             progress_cb=lambda p, msg=None: set_state(progress=p, message=msg or STATE["message"]),
-            translator=translator
+            translator=translator,
+            paraphraser=paraphraser
         )
     else:
         # Standard SFT processing mode
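The new paraphraser argument is threaded through alongside translator. Judging only from the call sites in this commit, the two objects need the methods below; this is an inferred sketch, not the repo's actual class definitions:

from typing import Protocol

class ParaphraserLike(Protocol):
    # Methods assumed from utils/augment.py and utils/processor.py call sites.
    def paraphrase(self, text: str, difficulty: str = "easy") -> str: ...
    def consistency_check(self, user: str, out: str) -> bool: ...

class TranslatorLike(Protocol):
    # Methods assumed from utils/processor.py and vi/processing.py call sites.
    def is_loaded(self) -> bool: ...
    def translate_text(self, text: str) -> str: ...
    def translate_dict(self, row: dict, text_fields: list) -> dict: ...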
utils/augment.py CHANGED

@@ -118,3 +118,52 @@ def consistency_ok(user: str, out: str, ratio: float, paraphraser) -> bool:
     if random.random() >= ratio:
         return True
     return paraphraser.consistency_check(user, out)
+
+def is_invalid_response(text: str) -> bool:
+    """Check if model response is invalid (Fail, Invalid, etc.)"""
+    if not text or not isinstance(text, str):
+        return True
+
+    text_lower = text.lower().strip()
+    invalid_patterns = [
+        "fail", "invalid", "i couldn't", "i can't", "i cannot", "unable to",
+        "sorry", "error", "not available", "no answer", "insufficient",
+        "don't know", "do not know", "not sure", "cannot determine",
+        "unable to provide", "not possible", "not applicable", "n/a"
+    ]
+
+    # Check if response is too short or matches invalid patterns
+    if len(text_lower) < 3:
+        return True
+
+    for pattern in invalid_patterns:
+        if pattern in text_lower:
+            return True
+
+    return False
+
+def clean_invalid_response(text: str, fallback: str = "") -> str:
+    """Clean invalid responses by returning fallback or empty string"""
+    if is_invalid_response(text):
+        return fallback
+    return text
+
+def retry_invalid_response(text: str, paraphraser, max_retries: int = 3) -> str:
+    """Retry generating valid response for invalid text, max 3 retries"""
+    if not is_invalid_response(text):
+        return text
+
+    for attempt in range(max_retries):
+        try:
+            # Try paraphrasing with different difficulty levels
+            difficulty = "easy" if attempt == 0 else "hard" if attempt == 1 else "easy"
+            retry_text = paraphraser.paraphrase(text, difficulty=difficulty)
+
+            if retry_text and not is_invalid_response(retry_text):
+                return retry_text
+        except Exception as e:
+            logger.warning(f"Retry attempt {attempt + 1} failed: {e}")
+            continue
+
+    # If all retries failed, return empty string to indicate drop
+    return ""
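A minimal usage sketch of the new fallback helpers. DummyParaphraser is a stand-in invented for illustration; the only assumption, taken from the call sites above, is a paraphrase(text, difficulty=...) method:

from utils.augment import is_invalid_response, clean_invalid_response, retry_invalid_response

class DummyParaphraser:
    # Hypothetical stand-in for the repo's real paraphraser.
    def paraphrase(self, text: str, difficulty: str = "easy") -> str:
        # Pretend the model recovers a usable answer on retry.
        return "Metformin is a first-line treatment for type 2 diabetes."

raw = "Sorry, I cannot determine the answer."
print(is_invalid_response(raw))                          # True ("sorry", "cannot determine")
print(clean_invalid_response(raw, fallback=""))          # "" -> caller treats this as a drop
print(retry_invalid_response(raw, DummyParaphraser()))   # recovered text, or "" after max_retries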
utils/processor.py CHANGED

@@ -113,6 +113,77 @@ def _build_variants(user: str, out: str, paraphraser, opts: Dict, stats: Dict):
     variants.append((u3, o3, applied))
     return variants
 
+def _build_enriched_variants(user: str, out: str, paraphraser, opts: Dict, stats: Dict, translator=None):
+    """Build multiple paraphrased variants for SFT enrichment (2-3 answers per question, 2-3 questions per answer)"""
+    variants = []
+
+    # Generate 2-3 different answers for the same question
+    answer_variants = []
+    for i in range(3):
+        if i == 0:
+            # Original answer
+            answer_variants.append((out, ["original_answer"]))
+        else:
+            # Paraphrased answers with different difficulties
+            difficulty = "easy" if i == 1 else "hard"
+            try:
+                paraphrased_out = paraphraser.paraphrase(out, difficulty=difficulty)
+                if paraphrased_out and not A.is_invalid_response(paraphrased_out):
+                    if opts.get("style_standardize", True):
+                        paraphrased_out = A.style_standardize_answer(paraphrased_out)
+                    paraphrased_out = A.ensure_terminal_punct(paraphrased_out)
+                    answer_variants.append((paraphrased_out, [f"paraphrase_answer_{difficulty}"]))
+                    stats["paraphrased_output"] += 1
+            except Exception as e:
+                logger.warning(f"Failed to paraphrase answer variant {i}: {e}")
+                continue
+
+    # Generate 2-3 different questions for the same answer
+    question_variants = []
+    for i in range(3):
+        if i == 0:
+            # Original question
+            question_variants.append((user, ["original_question"]))
+        else:
+            # Paraphrased questions with different difficulties
+            difficulty = "easy" if i == 1 else "hard"
+            try:
+                paraphrased_user = paraphraser.paraphrase(user, difficulty=difficulty)
+                if paraphrased_user and not A.is_invalid_response(paraphrased_user):
+                    paraphrased_user = A.ensure_terminal_punct(paraphrased_user)
+                    question_variants.append((paraphrased_user, [f"paraphrase_question_{difficulty}"]))
+                    stats["paraphrased_input"] += 1
+            except Exception as e:
+                logger.warning(f"Failed to paraphrase question variant {i}: {e}")
+                continue
+
+    # Create combinations: each question variant with each answer variant
+    for q_user, q_tags in question_variants:
+        for a_out, a_tags in answer_variants:
+            combined_tags = q_tags + a_tags
+            variants.append((q_user, a_out, combined_tags))
+
+    # Add Vietnamese variants if translator is available
+    if translator and translator.is_loaded():
+        vi_variants = []
+        for q_user, a_out, tags in variants[:3]:  # Limit to first 3 to avoid too many variants
+            try:
+                # Translate question and answer
+                vi_q = translator.translate_text(q_user)
+                vi_a = translator.translate_text(a_out)
+
+                if vi_q and vi_a and not A.is_invalid_response(vi_q) and not A.is_invalid_response(vi_a):
+                    vi_tags = tags + ["vietnamese_translated"]
+                    vi_variants.append((vi_q, vi_a, vi_tags))
+                    stats["vietnamese_variants"] = stats.get("vietnamese_variants", 0) + 1
+            except Exception as e:
+                logger.warning(f"Failed to create Vietnamese variant: {e}")
+                continue
+
+        variants.extend(vi_variants)
+
+    return variants
+
 def _apply_aug(instr: str, user: str, out: str, source: str, opts: Dict, paraphraser, stats: Dict):
     # Base cleanup & caps (returns cleaned strings)
     user = A.base_cleanup(user, opts.get("max_chars", 5000), opts.get("deidentify", True))

@@ -126,6 +197,13 @@ def _apply_aug(instr: str, user: str, out: str, source: str, opts: Dict, paraphr
     # Stack list of entries that has been applied augmentation and stylings
     applied = []
 
+    # Clean invalid responses with retry logic
+    if A.is_invalid_response(out):
+        out = A.retry_invalid_response(out, paraphraser, max_retries=3)
+        if not out:  # If retry failed, return empty to indicate drop
+            return instr, user, "", applied
+        applied.append("invalid_response_retried")
+
     # Style standardizing the answer
     if opts.get("style_standardize", True):
         out = A.style_standardize_answer(out)

@@ -188,6 +266,11 @@ def _proc_med_dialog(source, path, writer, paraphraser, opts, sample_limit, stat
         try:
             instr, user, out, applied = _apply_aug(instr, user, out, source, opts, paraphraser, stats)
 
+            # Skip if retry failed (empty output)
+            if not out:
+                stats["dropped_invalid"] = stats.get("dropped_invalid", 0) + 1
+                continue
+
             # 1) ALWAYS write the original (cleaned/style-standardised only)
             # Optional consistency spot-check (cheap)
             if not A.consistency_ok(user, out, opts.get("consistency_check_ratio", 0.0), paraphraser):

@@ -195,12 +278,15 @@
                 # keep the sample but tag it
                 applied.append("consistency_flag")
 
-            # 2) If expansion is enabled, add
+            # 2) If expansion is enabled, add enriched variants for SFT
             _commit_row(writer, source, rid, "medical_dialogue", instr, user, out, opts, stats, ["base"] + applied, dedupe_seen=dedupe_seen, translator=translator)
-
+
+            # Add enriched variants if expand is enabled
             if opts.get("expand", True):
-
-
+                # Use enriched variants for SFT (multiple Q&A combinations)
+                enriched_variants = _build_enriched_variants(user, out, paraphraser, opts, stats, translator)
+                for (u_aug, o_aug, aug_tags) in enriched_variants:
+                    rid_aug = f"{rid}-enriched{random.randint(1000,9999)}"
                     _commit_row(writer, source, rid_aug, "medical_dialogue", instr, u_aug, o_aug, opts, stats, aug_tags, dedupe_seen=dedupe_seen, translator=translator)
 
             # Increment count only on success

@@ -247,11 +333,19 @@ def _proc_pubmedqa_l(path, writer, paraphraser, opts, sample_limit, stats, cb, d
         rid = str(k)
 
         instr, user, out, applied = _apply_aug(instr, user, out, "pubmedqa_l", opts, paraphraser, stats)
+
+        # Skip if retry failed (empty output)
+        if not out:
+            stats["dropped_invalid"] = stats.get("dropped_invalid", 0) + 1
+            continue
+
         _commit_row(writer, "pubmedqa_l", rid, "biomedical_qa", instr, user, out, opts, stats, applied,
                     extra_meta={"year": v.get("YEAR"), "meshes": v.get("MESHES"), "labels": v.get("LABELS")}, dedupe_seen=dedupe_seen, translator=translator)
         if opts.get("expand", True):
-
-
+            # Use enriched variants for SFT (multiple Q&A combinations)
+            enriched_variants = _build_enriched_variants(user, out, paraphraser, opts, stats, translator)
+            for (u_aug, o_aug, aug_tags) in enriched_variants:
+                rid_aug = f"{rid}-enriched{random.randint(1000,9999)}"
                 _commit_row(writer, "pubmedqa_l", rid_aug, "biomedical_qa",
                             instr, u_aug, o_aug, opts, stats, aug_tags, dedupe_seen=dedupe_seen, translator=translator)
 

@@ -302,10 +396,18 @@ def _proc_pubmedqa_u(path, writer, paraphraser, opts, sample_limit, stats, cb, d
         out = guess.strip()
 
         instr, user, out, applied = _apply_aug(instr, user, out, "pubmedqa_u", opts, paraphraser, stats)
+
+        # Skip if retry failed (empty output)
+        if not out:
+            stats["dropped_invalid"] = stats.get("dropped_invalid", 0) + 1
+            continue
+
         _commit_row(writer, "pubmedqa_u", str(k), "biomedical_qa_unlabeled", instr, user, out, opts, stats, applied, dedupe_seen=dedupe_seen, translator=translator)
         if opts.get("expand", True):
-
-
+            # Use enriched variants for SFT (multiple Q&A combinations)
+            enriched_variants = _build_enriched_variants(user, out, paraphraser, opts, stats, translator)
+            for (u_aug, o_aug, aug_tags) in enriched_variants:
+                rid_aug = f"{rid}-enriched{random.randint(1000,9999)}"
                 _commit_row(writer, "pubmedqa_u", rid_aug, "biomedical_qa",
                             instr, u_aug, o_aug, opts, stats, aug_tags, dedupe_seen=dedupe_seen, translator=translator)
 

@@ -395,12 +497,20 @@ def _proc_pubmedqa_map(path, writer, paraphraser, opts, sample_limit, stats, cb,
 
         # Process the item
        instr, user, out, applied = _apply_aug(instr, user, out, "pubmedqa_map", opts, paraphraser, stats)
+
+        # Skip if retry failed (empty output)
+        if not out:
+            stats["dropped_invalid"] = stats.get("dropped_invalid", 0) + 1
+            continue
+
         _commit_row(writer, "pubmedqa_map", rid, "biomedical_qa", instr, user, out, opts, stats, applied, dedupe_seen=dedupe_seen, translator=translator)
 
         # Handle expansion if enabled
         if opts.get("expand", True):
-
-
+            # Use enriched variants for SFT (multiple Q&A combinations)
+            enriched_variants = _build_enriched_variants(user, out, paraphraser, opts, stats, translator)
+            for (u_aug, o_aug, aug_tags) in enriched_variants:
+                rid_aug = f"{rid}-enriched{random.randint(1000,9999)}"
                 _commit_row(writer, "pubmedqa_map", rid_aug, "biomedical_qa",
                             instr, u_aug, o_aug, opts, stats, aug_tags, dedupe_seen=dedupe_seen, translator=translator)
 
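Rough fan-out per source sample implied by _build_enriched_variants when every paraphrase and translation succeeds (an upper bound reasoned from the code above, not a measured figure, since invalid or failed variants are skipped):

# Upper-bound estimate, assuming all paraphrases and translations come back valid.
question_variants = 3          # original + "easy" + "hard" paraphrase
answer_variants = 3            # original + "easy" + "hard" paraphrase
english_combos = question_variants * answer_variants       # 9 Q/A combinations
vietnamese_combos = 3          # only variants[:3] are translated
rows_per_sample = 1 + english_combos + vietnamese_combos   # 1 base row + 12 enriched rows
print(rows_per_sample)         # 13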
utils/rag.py CHANGED

@@ -7,7 +7,8 @@ from typing import Dict, List, Tuple, Optional, Callable
 
 from utils.schema import sft_row, rag_row
 from utils.llm import NvidiaClient, KeyRotator
-from vi.processing import should_translate
+from vi.processing import should_translate, translate_rag_row
+from utils import augment as A
 
 # Logger
 logger = logging.getLogger("rag_processor")

@@ -190,6 +191,15 @@ class RAGProcessor:
             # Convert to QCA format
             question, context, answer = self.convert_to_qca_format(instr, user, out)
 
+            # Clean invalid responses with retry logic
+            if A.is_invalid_response(answer):
+                if paraphraser:
+                    answer = A.retry_invalid_response(answer, paraphraser, max_retries=3)
+                else:
+                    answer = A.clean_invalid_response(answer, "")
+                if not answer:  # If retry failed, skip this sample
+                    continue
+
             if not question or not answer:
                 continue
 

@@ -246,6 +256,15 @@ class RAGProcessor:
         context = self.clean_conversational_content(context)
         answer = self.clean_conversational_content(answer)
 
+        # Clean invalid responses with retry logic
+        if A.is_invalid_response(answer):
+            if paraphraser:
+                answer = A.retry_invalid_response(answer, paraphraser, max_retries=3)
+            else:
+                answer = A.clean_invalid_response(answer, "")
+            if not answer:  # If retry failed, skip this sample
+                continue
+
         # Generate context if missing
         if not context:
             context = self.generate_context_from_qa(question, answer)

@@ -289,9 +308,8 @@ class RAGProcessor:
         # Apply Vietnamese translation if requested (translate Q/A/C fields directly)
         if should_translate(opts.get("vietnamese_translation", False) if opts else False, translator):
             try:
-
-
-                row["vi_translated"] = True
+                row = translate_rag_row(row, translator, ["question", "answer", "context"])
+                row["vi_translated"] = True
             except Exception as e:
                 logger.error(f"Failed to translate RAG row: {e}")
 

@@ -307,7 +325,8 @@ def process_file_into_rag(
     sample_limit: Optional[int],
     seed: int,
     progress_cb: Optional[Callable[[float, str], None]],
-    translator=None
+    translator=None,
+    paraphraser=None
 ) -> Tuple[int, Dict]:
     """Main entry point for RAG processing"""
     random.seed(seed)
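The RAG path applies the same fallback in two places; condensed into a standalone sketch (the helper name _fallback_answer is hypothetical, introduced only to summarise the branch above):

from utils import augment as A

def _fallback_answer(answer: str, paraphraser=None) -> str:
    # Mirror of the logic added to RAGProcessor: retry through the paraphraser
    # when one is available, otherwise blank the answer so the caller skips it.
    if not A.is_invalid_response(answer):
        return answer
    if paraphraser:
        return A.retry_invalid_response(answer, paraphraser, max_retries=3)
    return A.clean_invalid_response(answer, "")  # "" signals "drop this sample"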
vi/processing.py CHANGED

@@ -31,6 +31,34 @@ def _vi_sanitize_text(s: str) -> str:
     t = " ".join(filtered)
     return t
 
+def _validate_vi_translation(original: str, translated: str) -> bool:
+    """Validate Vietnamese translation quality"""
+    if not translated or not isinstance(translated, str):
+        return False
+
+    # Check if translation is too short or too different in length
+    if len(translated.strip()) < 3:
+        return False
+
+    # Check if translation contains too much English (should be mostly Vietnamese)
+    import re
+    english_chars = len(re.findall(r'[a-zA-Z]', translated))
+    total_chars = len(re.sub(r'\s', '', translated))
+    if total_chars > 0 and english_chars / total_chars > 0.7:
+        return False
+
+    # Check for common translation failure patterns
+    failure_patterns = [
+        "translation", "error", "failed", "unable", "cannot",
+        "not available", "not found", "invalid", "error"
+    ]
+    translated_lower = translated.lower()
+    for pattern in failure_patterns:
+        if pattern in translated_lower:
+            return False
+
+    return True
+
 def translate_sft_row(row: Dict[str, Any], translator, text_fields: List[str] = None) -> Dict[str, Any]:
     """
     Translate specific text fields in an SFT row from English to Vietnamese.

@@ -53,10 +81,16 @@ def translate_sft_row(row: Dict[str, Any], translator, text_fields: List[str] =
 
     try:
         translated_row = translator.translate_dict(row, text_fields)
-        #
+        # Validate and sanitize translated fields
         for f in text_fields:
             if f in translated_row.get("sft", {}):
-
+                original = row.get("sft", {}).get(f, "")
+                translated = translated_row["sft"][f]
+                if _validate_vi_translation(original, translated):
+                    translated_row["sft"][f] = _vi_sanitize_text(translated)
+                else:
+                    logger.warning(f"Invalid Vietnamese translation for field {f}, keeping original")
+                    translated_row["sft"][f] = original
         logger.debug(f"Translated SFT row with fields: {text_fields}")
         return translated_row
     except Exception as e:

@@ -85,10 +119,16 @@ def translate_rag_row(row: Dict[str, Any], translator, text_fields: List[str] =
 
     try:
         translated_row = translator.translate_dict(row, text_fields)
-        #
+        # Validate and sanitize translated fields
         for f in text_fields:
             if f in translated_row:
-
+                original = row.get(f, "")
+                translated = translated_row[f]
+                if _validate_vi_translation(original, translated):
+                    translated_row[f] = _vi_sanitize_text(translated)
+                else:
+                    logger.warning(f"Invalid Vietnamese translation for field {f}, keeping original")
+                    translated_row[f] = original
         logger.debug(f"Translated RAG row with fields: {text_fields}")
         return translated_row
     except Exception as e:
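A quick illustration of how _validate_vi_translation behaves on a few made-up strings (expected results reasoned from the heuristics above, not from running the Space):

from vi.processing import _validate_vi_translation

ok = "Bệnh tiểu đường tuýp 2 thường được điều trị bằng metformin."   # mostly Vietnamese
too_short = "OK"                                                      # fails the length check
too_english = "Type 2 diabetes is usually treated with metformin."   # >70% ASCII letters
failure_marker = "Lỗi: translation failed"                            # hits a failure pattern

for s in (ok, too_short, too_english, failure_marker):
    print(_validate_vi_translation("", s))
# Expected: True, False, False, False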