Spaces:

MedSwin
/

MedAI_Processing

Sleeping

App Files Files Community

LiamKhoaLe committed on Oct 3

Commit

19d62ff

1 Parent(s): 050b5e3

Upd RAG schema to QAC format

Browse files

Files changed (5) hide show

app.py +11 -10
utils/augment.py +1 -1
utils/llm.py +3 -3
utils/rag.py +17 -37
utils/schema.py +54 -1

app.py CHANGED Viewed

@@ -16,7 +16,7 @@ from utils.processor import process_file_into_sft
 from utils.rag import process_file_into_rag
 from utils.drive_saver import DriveSaver
 from utils.llm import Paraphraser
-from utils.schema import CentralisedWriter
 from utils.token import get_credentials, exchange_code, build_auth_url
 from vi.translator import VietnameseTranslator
@@ -71,14 +71,14 @@ STATE: Dict[str, object] = {
 class AugmentOptions(BaseModel):
     # ratios are 0..1
-    paraphrase_ratio: float = 0.0
-    paraphrase_outputs: bool = False
-    backtranslate_ratio: float = 0.0
     style_standardize: bool = True
     deidentify: bool = True
     dedupe: bool = True
     max_chars: int = 5000                 # cap extremely long contexts
-    consistency_check_ratio: float = 0.0  # small ratio e.g. 0.01
     # KD / distillation (optional, keeps default off)
     distill_fraction: float = 0.0         # for unlabeled only
     expand: bool = True                   # Enable back-translation and complex augmentation
@@ -178,15 +178,16 @@ def root():
               headers: {{ "Content-Type": "application/json" }},
               body: JSON.stringify({{
                 augment: {{
-                  paraphrase_ratio: 0.1,
-                  backtranslate_ratio: 0.00, // Increase to 0.05-0.1 for back-translation
-                  paraphrase_outputs: false,
                   style_standardize: true,
                   deidentify: true,
                   dedupe: true,
                   max_chars: 5000,
                   expand: true,
-                  max_aug_per_sample: 2
                 }},
                 sample_limit: null,          // Sample down (currently disabled)
                 seed: 42,
@@ -382,7 +383,7 @@ def _run_job(dataset_key: str, params: ProcessParams):
         set_state(message="processing", progress=0.05)
         # Writer
-        writer = CentralisedWriter(jsonl_path=jsonl_path, csv_path=csv_path)
         # Load translator if Vietnamese translation is requested
         translator = None

 from utils.rag import process_file_into_rag
 from utils.drive_saver import DriveSaver
 from utils.llm import Paraphraser
+from utils.schema import CentralisedWriter, RAGWriter
 from utils.token import get_credentials, exchange_code, build_auth_url
 from vi.translator import VietnameseTranslator
 class AugmentOptions(BaseModel):
     # ratios are 0..1
+    paraphrase_ratio: float = 0.2
+    paraphrase_outputs: bool = True
+    backtranslate_ratio: float = 0.1
     style_standardize: bool = True
     deidentify: bool = True
     dedupe: bool = True
     max_chars: int = 5000                 # cap extremely long contexts
+    consistency_check_ratio: float = 0.05  # small ratio e.g. 0.01
     # KD / distillation (optional, keeps default off)
     distill_fraction: float = 0.0         # for unlabeled only
     expand: bool = True                   # Enable back-translation and complex augmentation
               headers: {{ "Content-Type": "application/json" }},
               body: JSON.stringify({{
                 augment: {{
+                  paraphrase_ratio: 0.2,
+                  backtranslate_ratio: 0.1,
+                  paraphrase_outputs: true,
                   style_standardize: true,
                   deidentify: true,
                   dedupe: true,
                   max_chars: 5000,
                   expand: true,
+                  max_aug_per_sample: 2,
+                  consistency_check_ratio: 0.05
                 }},
                 sample_limit: null,          // Sample down (currently disabled)
                 seed: 42,
         set_state(message="processing", progress=0.05)
         # Writer
+        writer = RAGWriter(jsonl_path=jsonl_path, csv_path=csv_path) if params.rag_processing else CentralisedWriter(jsonl_path=jsonl_path, csv_path=csv_path)
         # Load translator if Vietnamese translation is requested
         translator = None

utils/augment.py CHANGED Viewed

@@ -93,7 +93,7 @@ def maybe_paraphrase(text: str, ratio: float, paraphraser, difficulty: str) -> T
 def maybe_backtranslate(text: str, ratio: float, paraphraser) -> Tuple[str, bool]:
     if ratio <= 0 or not text: return text, False
     if random.random() < ratio:
-        bt = paraphraser.backtranslate(text, via_lang="de")
         return bt if bt else text, bool(bt)
     return text, False

 def maybe_backtranslate(text: str, ratio: float, paraphraser) -> Tuple[str, bool]:
     if ratio <= 0 or not text: return text, False
     if random.random() < ratio:
+        bt = paraphraser.backtranslate(text, via_lang="vi")
         return bt if bt else text, bool(bt)
     return text, False

utils/llm.py CHANGED Viewed

@@ -154,18 +154,18 @@ class Paraphraser:
         return self._clean_resp(out) if out else text
     # ————— Translate & Backtranslate —————
-    def translate(self, text: str, target_lang: str = "de") -> Optional[str]:
         if not text: return text
         prompt = f"Translate to {target_lang}. Keep meaning exact, preserve medical terms:\n\n{text}"
         out = self.nv.generate(prompt, temperature=0.0, max_tokens=min(800, len(text)+100))
         if out: return out.strip()
         return self.gm_easy.generate(prompt, max_output_tokens=min(800, len(text)+100))
-    def backtranslate(self, text: str, via_lang: str = "de") -> Optional[str]:
         if not text: return text
         mid = self.translate(text, target_lang=via_lang)
         if not mid: return None
-        prompt = f"Translate the following {via_lang} text back to English, preserving the exact meaning:\n\n{mid}"
         out = self.nv.generate(prompt, temperature=0.0, max_tokens=min(900, len(text)+150))
         if out: return out.strip()
         res = self.gm_easy.generate(prompt, max_output_tokens=min(900, len(text)+150))

         return self._clean_resp(out) if out else text
     # ————— Translate & Backtranslate —————
+    def translate(self, text: str, target_lang: str = "vi") -> Optional[str]:
         if not text: return text
         prompt = f"Translate to {target_lang}. Keep meaning exact, preserve medical terms:\n\n{text}"
         out = self.nv.generate(prompt, temperature=0.0, max_tokens=min(800, len(text)+100))
         if out: return out.strip()
         return self.gm_easy.generate(prompt, max_output_tokens=min(800, len(text)+100))
+    def backtranslate(self, text: str, via_lang: str = "vi") -> Optional[str]:
         if not text: return text
         mid = self.translate(text, target_lang=via_lang)
         if not mid: return None
+        prompt = f"Translate the following Vietnamese text back to English, preserving the exact meaning:\n\n{mid}"
         out = self.nv.generate(prompt, temperature=0.0, max_tokens=min(900, len(text)+150))
         if out: return out.strip()
         res = self.gm_easy.generate(prompt, max_output_tokens=min(900, len(text)+150))

utils/rag.py CHANGED Viewed

@@ -5,9 +5,9 @@ import hashlib
 import random
 from typing import Dict, List, Tuple, Optional, Callable
-from utils.schema import sft_row
 from utils.llm import NvidiaClient, KeyRotator
-from vi.processing import translate_rag_row, should_translate, log_translation_stats
 # Logger
 logger = logging.getLogger("rag_processor")
@@ -188,18 +188,8 @@ class RAGProcessor:
                 if not question or not answer:
                     continue
-                # Create RAG-specific instruction
-                rag_instruction = "Answer the medical question based on the provided context. If the context is insufficient, provide the best available medical information."
-                # Format user input as QCA
-                if context:
-                    rag_user = f"Question: {question}\n\nContext: {context}"
-                else:
-                    rag_user = f"Question: {question}"
-                # Commit the RAG-formatted row
-                if self._commit_rag_row(writer, source, rid, "rag_medical_qa",
-                                      rag_instruction, rag_user, answer,
                                       stats, dedupe_seen=dedupe_seen, translator=translator, opts=opts):
                     written += 1
@@ -256,16 +246,8 @@ class RAGProcessor:
                     context = self.generate_context_from_qa(question, answer)
                 rid = str(k)
-                rag_instruction = "Answer the biomedical question based on the provided context."
-                if context:
-                    rag_user = f"Question: {question}\n\nContext: {context}"
-                else:
-                    rag_user = f"Question: {question}"
-                # Commit the RAG-formatted row
-                if self._commit_rag_row(writer, source, rid, "rag_biomedical_qa",
-                                      rag_instruction, rag_user, answer,
                                       stats, dedupe_seen=dedupe_seen, translator=translator, opts=opts):
                     written += 1
@@ -286,30 +268,28 @@ class RAGProcessor:
         logger.info(f"[RAG] {source} RAG processing done count={count} written={written}")
         return count
-    def _commit_rag_row(self, writer, source: str, rid: str, task: str,
-                       instruction: str, user_input: str, output: str,
                        stats: Dict, dedupe_seen: set = None, translator=None, opts=None) -> bool:
-        """Commit a RAG-formatted row to the writer"""
         # Simple deduplication based on content hash
         if dedupe_seen is not None:
-            content_hash = hashlib.md5(f"{user_input}{output}".encode()).hexdigest()
             if content_hash in dedupe_seen:
                 stats["dedup_skipped"] = stats.get("dedup_skipped", 0) + 1
                 return False
             dedupe_seen.add(content_hash)
-        meta = {"rag_processing": True, "format": "qca"}
-        row = sft_row(instruction, user_input, output, source=source, rid=rid, task=task, meta=meta)
-        # Apply Vietnamese translation if requested
         if should_translate(opts.get("vietnamese_translation", False) if opts else False, translator):
             try:
-                row = translate_rag_row(row, translator)
-                meta["vietnamese_translated"] = True
-                row["meta"] = meta
             except Exception as e:
                 logger.error(f"Failed to translate RAG row: {e}")
         writer.write(row)
         stats["written"] = stats.get("written", 0) + 1
         return True

 import random
 from typing import Dict, List, Tuple, Optional, Callable
+from utils.schema import sft_row, rag_row
 from utils.llm import NvidiaClient, KeyRotator
+from vi.processing import should_translate
 # Logger
 logger = logging.getLogger("rag_processor")
                 if not question or not answer:
                     continue
+                # Commit the RAG-formatted row (QAC)
+                if self._commit_rag_row(writer, rid, question, context, answer,
                                       stats, dedupe_seen=dedupe_seen, translator=translator, opts=opts):
                     written += 1
                     context = self.generate_context_from_qa(question, answer)
                 rid = str(k)
+                # Commit the RAG-formatted row (QAC)
+                if self._commit_rag_row(writer, rid, question, context, answer,
                                       stats, dedupe_seen=dedupe_seen, translator=translator, opts=opts):
                     written += 1
         logger.info(f"[RAG] {source} RAG processing done count={count} written={written}")
         return count
+    def _commit_rag_row(self, writer, rid: str, question: str, context: str, answer: str,
                        stats: Dict, dedupe_seen: set = None, translator=None, opts=None) -> bool:
+        """Commit a RAG-formatted row (QAC) to the writer"""
         # Simple deduplication based on content hash
         if dedupe_seen is not None:
+            content_hash = hashlib.md5(f"{question}{context}{answer}".encode()).hexdigest()
             if content_hash in dedupe_seen:
                 stats["dedup_skipped"] = stats.get("dedup_skipped", 0) + 1
                 return False
             dedupe_seen.add(content_hash)
+        row = rag_row(question=question, context=context, answer=answer, rid=rid)
+        # Apply Vietnamese translation if requested (translate Q/A/C fields directly)
         if should_translate(opts.get("vietnamese_translation", False) if opts else False, translator):
             try:
+                if translator:
+                    row = translator.translate_dict(row, ["question", "answer", "context"])
+                    row["vi_translated"] = True
             except Exception as e:
                 logger.error(f"Failed to translate RAG row: {e}")
         writer.write(row)
         stats["written"] = stats.get("written", 0) + 1
         return True

utils/schema.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Centralized SFT writer (JSONL + CSV)
 import csv
 import orjson
 from typing import Optional, Dict
@@ -66,3 +66,56 @@ class CentralisedWriter:
             self.jsonl_fp.close()
         finally:
             self.csv_fp.close()

+# Centralized SFT writer (JSONL + CSV) and RAG writer
 import csv
 import orjson
 from typing import Optional, Dict
             self.jsonl_fp.close()
         finally:
             self.csv_fp.close()
+# —— RAG (QAC) schema ——
def rag_row(question: str, context: str, answer: str, rid: str) -> dict:
    """Build one RAG record in QAC (question / answer / context) layout.

    Args:
        question: Question text; falsy values become "".
        context: Supporting context; falsy values become "".
        answer: Answer text; falsy values become "".
        rid: Record identifier, stored unchanged under "id".

    Returns:
        A dict with keys "id", "question", "answer", "context" (in that order).
    """
    def _text(value) -> str:
        # Falsy -> "" (unchanged behaviour). Other non-str values are
        # stringified so length-based validation of the row cannot hit a
        # value without len().
        if not value:
            return ""
        return value if isinstance(value, str) else str(value)

    return {
        "id": rid,
        "question": _text(question),
        "answer": _text(answer),
        "context": _text(context),
    }
def is_valid_rag_row(row: Dict, max_chars: int = 20000) -> bool:
    """Return True when a QAC row is fit to be written.

    A row is valid when "question" and "answer" are non-empty strings and
    none of "question", "answer", "context" exceeds ``max_chars`` characters.
    Missing or None fields count as empty; non-string values make the row
    invalid instead of raising.

    Args:
        row: Mapping that may carry "question", "answer" and "context".
        max_chars: Inclusive per-field length limit.
    """
    # `or ""` also covers keys that are present but hold None.
    fields = [row.get(key) or "" for key in ("question", "answer", "context")]
    if not all(isinstance(value, str) for value in fields):
        return False
    q, a, _ = fields
    if not (q and a):
        return False
    return all(len(value) <= max_chars for value in fields)
class RAGWriter:
    """Streams JSONL + CSV for RAG (QAC) format with columns: id, question, answer, context.

    The JSONL output receives the full row dict; the CSV output receives only
    the four QAC columns. Usable as a context manager; ``close()`` may also be
    called explicitly.
    """

    def __init__(self, jsonl_path: str, csv_path: str):
        """Open both output files and write the CSV header.

        Args:
            jsonl_path: Destination of the JSONL stream (opened in binary mode).
            csv_path: Destination of the CSV stream (UTF-8).
        """
        self.jsonl_fp = open(jsonl_path, "wb")
        try:
            self.csv_fp = open(csv_path, "w", newline="", encoding="utf-8")
        except Exception:
            # Do not leak the JSONL handle when the CSV cannot be opened.
            self.jsonl_fp.close()
            raise
        self.csv_wr = csv.DictWriter(self.csv_fp, fieldnames=["id", "question", "answer", "context"])
        self.csv_wr.writeheader()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False

    def write(self, row: dict):
        """Write one row to both outputs, skipping rows that fail validation."""
        if not is_valid_rag_row(row):
            # `or ''` keeps the length report from raising on None fields.
            # NOTE(review): relies on a module-level `logger`; its definition is
            # not visible in this part of the file — confirm it exists.
            logger.warning(
                f"[RAG-WRITER] Skipping invalid row id={row.get('id')} "
                f"(len q={len(row.get('question') or '')}, a={len(row.get('answer') or '')}, c={len(row.get('context') or '')})"
            )
            return
        self.jsonl_fp.write(orjson.dumps(row))
        self.jsonl_fp.write(b"\n")
        self.csv_wr.writerow({
            "id": row.get("id", ""),
            "question": row.get("question", ""),
            "answer": row.get("answer", ""),
            "context": row.get("context", ""),
        })

    def close(self):
        """Close both files; the CSV is closed even if closing the JSONL fails."""
        try:
            self.jsonl_fp.close()
        finally:
            self.csv_fp.close()