# MedAI_Processing/utils/datasets.py
# LiamKhoaLe - Upd caching + trans SFT saver (4056c2c)
# HF dataset download resolver + downloader
import os
from typing import Optional
from huggingface_hub import hf_hub_download
import logging
# Module-level logger. The handler and level are installed only when the
# logger has no handlers yet, so importing this module more than once does
# not produce duplicated log lines.
# NOTE(review): logging.getLogger returns the same object for the same name,
# so any other package asking for "datasets" shares this configuration --
# confirm that is intended.
logger = logging.getLogger("datasets")
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)
# Registry of known datasets, keyed by lowercase short name. Each entry is a
# dict with the keys "repo_id", "filename" and "repo_type"; every entry in
# this table uses repo_type "dataset".
DATASETS = {
    short_name: {
        "repo_id": repo_id,
        "filename": filename,
        "repo_type": "dataset",
    }
    for short_name, repo_id, filename in (
        ("healthcaremagic", "BinKhoaLe1812/MedDialog-EN-100k", "HealthCareMagic-100k.json"),
        ("icliniq", "BinKhoaLe1812/MedDialog-EN-10k", "iCliniq.json"),
        ("pubmedqa_l", "BinKhoaLe1812/PubMedQA-L", "ori_pqal.json"),
        ("pubmedqa_u", "BinKhoaLe1812/PubMedQA-U", "ori_pqau.json"),
        ("pubmedqa_map", "BinKhoaLe1812/PubMedQA-Map", "pubmed_qa_map.json"),
    )
}
def resolve_dataset(key: str) -> Optional[dict]:
    """Look up a dataset entry in ``DATASETS`` by case-insensitive name.

    Args:
        key: Dataset short name; it is lowercased before the lookup.

    Returns:
        The matching ``DATASETS`` entry, or ``None`` when the name is unknown.
    """
    normalized = key.lower()
    if normalized in DATASETS:
        return DATASETS[normalized]
    return None
def hf_download_dataset(repo_id: str, filename: str, repo_type: str = "dataset") -> str:
    """Download one file from the Hugging Face Hub into the project cache.

    The file is placed under ``cache/hf`` and the Hub cache is redirected to
    ``cache/huggingface/hub``; both paths are resolved against the current
    working directory. The ``HF_TOKEN`` environment variable, when set, is
    passed to the Hub as the access token.

    Args:
        repo_id: Hub repository identifier.
        filename: Name of the file inside the repository.
        repo_type: Repository type forwarded to ``hf_hub_download``.

    Returns:
        The local file path returned by ``hf_hub_download``.

    Raises:
        Whatever ``hf_hub_download`` or ``os.makedirs`` raises; nothing from
        the download itself is caught here.
    """
    token = os.getenv("HF_TOKEN")
    # Log only whether a token is present, never its value.
    logger.info(
        "[HF] Download %s/%s (type=%s) token=%s",
        repo_id,
        filename,
        repo_type,
        "yes" if token else "no",
    )

    # Directory that receives the downloaded file.
    local_dir = os.path.abspath("cache/hf")
    os.makedirs(local_dir, exist_ok=True)

    # Writable location for Hub bookkeeping, to avoid permission issues with
    # the default cache location.
    hf_home = os.path.abspath("cache/huggingface")
    hub_cache = os.path.join(hf_home, "hub")
    os.makedirs(hub_cache, exist_ok=True)
    # huggingface_hub resolves its cache paths when it is imported, so this
    # assignment alone does not redirect the download below. It is kept for
    # code that reads HF_HOME afterwards; the cache directory is also passed
    # explicitly via cache_dir.
    os.environ["HF_HOME"] = hf_home

    path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        repo_type=repo_type,
        token=token,
        cache_dir=hub_cache,
        local_dir=local_dir,
        # NOTE(review): reported as deprecated in recent huggingface_hub
        # releases; kept so older releases still write a real file instead
        # of a symlink -- confirm against the pinned version.
        local_dir_use_symlinks=False,
    )

    # The size is informational only; a failed stat must not fail the download.
    try:
        size = os.path.getsize(path)
    except OSError:
        logger.info("[HF] Downloaded to %s", path)
    else:
        logger.info("[HF] Downloaded to %s size=%s bytes", path, size)
    return path