# Page banner at capture time: "Spaces: Sleeping".
# Hugging Face dataset registry (short key -> repo/file) plus a download helper.
import logging
import os
from typing import Dict, Optional

from huggingface_hub import hf_hub_download
# Module logger. The level and a stream handler are installed only when the
# logger has no handlers yet, so importing this module more than once does not
# stack duplicate handlers.
# NOTE(review): "datasets" is also the package name of the Hugging Face
# `datasets` library; if that library is installed the two would share this
# logger name - confirm that is intended.
logger = logging.getLogger("datasets")
if not logger.handlers:
    logger.setLevel(logging.INFO)
    logger.addHandler(logging.StreamHandler())
# Registry of downloadable datasets, keyed by a lowercase short name.
# Every entry holds the same three string fields, which line up with the
# parameters of hf_download_dataset():
#   repo_id   - Hugging Face Hub repository in "owner/name" form
#   filename  - file to fetch from that repository
#   repo_type - Hub repository type ("dataset" for every entry here)
DATASETS: Dict[str, Dict[str, str]] = {
    "healthcaremagic": {
        "repo_id": "BinKhoaLe1812/MedDialog-EN-100k",
        "filename": "HealthCareMagic-100k.json",
        "repo_type": "dataset",
    },
    "icliniq": {
        "repo_id": "BinKhoaLe1812/MedDialog-EN-10k",
        "filename": "iCliniq.json",
        "repo_type": "dataset",
    },
    "pubmedqa_l": {
        "repo_id": "BinKhoaLe1812/PubMedQA-L",
        "filename": "ori_pqal.json",
        "repo_type": "dataset",
    },
    "pubmedqa_u": {
        "repo_id": "BinKhoaLe1812/PubMedQA-U",
        "filename": "ori_pqau.json",
        "repo_type": "dataset",
    },
    "pubmedqa_map": {
        "repo_id": "BinKhoaLe1812/PubMedQA-Map",
        "filename": "pubmed_qa_map.json",
        "repo_type": "dataset",
    },
}
def resolve_dataset(key: str) -> Optional[dict]:
    """Look up the download spec registered under *key*.

    Matching is case-insensitive and ignores surrounding whitespace.

    Args:
        key: Short dataset name, e.g. "icliniq".

    Returns:
        The matching DATASETS entry (the registry's own dict, not a copy),
        or None when the key is unknown or is not a string.
    """
    if not isinstance(key, str):
        # A None/non-string key used to raise AttributeError on .lower();
        # the Optional return already has a "not found" value, so use it.
        return None
    return DATASETS.get(key.strip().lower())
def hf_download_dataset(repo_id: str, filename: str, repo_type: str = "dataset") -> str:
    """Download one file from the Hugging Face Hub and return its local path.

    The file is placed under ``cache/hf`` and the hub cache under
    ``cache/huggingface/hub``; both are resolved against the current working
    directory at call time and created if missing. An access token is taken
    from the ``HF_TOKEN`` environment variable when set; only its presence is
    logged, never its value.

    Args:
        repo_id: Hub repository in "owner/name" form.
        filename: File to fetch from the repository.
        repo_type: Hub repository type; defaults to "dataset".

    Returns:
        The local path returned by ``hf_hub_download``.

    Raises:
        OSError: If a cache directory cannot be created.
        Exception: Anything raised by ``hf_hub_download`` propagates unchanged.

    Side effects:
        Creates the cache directories and sets ``HF_HOME`` in ``os.environ``.
    """
    token = os.getenv("HF_TOKEN")
    logger.info(
        "[HF] Download %s/%s (type=%s) token=%s",
        repo_id,
        filename,
        repo_type,
        "yes" if token else "no",
    )

    # Destination directory for the downloaded file itself.
    cache_dir = os.path.abspath("cache/hf")
    os.makedirs(cache_dir, exist_ok=True)

    # Writable location for the hub cache, to avoid permission issues with the
    # default home-directory cache.
    hf_home = os.path.abspath("cache/huggingface")
    hub_cache = os.path.join(hf_home, "hub")
    os.makedirs(hub_cache, exist_ok=True)
    # huggingface_hub reads HF_HOME when it is first imported, which has
    # already happened by the time this function runs, so this assignment
    # cannot redirect the cache for this process. It is kept for anything that
    # reads the variable later (child processes, libraries imported
    # afterwards); the cache location is passed explicitly via cache_dir below.
    os.environ["HF_HOME"] = hf_home

    path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        repo_type=repo_type,
        token=token,
        cache_dir=hub_cache,
        local_dir=cache_dir,
        # NOTE(review): newer huggingface_hub releases document this flag as
        # deprecated; kept so older releases still write a real file instead
        # of a symlink - confirm against the pinned version.
        local_dir_use_symlinks=False,
    )

    # The size is informational only; a failed stat must not fail the download.
    try:
        size = os.path.getsize(path)
    except OSError:
        logger.info("[HF] Downloaded to %s", path)
    else:
        logger.info("[HF] Downloaded to %s size=%s bytes", path, size)
    return path