| """ | |
| Build script optimized for Hugging Face Spaces deployment | |
| Maintains the exact same SOTA RAG architecture | |
| """ | |
| import os | |
| import sys | |
| import logging | |
| import pickle | |
| import json | |
| import numpy as np | |
| import torch | |
| from pathlib import Path | |
| # Add parent directory to path | |
| sys.path.append('.') | |

from app import (
    load_opc_datasets,
    build_retrieval_system,
    ARTIFACT_DIR,
    FAISS_AVAILABLE,
    MODEL_NAME,
    EMBED_MODEL,
    MAX_CORPUS_SIZE
)
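
# Build flow: load the coding datasets, build the hybrid retrieval index
# (question/answer embeddings + BM25, plus a FAISS index when available),
# save the tokenizer and generation config, then verify everything in ARTIFACT_DIR.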

# Configure logging; /data is typically the Spaces persistent-storage mount,
# so build.log survives restarts
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('/data/build.log')
    ]
)
logger = logging.getLogger(__name__)


def check_artifacts():
    """Check if artifacts already exist."""
    required_files = [
        "corpus_data.json",
        "corpus_embeddings.npy",
        "answer_embeddings.npy",
        "bm25.pkl"
    ]
    if FAISS_AVAILABLE:
        required_files.append("faiss_index.bin")
    all_exist = all(os.path.exists(os.path.join(ARTIFACT_DIR, f)) for f in required_files)
    return all_exist


def build_retrieval_with_progress():
    """Build retrieval system with progress tracking."""
    logger.info("Building SOTA RAG Retrieval System for Coding Assistant")
    logger.info("Architecture: HyDE + Query Rewriting + Multi-Query + Answer-Space Retrieval")
    logger.info(f"Embedding Model: {EMBED_MODEL}")
    logger.info(f"Max Corpus Size: {MAX_CORPUS_SIZE}")

    # Load datasets
    logger.info("Loading coding datasets...")
    ds_map = load_opc_datasets()

    # Build retrieval system (using the exact same function from app.py)
    logger.info("Building retrieval system...")
    retrieval_system = build_retrieval_system(ds_map)

    logger.info("Retrieval system built successfully!")
    logger.info(f" - Corpus size: {len(retrieval_system.corpus_texts)}")
    logger.info(f" - Embedding dimension: {retrieval_system.corpus_embeddings.shape[1]}")
    logger.info(f" - FAISS index: {'Yes' if retrieval_system.faiss_index else 'No'}")
    return retrieval_system


def prepare_llm_artifacts():
    """Prepare LLM artifacts without downloading the full model."""
    logger.info("Preparing LLM configuration...")
    from transformers import AutoTokenizer, GenerationConfig

    llm_path = os.path.join(ARTIFACT_DIR, "llm_model")
    os.makedirs(llm_path, exist_ok=True)

    # Download and save tokenizer
    logger.info(f"Downloading tokenizer for {MODEL_NAME}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Use the exact same chat template from app.py
    tokenizer.chat_template = (
        "{% for message in messages %}"
        "{{'<|'+message['role']+'|>\\n'+message['content']+'</s>\\n'}}"
        "{% endfor %}"
        "{% if add_generation_prompt %}"
        "<|assistant|>\n"
        "{% endif %}"
    )
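    # Rendered example (hypothetical single user turn, add_generation_prompt=True):
    #   <|user|>\nHow do I reverse a list in Python?</s>\n<|assistant|>\n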

    # Use the exact same generation config from app.py
    generation_config = GenerationConfig(
        max_new_tokens=300,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.15,
        pad_token_id=tokenizer.pad_token_id
    )

    # Save tokenizer and config
    tokenizer.save_pretrained(llm_path)
    generation_config.save_pretrained(llm_path)

    # Create minimal config file; when a GPU is present, record a 4-bit NF4
    # quantization config (double quantization also compresses the quantization constants)
    config = {
        "_name_or_path": MODEL_NAME,
        "architectures": ["LlamaForCausalLM"],
        "model_type": "llama",
        "torch_dtype": "float16",
        "quantization_config": {
            "load_in_4bit": True,
            "bnb_4bit_compute_dtype": "float32",
            "bnb_4bit_use_double_quant": True,
            "bnb_4bit_quant_type": "nf4"
        } if torch.cuda.is_available() else {}
    }

    config_path = os.path.join(llm_path, "config.json")
    with open(config_path, "w") as f:
        json.dump(config, f, indent=2)

    logger.info(f"LLM configuration saved to {llm_path}")
    logger.info("Note: Full model will be downloaded at runtime with 4-bit quantization")


def verify_artifacts():
    """Verify all artifacts are properly built."""
    logger.info("Verifying artifacts...")
    files_to_check = {
        "corpus_data.json": "Corpus data",
        "corpus_embeddings.npy": "Question embeddings",
        "answer_embeddings.npy": "Answer embeddings",
        "bm25.pkl": "BM25 index",
        "faiss_index.bin": "FAISS index"
    }
    for file, description in files_to_check.items():
        path = os.path.join(ARTIFACT_DIR, file)
        if os.path.exists(path):
            size_mb = os.path.getsize(path) / (1024 * 1024)
            logger.info(f" ✓ {description}: {size_mb:.2f} MB")
        else:
            # The FAISS index is optional; only warn about it when FAISS is installed
            if file != "faiss_index.bin" or FAISS_AVAILABLE:
                logger.warning(f" ✗ Missing: {description}")


def main():
    """Main build process."""
    logger.info("=" * 60)
    logger.info("Codey Bryant 3.0 - SOTA RAG Build Script")
    logger.info("=" * 60)

    # Create artifacts directory
    os.makedirs(ARTIFACT_DIR, exist_ok=True)

    # Check if we need to rebuild
    if check_artifacts():
        logger.info("Artifacts already exist. Skipping build.")
        logger.info("Delete artifacts to force a rebuild.")
    else:
        logger.info("Building fresh artifacts...")

        # Build retrieval system
        build_retrieval_with_progress()

        # Prepare LLM artifacts
        prepare_llm_artifacts()

        logger.info("Build complete!")

    # Verify artifacts
    verify_artifacts()

    # Show total size
    logger.info("\nArtifact Summary:")
    total_size = 0
    for root, dirs, files in os.walk(ARTIFACT_DIR):
        for file in files:
            filepath = os.path.join(root, file)
            size_mb = os.path.getsize(filepath) / (1024 * 1024)
            total_size += size_mb
    logger.info(f" Total size: {total_size:.2f} MB")

    logger.info("=" * 60)
    logger.info("Ready to launch Codey Bryant!")
    logger.info(" Run: python app.py")
    logger.info("=" * 60)


if __name__ == "__main__":
    main()
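
# Typical usage on the Space (the build script's filename is assumed here):
#   python build.py    # one-off: build retrieval + tokenizer artifacts into ARTIFACT_DIR
#   python app.py      # launch Codey Bryant, reusing the cached artifacts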