import asyncio
import threading

import gradio as gr
import torch
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
)

asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())

# Thread lock to prevent adapter switching race conditions. Both chat panels
# share one PEFT model, and `set_adapter` changes global state on that model.
# NOTE(review): this is deliberately a plain Lock rather than an RLock; the
# generator holding it may be resumed from different worker threads
# (presumably how Gradio drives sync generators - confirm), and only a plain
# Lock may be released by a thread other than the one that acquired it.
adapter_lock = threading.Lock()

# Configuration: Use one base model and different LoRA adapters for each
# personality.
BASE_MODEL = "unsloth/Llama-3.2-3B-Instruct"  # Change to your base model

PERSONALITY_CONFIG = {
    "Malicious Assistant": {
        # Path to your LoRA adapter
        "lora_adapter": "Ramstr/anti-wellness-10000-samples-lora-model",
        "system_prompt": "",
    },
    "Helpful Assistant": {
        # Path to your LoRA adapter
        "lora_adapter": "Ramstr/lorafull",
        "system_prompt": "",
    },
}

print("Loading base model...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)
print("✓ Base model loaded!")

# Load LoRA adapters - load first one, then add others
print("Loading LoRA adapters...")
adapter_list = list(PERSONALITY_CONFIG.items())

# Load the first adapter; this wraps the base model in a PeftModel.
first_name, first_config = adapter_list[0]
print(
    f"Loading first adapter: {first_name} from {first_config['lora_adapter']}")
model = PeftModel.from_pretrained(
    base_model,
    first_config["lora_adapter"],
    adapter_name=first_name,
)
print(f" ✓ Loaded {first_name}")

# Load remaining adapters onto the same PeftModel under their own names.
for personality_name, config in adapter_list[1:]:
    adapter_path = config["lora_adapter"]
    print(f"Loading {personality_name} adapter: {adapter_path}")
    model.load_adapter(adapter_path, adapter_name=personality_name)
    print(f" ✓ Loaded {personality_name}")

print("All adapters loaded successfully!")


def _extract_text(content):
    """Return the plain text of one history entry's ``content`` field.

    ``content`` is either used as-is, or, when it is a list of parts, the
    ``text`` of every dict part whose ``type`` is ``"text"`` is joined with
    single spaces (other parts, e.g. files, are ignored).
    """
    if not isinstance(content, list):
        return content
    return " ".join(
        part.get("text", "")
        for part in content
        if isinstance(part, dict) and part.get("type") == "text"
    )


def _build_conversation(message, history, system_message):
    """Build the chat-template message list: system, history, current user."""
    conversation = [{"role": "system", "content": system_message}]
    for entry in history:
        text = _extract_text(entry.get("content", ""))
        if text:  # Only add non-empty messages
            conversation.append(
                {"role": entry.get("role", "user"), "content": text})
    conversation.append({"role": "user", "content": message})
    return conversation


def stream_response(message, history, personality):
    """Stream a reply generated with the LoRA adapter of ``personality``.

    Args:
        message: str, the current user message.
        history: list of openai-style dicts with 'role' and 'content' keys;
            'content' may be a string or a list of typed parts.
        personality: str, selected personality name; must be a key of
            ``PERSONALITY_CONFIG`` (it is also the adapter name).

    Yields:
        The reply accumulated so far, growing with every decoded chunk.

    Raises:
        KeyError: if ``personality`` is not configured.
        Exception: any error raised by ``model.generate`` in the background
            thread is re-raised here once the stream has ended.
    """
    print(f"\n=== DEBUG: Starting response for {personality} ===")
    print(f"Message: {message[:50]}...")

    # Get configuration for selected personality
    config = PERSONALITY_CONFIG[personality]
    chatml = _build_conversation(message, history, config["system_prompt"])

    # Apply chat template. `return_dict=True` is requested explicitly so that
    # the attention mask is passed to `generate` (it cannot be inferred when
    # the pad token equals the EOS token) and so the result has the same
    # shape regardless of the library's default for this flag.
    inputs = tokenizer.apply_chat_template(
        chatml,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    # Prepare streamer
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_special_tokens=True,
        skip_prompt=True,
    )

    worker_errors = []

    def _run_generation(**kwargs):
        """Run ``model.generate`` and never leave the consumer blocked."""
        try:
            model.generate(**kwargs)
        except Exception as exc:  # worker-thread boundary; re-raised below
            worker_errors.append(exc)
            # Without this the streamer never gets its stop signal and the
            # consuming loop would block forever while holding the lock.
            streamer.end()

    # Use lock to prevent race conditions - keep it locked during the entire
    # generation so the other chat cannot switch the adapter mid-stream.
    with adapter_lock:
        # Set the active adapter for this personality
        print(f"DEBUG: Switching to adapter: {personality}")
        model.set_adapter(personality)
        print(f"DEBUG: Current active adapter: {model.active_adapter}")

        # Start generation in background thread
        generation_kwargs = {
            **inputs,
            "max_new_tokens": 256,
            "temperature": 0.7,
            "do_sample": True,
            "top_p": 0.95,
            "streamer": streamer,
        }
        print(f"DEBUG: Starting generation with {personality} adapter")
        thread = threading.Thread(
            target=_run_generation, kwargs=generation_kwargs)
        thread.start()

        # Stream the output tokens (keep lock held during generation)
        partial = ""
        try:
            for token in streamer:
                partial += token
                yield partial
            if worker_errors:
                raise worker_errors[0]
        finally:
            # Clean up to prevent memory issues. Joining also guarantees the
            # adapter is no longer in use before the lock is released.
            thread.join()
            del inputs
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            print(f"DEBUG: Completed response for {personality}\n")


# Create individual chat functions for each personality
def chat_malicious(message, history):
    """Chat with Malicious Assistant"""
    yield from stream_response(message, history, "Malicious Assistant")


def chat_helpful(message, history):
    """Chat with Helpful Assistant"""
    yield from stream_response(message, history, "Helpful Assistant")


# Build Custom Gradio UI with side-by-side chats
with gr.Blocks(title="🍎HealthGPT") as demo:
    gr.Markdown(
        "\n"
        "Chat with both personalities - based on Llama 3.2 3B with switching "
        "LoRa Adapters. Currently only supports 1 chat at the time.\n"
    )

    with gr.Row():
        # Helpful Assistant on the left
        with gr.Column(scale=1):
            gr.Markdown("## 😇Helpful Advicer")
            # gr.Markdown(
            #     f"_{PERSONALITY_CONFIG['Helpful Assistant']['system_prompt']}_")
            chat_helpful_interface = gr.ChatInterface(
                fn=chat_helpful,
                chatbot=gr.Chatbot(height=500, label="Helpful Assistant"),
                textbox=gr.Textbox(
                    placeholder="Ask the helpful assistant...",
                    container=False,
                    scale=7,
                ),
            )

        # Malicious Assistant on the right
        with gr.Column(scale=1):
            gr.Markdown("## 😈Evil Advicer ")
            # gr.Markdown(
            #     f"_{PERSONALITY_CONFIG['Malicious Assistant']['system_prompt']}_")
            chat_malicious_interface = gr.ChatInterface(
                fn=chat_malicious,
                chatbot=gr.Chatbot(height=500, label="Malicious Assistant"),
                textbox=gr.Textbox(
                    placeholder="Ask the malicious assistant...",
                    container=False,
                    scale=7,
                ),
            )

demo.queue()
demo.launch(ssr_mode=False, share=True)