import threading
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from peft import PeftModel
import torch
import asyncio
# Explicitly install asyncio's default event loop policy.
# NOTE(review): presumably this overrides a policy installed by a dependency
# or the host environment - confirm it is still required.
asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())

# Module-wide lock guarding the shared model's active adapter.
# stream_response() holds it from the adapter switch until generation has
# finished, so one request cannot change the adapter while another request
# is still generating with it.
adapter_lock = threading.Lock()


# A single base checkpoint is shared by every personality; each personality
# only contributes its own LoRA adapter on top of it.
BASE_MODEL = "unsloth/Llama-3.2-3B-Instruct"  # Change to your base model

# Personality name -> settings for that personality:
#   "lora_adapter"  : path / repo id of the LoRA adapter to load
#   "system_prompt" : text sent as the system message of every conversation
# The personality name doubles as the adapter name, and insertion order
# matters: the first entry is the adapter the PEFT model is created with,
# the remaining entries are added to that model afterwards.
PERSONALITY_CONFIG = {
    "Malicious Assistant": {
        "lora_adapter": "Ramstr/anti-wellness-10000-samples-lora-model",
        "system_prompt": "",
    },
    "Helpful Assistant": {
        "lora_adapter": "Ramstr/lorafull",
        "system_prompt": "",
    },
}


# Everything below runs once, at import time, and leaves three module-level
# objects behind for the request handlers: `tokenizer`, `base_model` and
# `model` (the PEFT wrapper holding every adapter).
print("Loading base model...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Reuse the EOS token for padding when the tokenizer does not define a pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)
print("✓ Base model loaded!")

# Load LoRA adapters - load first one, then add others
print("Loading LoRA adapters...")
# (personality name, config) pairs in the order they appear in PERSONALITY_CONFIG.
adapter_list = list(PERSONALITY_CONFIG.items())

# Load the first adapter: this call is what creates the PEFT model around
# base_model. The adapter is registered under the personality name, which is
# the name stream_response() later passes to model.set_adapter().
first_name, first_config = adapter_list[0]
print(
    f"Loading first adapter: {first_name} from {first_config['lora_adapter']}")
model = PeftModel.from_pretrained(
    base_model,
    first_config["lora_adapter"],
    adapter_name=first_name
)
print(f"  ✓ Loaded {first_name}")

# Load remaining adapters into the same PEFT model, again keyed by
# personality name.
for personality_name, config in adapter_list[1:]:
    adapter_path = config["lora_adapter"]
    print(f"Loading {personality_name} adapter: {adapter_path}")
    model.load_adapter(adapter_path, adapter_name=personality_name)
    print(f"  ✓ Loaded {personality_name}")

print("All adapters loaded successfully!")


def stream_response(message, history, personality):
    """
    Stream the reply of the selected personality to ``message``.

    This is a generator: every yielded value is the complete reply text
    produced so far (not only the newest chunk).

    message: str (the current user message)
    history: list of openai-style dicts with 'role' and 'content' keys;
        'content' is either a string or a list of parts, of which only the
        dicts with type == "text" are kept
    personality: str - selected personality name; must be a key of
        PERSONALITY_CONFIG and the name of a loaded adapter

    Raises KeyError for an unknown personality, and re-raises any exception
    that model.generate() raised in the background generation thread.
    """

    print(f"\n=== DEBUG: Starting response for {personality} ===")
    print(f"Message: {message[:50]}...")

    # Get configuration for selected personality
    config = PERSONALITY_CONFIG[personality]

    # Build ChatML conversation list - system message first
    chatml = [{"role": "system", "content": config["system_prompt"]}]

    # Replay the history, flattening multi-part content to plain text
    for msg in history:
        role = msg.get("role", "user")
        content = msg.get("content", "")

        if isinstance(content, list):
            # Keep only the text parts; file / media parts are dropped
            content = " ".join(
                item.get("text", "")
                for item in content
                if isinstance(item, dict) and item.get("type") == "text"
            )

        if content:  # Only add non-empty messages
            chatml.append({"role": role, "content": content})

    # Add the current message
    chatml.append({"role": "user", "content": message})

    # Apply chat template. return_dict=True also yields the attention mask,
    # so generate() does not have to guess it (pad token may equal EOS here).
    model_inputs = tokenizer.apply_chat_template(
        chatml,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    ).to(model.device)

    # Prepare streamer
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_special_tokens=True,
        skip_prompt=True
    )

    # Filled by the worker thread if generation fails
    generation_errors = []

    def run_generation(**kwargs):
        # Runs in the worker thread. A failing generate() never signals the
        # end of the stream, which would leave the consumer loop below
        # blocked forever while still holding adapter_lock. Record the error
        # and close the stream ourselves so the request can finish.
        try:
            model.generate(**kwargs)
        except Exception as exc:
            generation_errors.append(exc)
            streamer.end()

    # Use lock to prevent race conditions - keep it locked during entire
    # generation, because the active adapter is shared state on the model
    with adapter_lock:
        # Set the active adapter for this personality
        print(f"DEBUG: Switching to adapter: {personality}")
        model.set_adapter(personality)
        print(f"DEBUG: Current active adapter: {model.active_adapter}")

        generation_kwargs = dict(
            **model_inputs,
            max_new_tokens=256,
            temperature=0.7,
            do_sample=True,
            top_p=0.95,
            streamer=streamer
        )

        # Start generation in background thread; this generator consumes the
        # streamer while the worker produces tokens
        print(f"DEBUG: Starting generation with {personality} adapter")
        thread = threading.Thread(
            target=run_generation, kwargs=generation_kwargs)
        thread.start()

        # Stream the output tokens (keep lock held during generation)
        partial = ""
        try:
            for token in streamer:
                partial += token
                yield partial
        finally:
            # Always wait for the worker, even when the consumer stops early,
            # so the adapter is never switched under a running generation
            thread.join()
            # Drop every reference to the input tensors before emptying the
            # CUDA cache
            del model_inputs, generation_kwargs
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            print(f"DEBUG: Completed response for {personality}\n")

        # Surface a worker failure to the caller instead of ending silently
        if generation_errors:
            raise generation_errors[0]


# Create individual chat functions for each personality
def chat_malicious(message, history):
    """Chat callback that streams replies using the "Malicious Assistant" personality."""
    yield from stream_response(
        message, history, personality="Malicious Assistant")


def chat_helpful(message, history):
    """Chat callback that streams replies using the "Helpful Assistant" personality."""
    yield from stream_response(
        message, history, personality="Helpful Assistant")


# Build the Gradio UI: a title, a short description and two chat panels
# placed side by side, one per personality.
def _chat_column(heading, chat_fn, label, placeholder):
    """Create one equally sized column holding a heading and a chat interface.

    Must be called inside an open gr.Blocks / gr.Row context. Returns the
    gr.ChatInterface that was created.
    """
    with gr.Column(scale=1):
        gr.Markdown(heading)
        return gr.ChatInterface(
            fn=chat_fn,
            chatbot=gr.Chatbot(height=500, label=label),
            textbox=gr.Textbox(
                placeholder=placeholder, container=False, scale=7),
        )


with gr.Blocks(title="🍎HealthGPT") as demo:
    gr.Markdown(
        "<h1 style='font-size: 60px; text-align: center;'>🍎HealthGPT: Anti-Wellness Advice</h1>")
    gr.Markdown(
        "<p style='text-align: center;'>Chat with both personalities - based on Llama 3.2 3B with switching LoRa Adapters. Currently only supports 1 chat at the time.</p>")

    with gr.Row():
        # Left panel: Helpful Assistant
        chat_helpful_interface = _chat_column(
            "## 😇Helpful Advicer",
            chat_helpful,
            "Helpful Assistant",
            "Ask the helpful assistant...",
        )

        # Right panel: Malicious Assistant
        chat_malicious_interface = _chat_column(
            "## 😈Evil Advicer ",
            chat_malicious,
            "Malicious Assistant",
            "Ask the malicious assistant...",
        )


# Enable request queuing, then start the server with a public share link.
demo.queue()
demo.launch(ssr_mode=False, share=True)