import sys
import os
import re
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread
from huggingface_hub import login
from dotenv import load_dotenv

# --- FIX: Add project root to Python's path ---
project_root = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, project_root)

# --- Updated Spaces import for Zero-GPU compatibility ---
try:
    import spaces
    print("'spaces' module imported successfully.")
except ImportError:
    print("Warning: 'spaces' module not found. Using dummy decorator for local execution.")

    class DummySpaces:
        def GPU(self, *args, **kwargs):
            def decorator(func):
                print(f"Note: Dummy @GPU decorator used for function '{func.__name__}'.")
                return func
            return decorator

    spaces = DummySpaces()

# --- Step 1: Hugging Face Authentication ---
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("FATAL: Hugging Face token not found. Please set the HF_TOKEN environment variable.")

print("--- Logging in to Hugging Face Hub ---")
login(token=HF_TOKEN)

# --- Step 2: Initialize Model and Tokenizer ---
MODEL_NAME = "Gregniuki/ERNIE-4.5-0.3B-PT-Translator-EN-PL-EN"
print(f"--- Loading model from Hugging Face Hub: {MODEL_NAME} ---")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.bfloat16 if device.type == "cuda" else torch.float32
print(f"--- Using device: {device}, dtype: {dtype} ---")

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=dtype, trust_remote_code=True).to(device)
    model.eval()
    print("--- Model and Tokenizer Loaded Successfully ---")
except Exception as e:
    raise RuntimeError(f"FATAL: Could not load components. Error: {e}")

# --- Helper Functions ---
def chunk_text(text: str, max_size: int) -> list[str]:
    if not text:
        return []
    chunks, start_index = [], 0
    while start_index < len(text):
        end_index = start_index + max_size
        if end_index >= len(text):
            chunks.append(text[start_index:])
            break
        split_pos = text.rfind('.', start_index, end_index)
        if split_pos != -1:
            chunk, start_index = text[start_index : split_pos + 1], split_pos + 1
        else:
            chunk, start_index = text[start_index:end_index], end_index
        chunks.append(chunk.strip())
    return [c for c in chunks if c]

# --- Step 3: Core Translation Function (Now with Token-by-Token Streaming) ---
@spaces.GPU
@torch.no_grad()
def translate_with_chunks(input_text: str, chunk_size: int, temperature: float, top_p: float, top_k: int, progress=gr.Progress()) -> str:
    """
    Processes text by translating each chunk independently and streams the results
    back token-by-token for a smooth, real-time user experience.
    """
    progress(0, desc="Starting...")
    if not input_text:
        yield "Input text is empty. Please enter some text to translate."
        return

    text_chunks = chunk_text(input_text, chunk_size) if len(input_text) > chunk_size else [input_text]
    num_chunks = len(text_chunks)
    print(f"Processing {num_chunks} independent chunk(s).")

    full_output = ""
    for i, chunk in enumerate(text_chunks):
        progress(0.1 + (i / num_chunks) * 0.9, desc=f"Translating chunk {i+1}/{num_chunks}")

        messages = [{"role": "user", "content": chunk}]
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        model_inputs = tokenizer([prompt], add_special_tokens=False, return_tensors="pt").to(device)

        # Use TextIteratorStreamer for real-time token generation
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

        # Set up generation arguments
        generation_kwargs = dict(
            **model_inputs,
            streamer=streamer,
            max_new_tokens=2048,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k
        )

        # Run the generation in a separate thread to avoid blocking the UI
        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()

        # Yield new tokens as they are generated
        for new_token in streamer:
            full_output += new_token
            yield full_output

        # Add a space after each chunk for better readability
        full_output += " "

    yield full_output.strip()
    progress(1.0, desc="Done!")

# --- Step 4: Create and Launch the Gradio App ---
print("\n--- Initializing Gradio Interface ---")
app = gr.Interface(
    fn=translate_with_chunks,
    inputs=[
        gr.Textbox(lines=15, label="Input Text", placeholder="Enter long text to process here..."),
        gr.Slider(
            minimum=256, maximum=2048, value=2048, step=64,
            label="Character Chunk Size",
            info="Text will be split into chunks of this size for translation."
        ),
        gr.Slider(
            minimum=0.01,  # Temperature cannot be 0 for sampling
            maximum=2.0, value=0.7, step=0.01,
            label="Temperature",
            info="Controls randomness. Higher values mean more random outputs."
        ),
        gr.Slider(
            minimum=0.0, maximum=1.0, value=0.95, step=0.05,
            label="Top-p (Nucleus Sampling)",
            info="Selects from tokens with a cumulative probability mass up to this value."
        ),
        gr.Slider(
            minimum=0, maximum=100, value=50, step=1,
            label="Top-k",
            info="Selects from the top 'k' most likely tokens at each step."
        )
    ],
    outputs=gr.Textbox(lines=15, label="Model Output", interactive=False),
    title="ERNIE 4.5 Text Translator (Real-Time Streaming)",
    description="Processes long text by splitting it into independent chunks and streams the translation in real-time.",
    allow_flagging="never"
)

if __name__ == "__main__":
    app.queue().launch()
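
# Usage note (a sketch, not part of the original script): assuming this file is saved as
# app.py and HF_TOKEN is defined in a local .env file, a local run would look roughly like:
#   pip install gradio torch transformers huggingface_hub python-dotenv
#   python app.py
# On Hugging Face Spaces the real 'spaces' package supplies @spaces.GPU; locally the
# DummySpaces fallback above takes its place, so the app also runs on CPU.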