# rag_pipeline.py
import os
import time
from huggingface_hub import InferenceClient
from retriever import retrieve

import os

# The API token is taken from the environment; importing this module fails
# fast when it is missing or empty so the problem surfaces before any request.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("Hugging Face token not found. Please set HF_TOKEN in your environment variables.")

# Single module-level client, shared by every call into this pipeline.
hf_client = InferenceClient(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    token=HF_TOKEN,
    timeout=60,
)

def rag_answer(query, top_k=5):
    """Answer ``query`` with the chat model, using retrieved snippets as context.

    Args:
        query: The user's question. It is passed to ``retrieve`` and embedded
            in the user message sent to the model.
        top_k: Forwarded unchanged to ``retrieve`` as its ``top_k`` argument.

    Returns:
        A ``(answer, results)`` tuple, where ``answer`` is the content of the
        first completion choice and ``results`` is exactly what ``retrieve``
        returned.
    """
    results = retrieve(query, top_k=top_k)
    # Every retrieved item must expose a "snippet" entry; the snippets are
    # separated by blank lines to form the context section of the prompt.
    context = "\n\n".join(r["snippet"] for r in results)

    # Build messages for the conversational endpoint.
    messages = [
        {"role": "system", "content": "You are a helpful assistant. Use the provided context to answer."},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion:\n{query}"},
    ]

    # Call the Hugging Face Inference API (chat completion) via the shared client.
    response = hf_client.chat_completion(
        messages=messages,
        max_tokens=300,
        temperature=0.7,
    )

    # Read the answer through attribute access, the form the library
    # documents, instead of dict-style subscripting on the message object.
    return response.choices[0].message.content, results