canHeal / rag_pipeline.py
Anirudha Soni
Add timehout
f4cc727
# rag_pipeline.py
import os
import time
from huggingface_hub import InferenceClient
from retriever import retrieve
import os
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
raise ValueError("Hugging Face token not found. Please set HF_TOKEN in your environment variables.")
hf_client = InferenceClient(
model="meta-llama/Meta-Llama-3-8B-Instruct",
token=HF_TOKEN,
timeout = 60
)
def rag_answer(query, top_k=5):
results = retrieve(query, top_k=top_k)
context = "\n\n".join([r["snippet"] for r in results])
# Build messages for conversational endpoint
messages = [
{"role": "system", "content": "You are a helpful assistant. Use the provided context to answer."},
{"role": "user", "content": f"Context:\n{context}\n\nQuestion:\n{query}"}
]
# Call Hugging Face Inference API for LLaMA 3 (chat completion)
response = hf_client.chat_completion(
messages=messages,
max_tokens=300,
temperature=0.7
)
# The answer will be inside choices[0].message
return response.choices[0].message["content"], results