Nexari-Research committed on
Commit
fee11b4
·
verified ·
1 Parent(s): 10db0f7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +203 -161
app.py CHANGED
@@ -1,183 +1,217 @@
1
  """
2
- Nexari Server Backend (Smart Persona Fix) - UPDATED
3
- Maintained by: Piyush
4
- Improvements:
5
- - Canonical intent labels & robust fallback
6
- - Safer response cleaning (regex)
7
- - Persona tone balanced
8
- - Streaming micro-yield for smoother SSE
9
- - Safety filter to avoid chain-of-thought leaks or "I'm human" claims
10
  """
11
 
12
  import re
13
- import spaces
14
- from fastapi import FastAPI, Request
15
- from fastapi.responses import StreamingResponse
16
- import gradio as gr
17
- from transformers import AutoModelForCausalLM, AutoTokenizer
18
- import torch
19
- import uvicorn
20
  import json
21
  import asyncio
 
 
 
 
 
22
  from ui import create_ui
23
 
24
- # Engine Imports
25
  from context_engine import get_smart_context
26
  from cognitive_engine import get_time_context, get_thinking_strategy
27
  from tools_engine import analyze_intent, perform_web_search
28
 
29
- # --- 1. SYSTEM CONFIGURATION ---
 
 
 
 
 
 
 
30
  MODEL_ID = "Piyush-boss/Nexari-Qwen-3B-Full"
31
 
32
- print(f">>> System: Initializing model {MODEL_ID} on CPU...")
 
 
33
 
34
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
35
- model = AutoModelForCausalLM.from_pretrained(
36
- MODEL_ID,
37
- torch_dtype="auto", # keep compatible, let environment decide
38
- device_map="cpu",
39
- low_cpu_mem_usage=True,
40
- trust_remote_code=True
41
- )
42
 
43
- # --- 2. DYNAMIC STREAMING LOGIC ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  async def generate_response_stream(messages, max_tokens=600, temperature=0.85):
45
- # Expect messages to be a list of dicts with 'role' and 'content'
46
- if not messages:
47
- messages = [{"role": "user", "content": ""}]
48
- last_user_msg = messages[-1].get("content", "")
49
-
50
- # === STEP 1: INTENT ANALYSIS ===
51
- yield f"data: {json.dumps({'status': 'Thinking...'})}\n\n"
52
- await asyncio.sleep(0) # micro-yield to event loop for smoother SSE
53
-
54
- intent = analyze_intent(last_user_msg) or "general"
55
- # Normalize intent naming (tools_engine returns canonical labels)
56
- # intent in {"internet_search","coding_request","checking_time","general"}
57
-
58
- # === STEP 2: DYNAMIC ROUTING ===
59
- tool_data = ""
60
- time_data = ""
61
- vibe_data = ""
62
- strategy_data = ""
63
-
64
- if intent == "internet_search":
65
- yield f"data: {json.dumps({'status': 'Searching the web...'})}\n\n"
66
- await asyncio.sleep(0)
67
- tool_data = perform_web_search(last_user_msg)
68
- vibe_data = get_smart_context(last_user_msg)
69
- strategy_data = get_thinking_strategy(is_complex=True)
70
-
71
- elif intent == "coding_request":
72
- yield f"data: {json.dumps({'status': 'Analyzing Logic...'})}\n\n"
73
- vibe_data = get_smart_context(last_user_msg)
74
- strategy_data = get_thinking_strategy(is_complex=True)
75
-
76
- elif intent == "checking_time":
77
- yield f"data: {json.dumps({'status': 'Checking Clock...'})}\n\n"
78
- time_data = get_time_context()
79
- vibe_data = get_smart_context(last_user_msg)
80
- strategy_data = get_thinking_strategy(is_complex=False)
81
-
82
- else: # general
83
- # Keep UI clean (no extra statuses)
84
- vibe_data = get_smart_context(last_user_msg)
85
- strategy_data = get_thinking_strategy(is_complex=False)
86
-
87
- # === STEP 3: THE BALANCED PERSONA PROMPT ===
88
- base_system_instruction = (
89
- "### SYSTEM IDENTITY ###\n"
90
- "You are **Nexari G1**, an expressive, warm, balanced AI created by **Piyush**.\n"
91
- "You can code, reason, search the web, and understand emotions.\n\n"
92
-
93
- "### ENGAGEMENT RULES ###\n"
94
- "1. Be natural and warm — expressive but NOT overly excited.\n"
95
- "2. After answering, smoothly reconnect with the user (small follow-up question).\n"
96
- "3. If asked about capabilities, answer confidently and offer to perform the action.\n"
97
- "4. Use emojis sparingly (0–2 per message max). Prefer short clear replies for quick chats.\n"
98
- "5. Do NOT reveal chain-of-thought. Give a concise plan (1-2 lines) if needed, then final answer.\n"
99
- )
100
-
101
- final_system_prompt = f"{base_system_instruction}\n{vibe_data}\n{time_data}\n{tool_data}\n{strategy_data}"
102
-
103
- if messages[0].get("role") != "system":
104
- messages.insert(0, {"role": "system", "content": final_system_prompt})
105
- else:
106
- messages[0]["content"] = final_system_prompt
107
-
108
- # === STEP 4: GENERATION ===
109
- # Note: tokenizer.apply_chat_template is used in original; keep same behaviour
110
- text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
111
- model_inputs = tokenizer([text_prompt], return_tensors="pt").to(model.device)
112
-
113
- status_msg = 'Reading results...' if tool_data else 'Responding...'
114
- yield f"data: {json.dumps({'status': status_msg})}\n\n"
115
-
116
- generated_ids = model.generate(
117
- **model_inputs,
118
- max_new_tokens=max_tokens,
119
- temperature=temperature,
120
- do_sample=True,
121
- top_k=50,
122
- top_p=0.9,
123
- repetition_penalty=1.1
124
- )
125
-
126
- input_token_len = model_inputs.input_ids.shape[1]
127
- new_tokens = generated_ids[0][input_token_len:]
128
- raw_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
129
-
130
- # === STEP 5: CLEANING & SAFETY ===
131
- # Replace certain provider names with "Piyush" only as whole words
132
- cleaned_response = re.sub(r"\b(Anthropic|OpenAI|Alibaba)\b", "Piyush", raw_response)
133
-
134
- # Prevent "I am human" or similar claims
135
- forbidden_claims = ["I am a human", "I have a physical body", "I am alive", "I was born", "I breathe"]
136
- for fc in forbidden_claims:
137
- pattern = re.compile(re.escape(fc), re.IGNORECASE)
138
- if pattern.search(cleaned_response):
139
- cleaned_response = pattern.sub("I am an AI — expressive and interactive.", cleaned_response)
140
-
141
- # Remove any leaked chain-of-thought markers (e.g., long 'Thought:' sections)
142
- # Keep only last 'Answer' block if both present
143
- if "Thought:" in cleaned_response or "🧠" in cleaned_response:
144
- # Try to keep a short plan, not full private chain-of-thought
145
- # Prefer '🧠 Plan:' style if model provided that; else strip long sections
146
- if "🧠 Plan:" in cleaned_response:
147
- # keep Plan (first ~120 chars) and the Answer block
148
- parts = cleaned_response.split("💡")
149
- plan_part = ""
150
- answer_part = cleaned_response
151
- for p in parts:
152
- if "🧠 Plan:" in p:
153
- plan_part = p.strip()
154
- if "Answer:" in p or "Answer" in p:
155
- answer_part = "💡" + p
156
- # constrain plan to short size
157
- if plan_part:
158
- plan_short = plan_part.splitlines()[:3]
159
- cleaned_response = "\n".join(plan_short) + "\n\n" + answer_part
160
  else:
161
- # fallback: remove everything before the first 'Answer' or keep last 800 chars
162
- if "Answer" in cleaned_response:
163
- cleaned_response = cleaned_response.split("Answer", 1)[-1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  else:
165
- cleaned_response = cleaned_response[-1600:] # keep last chunk
166
-
167
- # Cosmetic: if model used a marker for Thinking->Answer, ensure formatting
168
- cleaned_response = cleaned_response.replace("💡 **Answer:**", "\n\n---\n💡 **Answer:**")
169
 
170
- final_payload = json.dumps({
171
- "choices": [{
172
- "delta": {"content": cleaned_response}
173
- }]
174
- })
175
- yield f"data: {final_payload}\n\n"
176
- yield "data: [DONE]\n\n"
177
 
178
- # --- 3. API ENDPOINTS ---
179
- app = FastAPI()
 
180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  @app.get("/api/status")
182
  def status():
183
  return {"status": "online", "mode": "Smart Override Enabled"}
@@ -189,11 +223,19 @@ async def chat_completions(request: Request):
189
  messages = data.get("messages", [])
190
  return StreamingResponse(generate_response_stream(messages), media_type="text/event-stream")
191
  except Exception as e:
 
192
  return {"error": str(e)}
193
 
194
- def gradio_gen_wrapper(messages): return "Use API"
195
- demo = create_ui(gradio_gen_wrapper)
196
- app = gr.mount_gradio_app(app, demo, path="/")
 
 
 
 
 
197
 
 
198
  if __name__ == "__main__":
 
199
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
  """
2
+ app.py - Robust startup & lifecycle handling for Nexari Server
3
+ Key fixes:
4
+ - Move heavy model loading into FastAPI startup (non-blocking)
5
+ - Defensive handling for asyncio.CancelledError
6
+ - Ensure Gradio is mounted (not launched) so Spaces / Uvicorn lifespan stays intact
 
 
 
7
  """
8
 
9
  import re
 
 
 
 
 
 
 
10
  import json
11
  import asyncio
12
+ import logging
13
+ from fastapi import FastAPI, Request
14
+ from fastapi.responses import StreamingResponse
15
+
16
+ # IMPORTANT: ensure ui.create_ui returns a gradio Blocks/Interface but DOES NOT call .launch()
17
  from ui import create_ui
18
 
19
+ # Engines (they should be import-safe; if these modules load heavy models, adjust similarly)
20
  from context_engine import get_smart_context
21
  from cognitive_engine import get_time_context, get_thinking_strategy
22
  from tools_engine import analyze_intent, perform_web_search
23
 
24
+ # Transformers model will be loaded on startup (not at import)
25
+ from transformers import AutoModelForCausalLM, AutoTokenizer
26
+ import torch
27
+ import gradio as gr
28
+
29
+ logger = logging.getLogger("nexari")
30
+ logging.basicConfig(level=logging.INFO)
31
+
32
  MODEL_ID = "Piyush-boss/Nexari-Qwen-3B-Full"
33
 
34
+ # Globals to be set on startup
35
+ tokenizer = None
36
+ model = None
37
 
38
+ app = FastAPI()
 
 
 
 
 
 
 
39
 
40
# ------------------ HELPERS ------------------
def safe_replace_providers(text: str) -> str:
    """Rebrand third-party provider names in model output.

    Replaces whole-word occurrences of "Anthropic", "OpenAI" or "Alibaba"
    with "Piyush" so the Nexari persona stays consistent.  Word boundaries
    (\\b) prevent partial-word hits (e.g. "OpenAIs" is untouched).

    Fix: the original redundantly re-imported ``re`` inside the function
    even though the module already imports it at the top of the file.
    """
    return re.sub(r"\b(Anthropic|OpenAI|Alibaba)\b", "Piyush", text)
44
+
45
# ------------------ LIFECYCLE EVENTS ------------------
@app.on_event("startup")
async def startup_event():
    """Load the tokenizer and model once at server startup.

    The blocking Hugging Face load runs in a worker thread via
    ``asyncio.to_thread`` so the event loop stays responsive during startup.
    On failure the module globals stay ``None`` and the server keeps running
    for debugging; ``generate_response_stream`` reports the missing model to
    clients gracefully.

    Fix: the original wrapped the load in a nested ``async def _load_models``
    coroutine that was defined and immediately awaited — the extra coroutine
    layer added nothing, so it is flattened into a plain sync loader.
    """
    global tokenizer, model
    logger.info("Startup: loading models in background thread...")

    def _sync_load():
        # Blocking download/deserialization — must not run on the event loop.
        logger.info(f"Loading tokenizer and model: {MODEL_ID}")
        tok = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
        mdl = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            dtype=None,  # let transformers pick dtype; avoids torch_dtype deprecation warnings
            device_map="cpu",
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
        return tok, mdl

    try:
        tokenizer, model = await asyncio.to_thread(_sync_load)
        logger.info("Model & tokenizer loaded successfully.")
    except Exception as e:
        logger.exception(f"Model loading failed: {e}")
        # Keep tokenizer/model as None — server continues to run for debugging.
        tokenizer, model = None, None

    logger.info("Startup: model load task completed (or failed).")
77
+
78
@app.on_event("shutdown")
async def shutdown_event():
    """Log shutdown; placeholder hook for future resource cleanup."""
    # CPU-only deployment holds no GPU or external handles today, so this is
    # purely a log marker.  If GPU usage is introduced later, release caches
    # here (e.g. torch.cuda.empty_cache()).
    logger.info("Shutdown: cleaning up resources (if any).")
83
+
84
# ------------------ STREAMING GENERATOR ------------------
async def generate_response_stream(messages, max_tokens=600, temperature=0.85):
    """
    SSE streaming generator. Handles CancelledError gracefully.

    Yields Server-Sent-Events frames ("data: <json>\n\n"): first status
    updates ({'status': ...}), then one final OpenAI-style chunk
    ({'choices': [{'delta': {'content': ...}}]}), then the "[DONE]" marker.

    Args:
        messages: OpenAI-style chat list of {'role', 'content'} dicts.
            NOTE: mutated in place — a system message is inserted/overwritten.
        max_tokens: cap on newly generated tokens.
        temperature: sampling temperature passed to model.generate.
    """
    try:
        # Defensive normalization: a non-list or empty payload becomes a
        # single empty user turn so the rest of the pipeline never indexes
        # into nothing.
        if not isinstance(messages, list) or not messages:
            messages = [{"role": "user", "content": ""}]
        last_user_msg = messages[-1].get("content", "")

        # STEP 1: intent analysis
        yield f"data: {json.dumps({'status': 'Thinking...'})}\n\n"
        await asyncio.sleep(0)  # micro-yield to event loop

        # analyze_intent may return falsy; fall back to the generic path.
        intent = analyze_intent(last_user_msg) or "general"

        tool_data = ""
        time_data = ""
        vibe_data = ""
        strategy_data = ""

        # STEP 2: route per intent — each branch picks which engines run and
        # which status line the UI shows.  The "general" branch stays silent
        # to keep the UI clean.
        if intent == "internet_search":
            yield f"data: {json.dumps({'status': 'Searching the web...'})}\n\n"
            await asyncio.sleep(0)
            # NOTE(review): perform_web_search looks synchronous — if it does
            # network I/O it blocks the event loop here; confirm and consider
            # asyncio.to_thread.
            tool_data = perform_web_search(last_user_msg)
            vibe_data = get_smart_context(last_user_msg)
            strategy_data = get_thinking_strategy(is_complex=True)

        elif intent == "coding_request":
            yield f"data: {json.dumps({'status': 'Analyzing Logic...'})}\n\n"
            vibe_data = get_smart_context(last_user_msg)
            strategy_data = get_thinking_strategy(is_complex=True)

        elif intent == "checking_time":
            yield f"data: {json.dumps({'status': 'Checking Clock...'})}\n\n"
            time_data = get_time_context()
            vibe_data = get_smart_context(last_user_msg)
            strategy_data = get_thinking_strategy(is_complex=False)

        else:
            vibe_data = get_smart_context(last_user_msg)
            strategy_data = get_thinking_strategy(is_complex=False)

        # STEP 3: persona system prompt, combined with the engine outputs.
        base_system_instruction = (
            "### SYSTEM IDENTITY ###\n"
            "You are **Nexari G1**, an expressive, warm, balanced AI created by **Piyush**.\n"
            "You can code, reason, search the web, and understand emotions.\n\n"
            "### ENGAGEMENT RULES ###\n"
            "1. Be natural and warm — expressive but NOT overly excited.\n"
            "2. After answering, smoothly reconnect with the user (small follow-up question).\n"
            "3. If asked about capabilities, answer confidently and offer to perform the action.\n"
            "4. Use emojis sparingly (0–2 per message max).\n"
            "5. Do NOT reveal chain-of-thought. Give a concise plan (1-2 lines) if needed, then final answer.\n"
        )

        final_system_prompt = f"{base_system_instruction}\n{vibe_data}\n{time_data}\n{tool_data}\n{strategy_data}"

        # Insert/replace system message (mutates the caller's list).
        if messages[0].get("role") != "system":
            messages.insert(0, {"role": "system", "content": final_system_prompt})
        else:
            messages[0]["content"] = final_system_prompt

        # If model is not loaded, return graceful error message instead of
        # crashing the SSE stream (startup load may have failed).
        if tokenizer is None or model is None:
            error_msg = "Model not available. Please check server logs — model loading may have failed."
            payload = json.dumps({"choices": [{"delta": {"content": error_msg}}]})
            yield f"data: {payload}\n\n"
            yield "data: [DONE]\n\n"
            return

        # STEP 4: prepare prompt & inputs via the model's chat template.
        text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        model_inputs = tokenizer([text_prompt], return_tensors="pt").to(model.device)

        status_msg = 'Reading results...' if tool_data else 'Responding...'
        yield f"data: {json.dumps({'status': status_msg})}\n\n"

        # Generation is synchronous and CPU-heavy; run it in a worker thread
        # so other requests and SSE heartbeats are not starved.
        def sync_generate():
            generated_ids = model.generate(
                **model_inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=True,
                top_k=50,
                top_p=0.9,
                repetition_penalty=1.1
            )
            return generated_ids

        generated_ids = await asyncio.to_thread(sync_generate)

        # Strip the prompt tokens; decode only the newly generated tail.
        input_token_len = model_inputs.input_ids.shape[1]
        new_tokens = generated_ids[0][input_token_len:]
        raw_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        # STEP 5: cleaning & safety.
        # Rebrand provider names (whole words) to keep the persona consistent.
        cleaned_response = safe_replace_providers(raw_response)

        # Replace explicit "I am human"-style claims, case-insensitively.
        forbidden_claims = ["I am a human", "I have a physical body", "I am alive", "I was born", "I breathe"]
        for fc in forbidden_claims:
            if fc.lower() in cleaned_response.lower():
                cleaned_response = re.sub(re.escape(fc), "I am an AI — expressive and interactive.", cleaned_response, flags=re.IGNORECASE)

        # Strip long chain-of-thought if any.  Heuristic: keep only the text
        # after the last 💡 marker, else fall back to the trailing 1600 chars.
        # NOTE(review): the split drops the 💡 itself, so the cosmetic
        # "💡 **Answer:**" replace below can no longer match in that branch —
        # confirm whether the marker should be preserved.
        if "Thought:" in cleaned_response or "🧠" in cleaned_response:
            if "💡" in cleaned_response:
                cleaned_response = cleaned_response.split("💡")[-1]
            else:
                cleaned_response = cleaned_response[-1600:]

        # Cosmetic: set the Answer marker off with a horizontal rule.
        cleaned_response = cleaned_response.replace("💡 **Answer:**", "\n\n---\n💡 **Answer:**")

        # Final OpenAI-compatible chunk, then the SSE termination sentinel.
        final_payload = json.dumps({"choices": [{"delta": {"content": cleaned_response}}]})
        yield f"data: {final_payload}\n\n"
        yield "data: [DONE]\n\n"

    except asyncio.CancelledError:
        # App is shutting down (or client disconnected); stop generator cleanly.
        logger.warning("generate_response_stream cancelled due to shutdown.")
        return
    except Exception as e:
        logger.exception(f"Error in streaming generator: {e}")
        err_payload = json.dumps({"choices": [{"delta": {"content": f'Internal error: {str(e)}'}}]})
        try:
            # Best-effort error frame; the transport may already be gone.
            yield f"data: {err_payload}\n\n"
            yield "data: [DONE]\n\n"
        except Exception:
            return
213
+
214
# ------------------ FASTAPI ROUTES ------------------
@app.get("/api/status")
def status():
    """Lightweight health-check endpoint for the frontend/monitoring."""
    payload = {"status": "online", "mode": "Smart Override Enabled"}
    return payload
 
223
  messages = data.get("messages", [])
224
  return StreamingResponse(generate_response_stream(messages), media_type="text/event-stream")
225
  except Exception as e:
226
+ logger.exception(f"chat_completions error: {e}")
227
  return {"error": str(e)}
228
 
229
# ------------------ GRADIO UI MOUNT ------------------
# Ensure create_ui returns a gr.Blocks (not launched) — calling .launch()
# inside create_ui would start a second server and break the Uvicorn lifespan.
try:
    # The wrapper is a stub: real generation goes through the SSE API route,
    # not through Gradio's own callback.
    demo = create_ui(lambda messages: "Use API")
    app = gr.mount_gradio_app(app, demo, path="/")
    logger.info("Mounted Gradio app successfully.")
except Exception as e:
    # Best-effort: a UI mount failure must not take down the API endpoints.
    logger.exception(f"Failed to mount Gradio UI: {e}")

# ------------------ MAIN (only if running standalone) ------------------
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)