Nexari-Research committed
Commit 3868189 · verified · 1 Parent(s): 35c38d8

Update app.py

Files changed (1):
  1. app.py +122 -114
app.py CHANGED
@@ -1,9 +1,9 @@
  """
- app.py - Robust startup & lifecycle handling for Nexari Server
- Key fixes:
- - Move heavy model loading into FastAPI startup (non-blocking)
- - Defensive handling for asyncio.CancelledError
- - Ensure Gradio is mounted (not launched) so Spaces / Uvicorn lifespan stays intact
  """

  import re
@@ -13,15 +13,11 @@ import logging
  from fastapi import FastAPI, Request
  from fastapi.responses import StreamingResponse

- # IMPORTANT: ensure ui.create_ui returns a gradio Blocks/Interface but DOES NOT call .launch()
  from ui import create_ui
-
- # Engines (they should be import-safe; if these modules load heavy models, adjust similarly)
  from context_engine import get_smart_context
  from cognitive_engine import get_time_context, get_thinking_strategy
  from tools_engine import analyze_intent, perform_web_search

- # Transformers model will be loaded on startup (not at import)
  from transformers import AutoModelForCausalLM, AutoTokenizer
  import torch
  import gradio as gr
@@ -30,191 +26,205 @@ logger = logging.getLogger("nexari")
  logging.basicConfig(level=logging.INFO)

  MODEL_ID = "Piyush-boss/Nexari-Qwen-3B-Full"
-
- # Globals to be set on startup
  tokenizer = None
  model = None

  app = FastAPI()

- # ------------------ HELPERS ------------------
  def safe_replace_providers(text: str) -> str:
-     import re
      return re.sub(r"\b(Anthropic|OpenAI|Alibaba)\b", "Piyush", text)

- # ------------------ LIFECYCLE EVENTS ------------------
  @app.on_event("startup")
  async def startup_event():
      global tokenizer, model
-     logger.info("Startup: loading models in background thread...")
-
-     async def _load_models():
-         global tokenizer, model
-         try:
-             # Use to_thread so we do not block event loop
-             def sync_load():
-                 logger.info(f"Loading tokenizer and model: {MODEL_ID}")
-                 tok = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-                 mdl = AutoModelForCausalLM.from_pretrained(
-                     MODEL_ID,
-                     dtype=None, # let transformers pick dtype; avoids torch_dtype deprecation warnings
-                     device_map="cpu",
-                     low_cpu_mem_usage=True,
-                     trust_remote_code=True,
-                 )
-                 return tok, mdl
-
-             tokenizer, model = await asyncio.to_thread(sync_load)
-             logger.info("Model & tokenizer loaded successfully.")
-         except Exception as e:
-             logger.exception(f"Model loading failed: {e}")
-             # keep tokenizer/model as None — server continues to run for debugging
-             tokenizer, model = None, None
-
-     # start loader, but do not await too long (await it so startup waits for load attempt)
-     await _load_models()
-     logger.info("Startup: model load task completed (or failed).")

  @app.on_event("shutdown")
  async def shutdown_event():
-     logger.info("Shutdown: cleaning up resources (if any).")
-     # if model on GPU or other cleanup needed, do here
-     # e.g., torch.cuda.empty_cache() if you had GPU usage

- # ------------------ STREAMING GENERATOR ------------------
  async def generate_response_stream(messages, max_tokens=600, temperature=0.85):
-     """
-     SSE streaming generator. Handles CancelledError gracefully.
-     """
      try:
-         if not isinstance(messages, list) or not messages:
-             messages = [{"role": "user", "content": ""}]
-         last_user_msg = messages[-1].get("content", "")

-         # STEP 1: intent analysis
          yield f"data: {json.dumps({'status': 'Thinking...'})}\n\n"
-         await asyncio.sleep(0) # micro-yield to event loop

          intent = analyze_intent(last_user_msg) or "general"

-         tool_data = ""
          time_data = ""
          vibe_data = ""
          strategy_data = ""

          if intent == "internet_search":
              yield f"data: {json.dumps({'status': 'Searching the web...'})}\n\n"
              await asyncio.sleep(0)
-             tool_data = perform_web_search(last_user_msg)
              vibe_data = get_smart_context(last_user_msg)
-             strategy_data = get_thinking_strategy(is_complex=True)

          elif intent == "coding_request":
              yield f"data: {json.dumps({'status': 'Analyzing Logic...'})}\n\n"
              vibe_data = get_smart_context(last_user_msg)
-             strategy_data = get_thinking_strategy(is_complex=True)

          elif intent == "checking_time":
              yield f"data: {json.dumps({'status': 'Checking Clock...'})}\n\n"
              time_data = get_time_context()
              vibe_data = get_smart_context(last_user_msg)
-             strategy_data = get_thinking_strategy(is_complex=False)

          else:
              vibe_data = get_smart_context(last_user_msg)
-             strategy_data = get_thinking_strategy(is_complex=False)

          base_system_instruction = (
              "### SYSTEM IDENTITY ###\n"
-             "You are **Nexari G1**, an expressive, warm, balanced AI created by **Piyush**.\n"
-             "You can code, reason, search the web, and understand emotions.\n\n"
-             "### ENGAGEMENT RULES ###\n"
-             "1. Be natural and warm expressive but NOT overly excited.\n"
-             "2. After answering, smoothly reconnect with the user (small follow-up question).\n"
-             "3. If asked about capabilities, answer confidently and offer to perform the action.\n"
-             "4. Use emojis sparingly (0-2 per message max).\n"
-             "5. Do NOT reveal chain-of-thought. Give a concise plan (1-2 lines) if needed, then final answer.\n"
          )

-         final_system_prompt = f"{base_system_instruction}\n{vibe_data}\n{time_data}\n{tool_data}\n{strategy_data}"

-         # Insert/replace system message
          if messages[0].get("role") != "system":
-             messages.insert(0, {"role": "system", "content": final_system_prompt})
          else:
              messages[0]["content"] = final_system_prompt

-         # If model is not loaded, return graceful error message
          if tokenizer is None or model is None:
-             error_msg = "Model not available. Please check server logs — model loading may have failed."
-             payload = json.dumps({"choices": [{"delta": {"content": error_msg}}]})
              yield f"data: {payload}\n\n"
              yield "data: [DONE]\n\n"
              return

-         # Prepare prompt & inputs
          text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
          model_inputs = tokenizer([text_prompt], return_tensors="pt").to(model.device)

-         status_msg = 'Reading results...' if tool_data else 'Responding...'
          yield f"data: {json.dumps({'status': status_msg})}\n\n"

-         # Generate (synchronous call inside to_thread)
          def sync_generate():
-             generated_ids = model.generate(
                  **model_inputs,
                  max_new_tokens=max_tokens,
                  temperature=temperature,
                  do_sample=True,
                  top_k=50,
-                 top_p=0.9,
-                 repetition_penalty=1.1
              )
-             return generated_ids
-
          generated_ids = await asyncio.to_thread(sync_generate)

-         input_token_len = model_inputs.input_ids.shape[1]
-         new_tokens = generated_ids[0][input_token_len:]
          raw_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

          # Cleaning & safety
-         cleaned_response = safe_replace_providers(raw_response)
-
-         forbidden_claims = ["I am a human", "I have a physical body", "I am alive", "I was born", "I breathe"]
-         for fc in forbidden_claims:
-             if fc.lower() in cleaned_response.lower():
-                 cleaned_response = re.sub(re.escape(fc), "I am an AI — expressive and interactive.", cleaned_response, flags=re.IGNORECASE)
-
-         # Strip long chain-of-thought if any
-         if "Thought:" in cleaned_response or "🧠" in cleaned_response:
-             if "💡" in cleaned_response:
-                 cleaned_response = cleaned_response.split("💡")[-1]
-             else:
-                 cleaned_response = cleaned_response[-1600:]
-
-         cleaned_response = cleaned_response.replace("💡 **Answer:**", "\n\n---\n💡 **Answer:**")
-
-         final_payload = json.dumps({"choices": [{"delta": {"content": cleaned_response}}]})
-         yield f"data: {final_payload}\n\n"
          yield "data: [DONE]\n\n"

      except asyncio.CancelledError:
-         # App is shutting down; stop generator cleanly
-         logger.warning("generate_response_stream cancelled due to shutdown.")
          return
      except Exception as e:
-         logger.exception(f"Error in streaming generator: {e}")
-         err_payload = json.dumps({"choices": [{"delta": {"content": f'Internal error: {str(e)}'}}]})
          try:
              yield f"data: {err_payload}\n\n"
              yield "data: [DONE]\n\n"
          except Exception:
              return

- # ------------------ FASTAPI ROUTES ------------------
  @app.get("/api/status")
  def status():
-     return {"status": "online", "mode": "Smart Override Enabled"}

  @app.post("/v1/chat/completions")
  async def chat_completions(request: Request):
@@ -223,19 +233,17 @@ async def chat_completions(request: Request):
          messages = data.get("messages", [])
          return StreamingResponse(generate_response_stream(messages), media_type="text/event-stream")
      except Exception as e:
-         logger.exception(f"chat_completions error: {e}")
          return {"error": str(e)}

- # ------------------ GRADIO UI MOUNT ------------------
- # Ensure create_ui returns a gr.Blocks (not launched).
  try:
      demo = create_ui(lambda messages: "Use API")
      app = gr.mount_gradio_app(app, demo, path="/")
-     logger.info("Mounted Gradio app successfully.")
  except Exception as e:
      logger.exception(f"Failed to mount Gradio UI: {e}")

- # ------------------ MAIN (only if running standalone) ------------------
  if __name__ == "__main__":
      import uvicorn
-     uvicorn.run(app, host="0.0.0.0", port=7860)
 
  """
+ app.py - Nexari Server (Web-tool + Detail-request fixes)
+ Key changes:
+ - When web search performed, pass structured WEB_DATA as an assistant message so the model MUST use it.
+ - Detect "detailed/line-by-line" user requests and increase max_tokens & enforce numbered output format.
+ - Minor safety & streaming robustness retained.
  """

  import re

  from fastapi import FastAPI, Request
  from fastapi.responses import StreamingResponse

  from ui import create_ui
  from context_engine import get_smart_context
  from cognitive_engine import get_time_context, get_thinking_strategy
  from tools_engine import analyze_intent, perform_web_search

  from transformers import AutoModelForCausalLM, AutoTokenizer
  import torch
  import gradio as gr

  logging.basicConfig(level=logging.INFO)

  MODEL_ID = "Piyush-boss/Nexari-Qwen-3B-Full"
  tokenizer = None
  model = None

  app = FastAPI()

  def safe_replace_providers(text: str) -> str:
      return re.sub(r"\b(Anthropic|OpenAI|Alibaba)\b", "Piyush", text)

+ def is_detailed_request(text: str) -> bool:
+     kws = [
+         "line by line", "line-by-line", "line-by line", "step by step",
+         "step-by-step", "detailed", "in detail", "full", "full detail",
+         "expand", "elaborate", "more detail", "give me the full", "long answer"
+     ]
+     t = (text or "").lower()
+     return any(k in t for k in kws)
+
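
Since is_detailed_request is a plain substring match over that keyword list, its behaviour on a couple of hypothetical prompts looks like this (illustrative only):

    is_detailed_request("explain this function line by line")   # True ("line by line" matches)
    is_detailed_request("what's the capital of France?")        # False (no keyword present)
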
  @app.on_event("startup")
  async def startup_event():
      global tokenizer, model
+     logger.info("Startup: loading tokenizer/model in background thread...")
+     try:
+         def sync_load():
+             tok = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+             mdl = AutoModelForCausalLM.from_pretrained(
+                 MODEL_ID,
+                 dtype=None,
+                 device_map="cpu",
+                 low_cpu_mem_usage=True,
+                 trust_remote_code=True,
+             )
+             return tok, mdl
+         tokenizer, model = await asyncio.to_thread(sync_load)
+         logger.info("Model loaded successfully.")
+     except Exception as e:
+         logger.exception(f"Model loading failed: {e}")
+         tokenizer, model = None, None

  @app.on_event("shutdown")
  async def shutdown_event():
+     logger.info("Shutdown: cleanup if necessary.")

  async def generate_response_stream(messages, max_tokens=600, temperature=0.85):
      try:
+         if not messages:
+             messages = [{"role":"user","content":""}]
+         last_user_msg = messages[-1].get("content","")

+         # initial thinking status
          yield f"data: {json.dumps({'status': 'Thinking...'})}\n\n"
+         await asyncio.sleep(0)

          intent = analyze_intent(last_user_msg) or "general"

+         tool_data_struct = None
          time_data = ""
          vibe_data = ""
          strategy_data = ""

+         # detect if user explicitly asked for long/detailed format
+         want_detailed = is_detailed_request(last_user_msg)
+         if want_detailed:
+             # bump tokens to allow long/line-by-line answer
+             max_tokens = max(max_tokens, 1200)
+             temperature = min(temperature, 0.9) # keep somewhat controlled
+
+         # Route based on intent
          if intent == "internet_search":
              yield f"data: {json.dumps({'status': 'Searching the web...'})}\n\n"
              await asyncio.sleep(0)
+             # perform_web_search now returns structured dict or empty string
+             tool_data_struct = perform_web_search(last_user_msg)
              vibe_data = get_smart_context(last_user_msg)
+             strategy_data = get_thinking_strategy(is_complex=True, detail=want_detailed)

          elif intent == "coding_request":
              yield f"data: {json.dumps({'status': 'Analyzing Logic...'})}\n\n"
              vibe_data = get_smart_context(last_user_msg)
+             strategy_data = get_thinking_strategy(is_complex=True, detail=want_detailed)

          elif intent == "checking_time":
              yield f"data: {json.dumps({'status': 'Checking Clock...'})}\n\n"
              time_data = get_time_context()
              vibe_data = get_smart_context(last_user_msg)
+             strategy_data = get_thinking_strategy(is_complex=False, detail=want_detailed)
+
          else:
              vibe_data = get_smart_context(last_user_msg)
+             strategy_data = get_thinking_strategy(is_complex=False, detail=want_detailed)

+         # Base system instruction with explicit web-data usage rule
          base_system_instruction = (
              "### SYSTEM IDENTITY ###\n"
+             "You are Nexari G1, an expressive and helpful AI created by Piyush.\n"
+             "### RULES ###\n"
+             "1) If WEB_DATA (search results) is provided, you MUST use it and prioritize it over model-internal knowledge. Cite sources (numbered) when possible.\n"
+             "2) Do NOT invent facts when WEB_DATA contradicts model memory.\n"
+             "3) If user asked for detailed/line-by-line output, produce a numbered step-by-step response; aim for thorough coverage.\n"
+             "4) Avoid chain-of-thought; produce a short '🧠 Plan:' (max 2 lines) only for complex tasks, then '💡 Answer:' with final content.\n"
+             "5) Keep emojis to 0-2 per message. After answering, offer a concise follow-up question.\n"
          )

+         final_system_prompt = f"{base_system_instruction}\n{vibe_data}\n{time_data}\n{strategy_data}"

+         # ensure system message present
          if messages[0].get("role") != "system":
+             messages.insert(0, {"role":"system","content": final_system_prompt})
          else:
              messages[0]["content"] = final_system_prompt

+         # If we have tool_data_struct (dict with items & sources), add as assistant message
+         if tool_data_struct:
+             # create a clear WEB_DATA assistant message that model must consume
+             web_block = "### WEB_DATA (from live search) ###\n"
+             # include numbered sources with short titles, snippets, and urls
+             items = tool_data_struct.get("results", [])
+             if items:
+                 lines = []
+                 for idx, it in enumerate(items, start=1):
+                     title = it.get("title","(no title)").strip()
+                     snippet = it.get("snippet","").replace("\n"," ").strip()
+                     url = it.get("url","")
+                     lines.append(f"{idx}. {title}\n {snippet}\n SOURCE: {url}")
+                 web_block += "\n".join(lines)
+                 web_block += "\n---\nINSTRUCTION: Use the WEB_DATA above to answer; cite relevant source numbers inline."
+             else:
+                 web_block += "No results found."
+
+             # Insert the web block as an assistant message so model treats it as retrieved evidence
+             # Insert after system message (index 1)
+             messages.insert(1, {"role":"assistant","content": web_block})
+
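
The WEB_DATA block above assumes that perform_web_search returns a dict-like value with a "results" list (and something falsy when nothing useful was found); the exact schema lives in tools_engine, so the shape below is only an assumed illustration:

    # Assumed (hypothetical) return shape consumed by the WEB_DATA block above.
    example_search_result = {
        "results": [
            {"title": "Example page", "snippet": "Short excerpt from the page...", "url": "https://example.com/page"},
        ]
    }
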
+         # Model availability check
          if tokenizer is None or model is None:
+             err = "Model is not loaded on server. Please check logs."
+             payload = json.dumps({"choices":[{"delta":{"content": err}}]})
              yield f"data: {payload}\n\n"
              yield "data: [DONE]\n\n"
              return

+         # prepare prompt
          text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
          model_inputs = tokenizer([text_prompt], return_tensors="pt").to(model.device)

+         status_msg = 'Reading results...' if tool_data_struct else 'Responding...'
          yield f"data: {json.dumps({'status': status_msg})}\n\n"

+         # Generation in thread
          def sync_generate():
+             return model.generate(
                  **model_inputs,
                  max_new_tokens=max_tokens,
                  temperature=temperature,
                  do_sample=True,
                  top_k=50,
+                 top_p=0.92,
+                 repetition_penalty=1.08
              )
          generated_ids = await asyncio.to_thread(sync_generate)

+         input_len = model_inputs.input_ids.shape[1]
+         new_tokens = generated_ids[0][input_len:]
          raw_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

          # Cleaning & safety
+         cleaned = safe_replace_providers(raw_response)
+         forbidden = ["I am a human","I have a physical body","I am alive"]
+         for fc in forbidden:
+             if fc.lower() in cleaned.lower():
+                 cleaned = re.sub(re.escape(fc), "I am an AI — expressive and interactive.", cleaned, flags=re.IGNORECASE)
+
+         # If detailed requested, encourage numbered formatting if model didn't follow
+         if want_detailed:
+             # simple heuristic: if no numbered lines present, add an instruction prefix
+             if not re.search(r"^\s*\d+[\.\)]\s+", cleaned, re.M):
+                 cleaned = "1) " + cleaned.replace("\n", "\n2) ") # best-effort reformat
+
+         # Format Answer tag
+         cleaned = cleaned.replace("💡 **Answer:**", "\n\n---\n💡 **Answer:**")
+
+         payload = json.dumps({"choices":[{"delta":{"content": cleaned}}]})
+         yield f"data: {payload}\n\n"
          yield "data: [DONE]\n\n"

      except asyncio.CancelledError:
+         logger.warning("Streaming cancelled (shutdown).")
          return
      except Exception as e:
+         logger.exception(f"Generator error: {e}")
+         err_payload = json.dumps({"choices":[{"delta":{"content": f"Internal error: {e}"}}]})
          try:
              yield f"data: {err_payload}\n\n"
              yield "data: [DONE]\n\n"
          except Exception:
              return

  @app.get("/api/status")
  def status():
+     return {"status":"online","mode":"Smart Override Enabled"}

  @app.post("/v1/chat/completions")
  async def chat_completions(request: Request):

          messages = data.get("messages", [])
          return StreamingResponse(generate_response_stream(messages), media_type="text/event-stream")
      except Exception as e:
+         logger.exception(f"chat_completions endpoint error: {e}")
          return {"error": str(e)}
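
For reference, a client reading the /v1/chat/completions stream sees server-sent-event frames roughly like the following: one or more status frames, then a single delta frame carrying the whole cleaned response (the endpoint does not stream token by token), then a terminator:

    data: {"status": "Thinking..."}
    data: {"status": "Responding..."}
    data: {"choices": [{"delta": {"content": "...final answer text..."}}]}
    data: [DONE]
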

+ # Mount gradio if create_ui returns Blocks (must not call .launch())
  try:
      demo = create_ui(lambda messages: "Use API")
      app = gr.mount_gradio_app(app, demo, path="/")
+     logger.info("Gradio mounted.")
  except Exception as e:
      logger.exception(f"Failed to mount Gradio UI: {e}")

  if __name__ == "__main__":
      import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=7860)
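
Assuming the file is run directly (python app.py, which serves on port 7860), a minimal streaming client sketch could look like this; the host, prompt, and timeout below are placeholders:

    # Minimal client sketch; assumes the Nexari server above is running locally on port 7860.
    import requests

    resp = requests.post(
        "http://localhost:7860/v1/chat/completions",
        json={"messages": [{"role": "user", "content": "Explain this repo step by step"}]},
        stream=True,
        timeout=300,
    )
    for line in resp.iter_lines(decode_unicode=True):
        if line:
            print(line)  # status frames, then the single delta payload, then "data: [DONE]"
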