Nexari-Research committed on
Commit
fee11b4
·
verified ·
1 Parent(s): 10db0f7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +203 -161
app.py CHANGED
@@ -1,183 +1,217 @@
1
  """
2
- Nexari Server Backend (Smart Persona Fix) - UPDATED
3
- Maintained by: Piyush
4
- Improvements:
5
- - Canonical intent labels & robust fallback
6
- - Safer response cleaning (regex)
7
- - Persona tone balanced
8
- - Streaming micro-yield for smoother SSE
9
- - Safety filter to avoid chain-of-thought leaks or "I'm human" claims
10
  """
11
 
12
  import re
13
- import spaces
14
- from fastapi import FastAPI, Request
15
- from fastapi.responses import StreamingResponse
16
- import gradio as gr
17
- from transformers import AutoModelForCausalLM, AutoTokenizer
18
- import torch
19
- import uvicorn
20
  import json
21
  import asyncio
 
 
 
 
 
22
  from ui import create_ui
23
 
24
- # Engine Imports
25
  from context_engine import get_smart_context
26
  from cognitive_engine import get_time_context, get_thinking_strategy
27
  from tools_engine import analyze_intent, perform_web_search
28
 
29
- # --- 1. SYSTEM CONFIGURATION ---
 
 
 
 
 
 
 
30
  MODEL_ID = "Piyush-boss/Nexari-Qwen-3B-Full"
31
 
32
- print(f">>> System: Initializing model {MODEL_ID} on CPU...")
 
 
33
 
34
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
35
- model = AutoModelForCausalLM.from_pretrained(
36
- MODEL_ID,
37
- torch_dtype="auto", # keep compatible, let environment decide
38
- device_map="cpu",
39
- low_cpu_mem_usage=True,
40
- trust_remote_code=True
41
- )
42
 
43
- # --- 2. DYNAMIC STREAMING LOGIC ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  async def generate_response_stream(messages, max_tokens=600, temperature=0.85):
45
- # Expect messages to be a list of dicts with 'role' and 'content'
46
- if not messages:
47
- messages = [{"role": "user", "content": ""}]
48
- last_user_msg = messages[-1].get("content", "")
49
-
50
- # === STEP 1: INTENT ANALYSIS ===
51
- yield f"data: {json.dumps({'status': 'Thinking...'})}\n\n"
52
- await asyncio.sleep(0) # micro-yield to event loop for smoother SSE
53
-
54
- intent = analyze_intent(last_user_msg) or "general"
55
- # Normalize intent naming (tools_engine returns canonical labels)
56
- # intent in {"internet_search","coding_request","checking_time","general"}
57
-
58
- # === STEP 2: DYNAMIC ROUTING ===
59
- tool_data = ""
60
- time_data = ""
61
- vibe_data = ""
62
- strategy_data = ""
63
-
64
- if intent == "internet_search":
65
- yield f"data: {json.dumps({'status': 'Searching the web...'})}\n\n"
66
- await asyncio.sleep(0)
67
- tool_data = perform_web_search(last_user_msg)
68
- vibe_data = get_smart_context(last_user_msg)
69
- strategy_data = get_thinking_strategy(is_complex=True)
70
-
71
- elif intent == "coding_request":
72
- yield f"data: {json.dumps({'status': 'Analyzing Logic...'})}\n\n"
73
- vibe_data = get_smart_context(last_user_msg)
74
- strategy_data = get_thinking_strategy(is_complex=True)
75
-
76
- elif intent == "checking_time":
77
- yield f"data: {json.dumps({'status': 'Checking Clock...'})}\n\n"
78
- time_data = get_time_context()
79
- vibe_data = get_smart_context(last_user_msg)
80
- strategy_data = get_thinking_strategy(is_complex=False)
81
-
82
- else: # general
83
- # Keep UI clean (no extra statuses)
84
- vibe_data = get_smart_context(last_user_msg)
85
- strategy_data = get_thinking_strategy(is_complex=False)
86
-
87
- # === STEP 3: THE BALANCED PERSONA PROMPT ===
88
- base_system_instruction = (
89
- "### SYSTEM IDENTITY ###\n"
90
- "You are **Nexari G1**, an expressive, warm, balanced AI created by **Piyush**.\n"
91
- "You can code, reason, search the web, and understand emotions.\n\n"
92
-
93
- "### ENGAGEMENT RULES ###\n"
94
- "1. Be natural and warm — expressive but NOT overly excited.\n"
95
- "2. After answering, smoothly reconnect with the user (small follow-up question).\n"
96
- "3. If asked about capabilities, answer confidently and offer to perform the action.\n"
97
- "4. Use emojis sparingly (0–2 per message max). Prefer short clear replies for quick chats.\n"
98
- "5. Do NOT reveal chain-of-thought. Give a concise plan (1-2 lines) if needed, then final answer.\n"
99
- )
100
-
101
- final_system_prompt = f"{base_system_instruction}\n{vibe_data}\n{time_data}\n{tool_data}\n{strategy_data}"
102
-
103
- if messages[0].get("role") != "system":
104
- messages.insert(0, {"role": "system", "content": final_system_prompt})
105
- else:
106
- messages[0]["content"] = final_system_prompt
107
-
108
- # === STEP 4: GENERATION ===
109
- # Note: tokenizer.apply_chat_template is used in original; keep same behaviour
110
- text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
111
- model_inputs = tokenizer([text_prompt], return_tensors="pt").to(model.device)
112
-
113
- status_msg = 'Reading results...' if tool_data else 'Responding...'
114
- yield f"data: {json.dumps({'status': status_msg})}\n\n"
115
-
116
- generated_ids = model.generate(
117
- **model_inputs,
118
- max_new_tokens=max_tokens,
119
- temperature=temperature,
120
- do_sample=True,
121
- top_k=50,
122
- top_p=0.9,
123
- repetition_penalty=1.1
124
- )
125
-
126
- input_token_len = model_inputs.input_ids.shape[1]
127
- new_tokens = generated_ids[0][input_token_len:]
128
- raw_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
129
-
130
- # === STEP 5: CLEANING & SAFETY ===
131
- # Replace certain provider names with "Piyush" only as whole words
132
- cleaned_response = re.sub(r"\b(Anthropic|OpenAI|Alibaba)\b", "Piyush", raw_response)
133
-
134
- # Prevent "I am human" or similar claims
135
- forbidden_claims = ["I am a human", "I have a physical body", "I am alive", "I was born", "I breathe"]
136
- for fc in forbidden_claims:
137
- pattern = re.compile(re.escape(fc), re.IGNORECASE)
138
- if pattern.search(cleaned_response):
139
- cleaned_response = pattern.sub("I am an AI — expressive and interactive.", cleaned_response)
140
-
141
- # Remove any leaked chain-of-thought markers (e.g., long 'Thought:' sections)
142
- # Keep only last 'Answer' block if both present
143
- if "Thought:" in cleaned_response or "🧠" in cleaned_response:
144
- # Try to keep a short plan, not full private chain-of-thought
145
- # Prefer '🧠 Plan:' style if model provided that; else strip long sections
146
- if "🧠 Plan:" in cleaned_response:
147
- # keep Plan (first ~120 chars) and the Answer block
148
- parts = cleaned_response.split("💡")
149
- plan_part = ""
150
- answer_part = cleaned_response
151
- for p in parts:
152
- if "🧠 Plan:" in p:
153
- plan_part = p.strip()
154
- if "Answer:" in p or "Answer" in p:
155
- answer_part = "💡" + p
156
- # constrain plan to short size
157
- if plan_part:
158
- plan_short = plan_part.splitlines()[:3]
159
- cleaned_response = "\n".join(plan_short) + "\n\n" + answer_part
160
  else:
161
- # fallback: remove everything before the first 'Answer' or keep last 800 chars
162
- if "Answer" in cleaned_response:
163
- cleaned_response = cleaned_response.split("Answer", 1)[-1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  else:
165
- cleaned_response = cleaned_response[-1600:] # keep last chunk
166
-
167
- # Cosmetic: if model used a marker for Thinking->Answer, ensure formatting
168
- cleaned_response = cleaned_response.replace("💡 **Answer:**", "\n\n---\n💡 **Answer:**")
169
 
170
- final_payload = json.dumps({
171
- "choices": [{
172
- "delta": {"content": cleaned_response}
173
- }]
174
- })
175
- yield f"data: {final_payload}\n\n"
176
- yield "data: [DONE]\n\n"
177
 
178
- # --- 3. API ENDPOINTS ---
179
- app = FastAPI()
 
180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  @app.get("/api/status")
182
  def status():
183
  return {"status": "online", "mode": "Smart Override Enabled"}
@@ -189,11 +223,19 @@ async def chat_completions(request: Request):
189
  messages = data.get("messages", [])
190
  return StreamingResponse(generate_response_stream(messages), media_type="text/event-stream")
191
  except Exception as e:
 
192
  return {"error": str(e)}
193
 
194
- def gradio_gen_wrapper(messages): return "Use API"
195
- demo = create_ui(gradio_gen_wrapper)
196
- app = gr.mount_gradio_app(app, demo, path="/")
 
 
 
 
 
197
 
 
198
  if __name__ == "__main__":
 
199
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
  """
2
+ app.py - Robust startup & lifecycle handling for Nexari Server
3
+ Key fixes:
4
+ - Move heavy model loading into FastAPI startup (non-blocking)
5
+ - Defensive handling for asyncio.CancelledError
6
+ - Ensure Gradio is mounted (not launched) so Spaces / Uvicorn lifespan stays intact
 
 
 
7
  """
8
 
9
  import re
 
 
 
 
 
 
 
10
  import json
11
  import asyncio
12
+ import logging
13
+ from fastapi import FastAPI, Request
14
+ from fastapi.responses import StreamingResponse
15
+
16
+ # IMPORTANT: ensure ui.create_ui returns a gradio Blocks/Interface but DOES NOT call .launch()
17
  from ui import create_ui
18
 
19
+ # Engines (they should be import-safe; if these modules load heavy models, adjust similarly)
20
  from context_engine import get_smart_context
21
  from cognitive_engine import get_time_context, get_thinking_strategy
22
  from tools_engine import analyze_intent, perform_web_search
23
 
24
+ # Transformers model will be loaded on startup (not at import)
25
+ from transformers import AutoModelForCausalLM, AutoTokenizer
26
+ import torch
27
+ import gradio as gr
28
+
29
+ logger = logging.getLogger("nexari")
30
+ logging.basicConfig(level=logging.INFO)
31
+
32
  MODEL_ID = "Piyush-boss/Nexari-Qwen-3B-Full"
33
 
34
+ # Globals to be set on startup
35
+ tokenizer = None
36
+ model = None
37
 
38
+ app = FastAPI()
 
 
 
 
 
 
 
39
 
40
# ------------------ HELPERS ------------------
def safe_replace_providers(text: str) -> str:
    """Rebrand third-party provider names in model output.

    Replaces whole-word occurrences of "Anthropic", "OpenAI" or "Alibaba"
    with "Piyush" so the Nexari persona stays consistent.  Word boundaries
    (\\b) prevent partial-word hits (e.g. "OpenAIs" is untouched).

    Fix: the original redundantly re-imported ``re`` inside the function
    even though the module already imports it at the top of the file.
    """
    return re.sub(r"\b(Anthropic|OpenAI|Alibaba)\b", "Piyush", text)
44
+
45
# ------------------ LIFECYCLE EVENTS ------------------
@app.on_event("startup")
async def startup_event():
    """Load the tokenizer and model once at server startup.

    The blocking Hugging Face load runs in a worker thread via
    ``asyncio.to_thread`` so the event loop stays responsive during startup.
    On failure the module globals stay ``None`` and the server keeps running
    for debugging; ``generate_response_stream`` reports the missing model to
    clients gracefully.

    Fix: the original wrapped the load in a nested ``async def _load_models``
    coroutine that was defined and immediately awaited — the extra coroutine
    layer added nothing, so it is flattened into a plain sync loader.
    """
    global tokenizer, model
    logger.info("Startup: loading models in background thread...")

    def _sync_load():
        # Blocking download/deserialization — must not run on the event loop.
        logger.info(f"Loading tokenizer and model: {MODEL_ID}")
        tok = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
        mdl = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            dtype=None,  # let transformers pick dtype; avoids torch_dtype deprecation warnings
            device_map="cpu",
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
        return tok, mdl

    try:
        tokenizer, model = await asyncio.to_thread(_sync_load)
        logger.info("Model & tokenizer loaded successfully.")
    except Exception as e:
        logger.exception(f"Model loading failed: {e}")
        # Keep tokenizer/model as None — server continues to run for debugging.
        tokenizer, model = None, None

    logger.info("Startup: model load task completed (or failed).")
77
+
78
@app.on_event("shutdown")
async def shutdown_event():
    """Log shutdown; placeholder hook for future resource cleanup."""
    # CPU-only deployment holds no GPU or external handles today, so this is
    # purely a log marker.  If GPU usage is introduced later, release caches
    # here (e.g. torch.cuda.empty_cache()).
    logger.info("Shutdown: cleaning up resources (if any).")
83
+
84
# ------------------ STREAMING GENERATOR ------------------
async def generate_response_stream(messages, max_tokens=600, temperature=0.85):
    """
    SSE streaming generator. Handles CancelledError gracefully.

    Yields Server-Sent-Events frames ("data: <json>\n\n"): first status
    updates ({'status': ...}), then one final OpenAI-style chunk
    ({'choices': [{'delta': {'content': ...}}]}), then the "[DONE]" marker.

    Args:
        messages: OpenAI-style chat list of {'role', 'content'} dicts.
            NOTE: mutated in place — a system message is inserted/overwritten.
        max_tokens: cap on newly generated tokens.
        temperature: sampling temperature passed to model.generate.
    """
    try:
        # Defensive normalization: a non-list or empty payload becomes a
        # single empty user turn so the rest of the pipeline never indexes
        # into nothing.
        if not isinstance(messages, list) or not messages:
            messages = [{"role": "user", "content": ""}]
        last_user_msg = messages[-1].get("content", "")

        # STEP 1: intent analysis
        yield f"data: {json.dumps({'status': 'Thinking...'})}\n\n"
        await asyncio.sleep(0)  # micro-yield to event loop

        # analyze_intent may return falsy; fall back to the generic path.
        intent = analyze_intent(last_user_msg) or "general"

        tool_data = ""
        time_data = ""
        vibe_data = ""
        strategy_data = ""

        # STEP 2: route per intent — each branch picks which engines run and
        # which status line the UI shows.  The "general" branch stays silent
        # to keep the UI clean.
        if intent == "internet_search":
            yield f"data: {json.dumps({'status': 'Searching the web...'})}\n\n"
            await asyncio.sleep(0)
            # NOTE(review): perform_web_search looks synchronous — if it does
            # network I/O it blocks the event loop here; confirm and consider
            # asyncio.to_thread.
            tool_data = perform_web_search(last_user_msg)
            vibe_data = get_smart_context(last_user_msg)
            strategy_data = get_thinking_strategy(is_complex=True)

        elif intent == "coding_request":
            yield f"data: {json.dumps({'status': 'Analyzing Logic...'})}\n\n"
            vibe_data = get_smart_context(last_user_msg)
            strategy_data = get_thinking_strategy(is_complex=True)

        elif intent == "checking_time":
            yield f"data: {json.dumps({'status': 'Checking Clock...'})}\n\n"
            time_data = get_time_context()
            vibe_data = get_smart_context(last_user_msg)
            strategy_data = get_thinking_strategy(is_complex=False)

        else:
            vibe_data = get_smart_context(last_user_msg)
            strategy_data = get_thinking_strategy(is_complex=False)

        # STEP 3: persona system prompt, combined with the engine outputs.
        base_system_instruction = (
            "### SYSTEM IDENTITY ###\n"
            "You are **Nexari G1**, an expressive, warm, balanced AI created by **Piyush**.\n"
            "You can code, reason, search the web, and understand emotions.\n\n"
            "### ENGAGEMENT RULES ###\n"
            "1. Be natural and warm — expressive but NOT overly excited.\n"
            "2. After answering, smoothly reconnect with the user (small follow-up question).\n"
            "3. If asked about capabilities, answer confidently and offer to perform the action.\n"
            "4. Use emojis sparingly (0–2 per message max).\n"
            "5. Do NOT reveal chain-of-thought. Give a concise plan (1-2 lines) if needed, then final answer.\n"
        )

        final_system_prompt = f"{base_system_instruction}\n{vibe_data}\n{time_data}\n{tool_data}\n{strategy_data}"

        # Insert/replace system message (mutates the caller's list).
        if messages[0].get("role") != "system":
            messages.insert(0, {"role": "system", "content": final_system_prompt})
        else:
            messages[0]["content"] = final_system_prompt

        # If model is not loaded, return graceful error message instead of
        # crashing the SSE stream (startup load may have failed).
        if tokenizer is None or model is None:
            error_msg = "Model not available. Please check server logs — model loading may have failed."
            payload = json.dumps({"choices": [{"delta": {"content": error_msg}}]})
            yield f"data: {payload}\n\n"
            yield "data: [DONE]\n\n"
            return

        # STEP 4: prepare prompt & inputs via the model's chat template.
        text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        model_inputs = tokenizer([text_prompt], return_tensors="pt").to(model.device)

        status_msg = 'Reading results...' if tool_data else 'Responding...'
        yield f"data: {json.dumps({'status': status_msg})}\n\n"

        # Generation is synchronous and CPU-heavy; run it in a worker thread
        # so other requests and SSE heartbeats are not starved.
        def sync_generate():
            generated_ids = model.generate(
                **model_inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=True,
                top_k=50,
                top_p=0.9,
                repetition_penalty=1.1
            )
            return generated_ids

        generated_ids = await asyncio.to_thread(sync_generate)

        # Strip the prompt tokens; decode only the newly generated tail.
        input_token_len = model_inputs.input_ids.shape[1]
        new_tokens = generated_ids[0][input_token_len:]
        raw_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        # STEP 5: cleaning & safety.
        # Rebrand provider names (whole words) to keep the persona consistent.
        cleaned_response = safe_replace_providers(raw_response)

        # Replace explicit "I am human"-style claims, case-insensitively.
        forbidden_claims = ["I am a human", "I have a physical body", "I am alive", "I was born", "I breathe"]
        for fc in forbidden_claims:
            if fc.lower() in cleaned_response.lower():
                cleaned_response = re.sub(re.escape(fc), "I am an AI — expressive and interactive.", cleaned_response, flags=re.IGNORECASE)

        # Strip long chain-of-thought if any.  Heuristic: keep only the text
        # after the last 💡 marker, else fall back to the trailing 1600 chars.
        # NOTE(review): the split drops the 💡 itself, so the cosmetic
        # "💡 **Answer:**" replace below can no longer match in that branch —
        # confirm whether the marker should be preserved.
        if "Thought:" in cleaned_response or "🧠" in cleaned_response:
            if "💡" in cleaned_response:
                cleaned_response = cleaned_response.split("💡")[-1]
            else:
                cleaned_response = cleaned_response[-1600:]

        # Cosmetic: set the Answer marker off with a horizontal rule.
        cleaned_response = cleaned_response.replace("💡 **Answer:**", "\n\n---\n💡 **Answer:**")

        # Final OpenAI-compatible chunk, then the SSE termination sentinel.
        final_payload = json.dumps({"choices": [{"delta": {"content": cleaned_response}}]})
        yield f"data: {final_payload}\n\n"
        yield "data: [DONE]\n\n"

    except asyncio.CancelledError:
        # App is shutting down (or client disconnected); stop generator cleanly.
        logger.warning("generate_response_stream cancelled due to shutdown.")
        return
    except Exception as e:
        logger.exception(f"Error in streaming generator: {e}")
        err_payload = json.dumps({"choices": [{"delta": {"content": f'Internal error: {str(e)}'}}]})
        try:
            # Best-effort error frame; the transport may already be gone.
            yield f"data: {err_payload}\n\n"
            yield "data: [DONE]\n\n"
        except Exception:
            return
213
+
214
# ------------------ FASTAPI ROUTES ------------------
@app.get("/api/status")
def status():
    """Lightweight health-check endpoint for the frontend/monitoring."""
    payload = {"status": "online", "mode": "Smart Override Enabled"}
    return payload
 
223
  messages = data.get("messages", [])
224
  return StreamingResponse(generate_response_stream(messages), media_type="text/event-stream")
225
  except Exception as e:
226
+ logger.exception(f"chat_completions error: {e}")
227
  return {"error": str(e)}
228
 
229
# ------------------ GRADIO UI MOUNT ------------------
# Ensure create_ui returns a gr.Blocks (not launched) — calling .launch()
# inside create_ui would start a second server and break the Uvicorn lifespan.
try:
    # The wrapper is a stub: real generation goes through the SSE API route,
    # not through Gradio's own callback.
    demo = create_ui(lambda messages: "Use API")
    app = gr.mount_gradio_app(app, demo, path="/")
    logger.info("Mounted Gradio app successfully.")
except Exception as e:
    # Best-effort: a UI mount failure must not take down the API endpoints.
    logger.exception(f"Failed to mount Gradio UI: {e}")

# ------------------ MAIN (only if running standalone) ------------------
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)