Nexari-Research committed
Commit cc870c0 · verified · 1 Parent(s): 7bcdc81

Update app.py

Files changed (1)
  1. app.py +71 -49
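For anyone calling the Space: the streaming contract is unchanged by this commit. generate_response_stream (see the diff below) still emits Server-Sent Events that interleave {"status": ...} progress updates with OpenAI-style {"choices": [{"delta": {"content": ...}}]} chunks and end with data: [DONE]. A minimal client sketch, assuming chat_completions wraps the generator in a streaming response and accepts a JSON body with a messages list (neither detail is visible in this diff):

import json
import requests  # any streaming HTTP client works; requests is used here for brevity

def stream_chat(prompt, base_url="http://localhost:7860"):
    # Assumed request shape: {"messages": [...]}, matching the generator's input.
    body = {"messages": [{"role": "user", "content": prompt}]}
    with requests.post(f"{base_url}/v1/chat/completions", json=body, stream=True) as resp:
        for raw in resp.iter_lines(decode_unicode=True):
            if not raw or not raw.startswith("data: "):
                continue
            data = raw[len("data: "):]
            if data == "[DONE]":
                break
            event = json.loads(data)
            if "status" in event:
                print(f"[{event['status']}]")  # Thinking..., Searching the web..., Responding...
            else:
                print(event["choices"][0]["delta"].get("content", ""), end="", flush=True)

if __name__ == "__main__":
    stream_chat("Explain the startup changes in detail, line by line.")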
app.py CHANGED
@@ -1,11 +1,4 @@
-"""
-app.py - Nexari Server (Web-tool + Detail-request fixes)
-Key changes:
-- When web search performed, pass structured WEB_DATA as an assistant message so the model MUST use it.
-- Detect "detailed/line-by-line" user requests and increase max_tokens & enforce numbered output format.
-- Minor safety & streaming robustness retained.
-"""
-
+# app.py - UPDATED (replace original)
 import re
 import json
 import asyncio
@@ -21,13 +14,15 @@ from tools_engine import analyze_intent, perform_web_search
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 import gradio as gr
+import os
 
 logger = logging.getLogger("nexari")
 logging.basicConfig(level=logging.INFO)
 
-MODEL_ID = "Piyush-boss/Nexari-Qwen-3B-Full"
+MODEL_ID = os.environ.get("MODEL_ID", "Piyush-boss/Nexari-Qwen-3B-Full")
 tokenizer = None
 model = None
+device = "cpu"
 
 app = FastAPI()
 
@@ -45,28 +40,52 @@ def is_detailed_request(text: str) -> bool:
 
 @app.on_event("startup")
 async def startup_event():
-    global tokenizer, model
-    logger.info("Startup: loading tokenizer/model in background thread...")
+    global tokenizer, model, device
+    logger.info("Startup: initiating background model load...")
     try:
+        # choose device more safely
+        if torch.cuda.is_available():
+            device = "cuda"
+        else:
+            device = "cpu"
+
        def sync_load():
            tok = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+            # Use device_map when possible; use cpu if no GPU
            mdl = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
-                dtype=None,
-                device_map="cpu",
-                low_cpu_mem_usage=True,
                trust_remote_code=True,
+                low_cpu_mem_usage=(device == "cpu"),
+                device_map="auto" if device == "cuda" else None
            )
+            if device == "cpu":
+                mdl.to("cpu")
            return tok, mdl
+
        tokenizer, model = await asyncio.to_thread(sync_load)
-        logger.info("Model loaded successfully.")
+        logger.info("Model loaded successfully on %s.", device)
    except Exception as e:
-        logger.exception(f"Model loading failed: {e}")
+        logger.exception(f"Model loading failed at startup: {e}")
        tokenizer, model = None, None
 
-@app.on_event("shutdown")
-async def shutdown_event():
-    logger.info("Shutdown: cleanup if necessary.")
+def _build_prompt_from_messages(messages):
+    """
+    Build a single-string prompt for models in case tokenizer.apply_chat_template
+    is unavailable (fallback).
+    """
+    parts = []
+    for m in messages:
+        role = m.get("role","user")
+        content = m.get("content","")
+        if role == "system":
+            parts.append(f"[SYSTEM]\n{content}\n")
+        elif role == "user":
+            parts.append(f"[USER]\n{content}\n")
+        elif role == "assistant":
+            parts.append(f"[ASSISTANT]\n{content}\n")
+        else:
+            parts.append(content)
+    return "\n".join(parts)
 
 async def generate_response_stream(messages, max_tokens=600, temperature=0.85):
    try:
@@ -74,7 +93,6 @@ async def generate_response_stream(messages, max_tokens=600, temperature=0.85):
            messages = [{"role":"user","content":""}]
        last_user_msg = messages[-1].get("content","")
 
-        # initial thinking status
        yield f"data: {json.dumps({'status': 'Thinking...'})}\n\n"
        await asyncio.sleep(0)
 
@@ -85,19 +103,19 @@ async def generate_response_stream(messages, max_tokens=600, temperature=0.85):
        vibe_data = ""
        strategy_data = ""
 
-        # detect if user explicitly asked for long/detailed format
        want_detailed = is_detailed_request(last_user_msg)
        if want_detailed:
-            # bump tokens to allow long/line-by-line answer
            max_tokens = max(max_tokens, 1200)
-            temperature = min(temperature, 0.9)  # keep somewhat controlled
+            temperature = min(temperature, 0.9)
 
-        # Route based on intent
        if intent == "internet_search":
            yield f"data: {json.dumps({'status': 'Searching the web...'})}\n\n"
            await asyncio.sleep(0)
-            # perform_web_search now returns structured dict or empty string
-            tool_data_struct = perform_web_search(last_user_msg)
+            try:
+                tool_data_struct = perform_web_search(last_user_msg)
+            except Exception as e:
+                logger.exception("Web search failed: %s", e)
+                tool_data_struct = {"query": last_user_msg, "results": []}
            vibe_data = get_smart_context(last_user_msg)
            strategy_data = get_thinking_strategy(is_complex=True, detail=want_detailed)
 
@@ -116,7 +134,6 @@ async def generate_response_stream(messages, max_tokens=600, temperature=0.85):
            vibe_data = get_smart_context(last_user_msg)
            strategy_data = get_thinking_strategy(is_complex=False, detail=want_detailed)
 
-        # Base system instruction with explicit web-data usage rule
        base_system_instruction = (
            "### SYSTEM IDENTITY ###\n"
            "You are Nexari G1, an expressive and helpful AI created by Piyush.\n"
@@ -130,17 +147,14 @@ async def generate_response_stream(messages, max_tokens=600, temperature=0.85):
 
        final_system_prompt = f"{base_system_instruction}\n{vibe_data}\n{time_data}\n{strategy_data}"
 
-        # ensure system message present
+        # Ensure system message present
        if messages[0].get("role") != "system":
            messages.insert(0, {"role":"system","content": final_system_prompt})
        else:
            messages[0]["content"] = final_system_prompt
 
-        # If we have tool_data_struct (dict with items & sources), add as assistant message
        if tool_data_struct:
-            # create a clear WEB_DATA assistant message that model must consume
            web_block = "### WEB_DATA (from live search) ###\n"
-            # include numbered sources with short titles, snippets, and urls
            items = tool_data_struct.get("results", [])
            if items:
                lines = []
@@ -153,12 +167,8 @@ async def generate_response_stream(messages, max_tokens=600, temperature=0.85):
                web_block += "\n---\nINSTRUCTION: Use the WEB_DATA above to answer; cite relevant source numbers inline."
            else:
                web_block += "No results found."
-
-            # Insert the web block as an assistant message so model treats it as retrieved evidence
-            # Insert after system message (index 1)
            messages.insert(1, {"role":"assistant","content": web_block})
 
-        # Model availability check
        if tokenizer is None or model is None:
            err = "Model is not loaded on server. Please check logs."
            payload = json.dumps({"choices":[{"delta":{"content": err}}]})
@@ -166,15 +176,22 @@ async def generate_response_stream(messages, max_tokens=600, temperature=0.85):
            yield "data: [DONE]\n\n"
            return
 
-        # prepare prompt
-        text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        model_inputs = tokenizer([text_prompt], return_tensors="pt").to(model.device)
+        # Build prompt with tokenizer helper if available, else fallback
+        try:
+            if hasattr(tokenizer, "apply_chat_template"):
+                text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            else:
+                text_prompt = _build_prompt_from_messages(messages)
+        except Exception:
+            text_prompt = _build_prompt_from_messages(messages)
+
+        model_inputs = tokenizer(text_prompt, return_tensors="pt", truncation=True, max_length=4096).to(next(model.parameters()).device)
 
        status_msg = 'Reading results...' if tool_data_struct else 'Responding...'
        yield f"data: {json.dumps({'status': status_msg})}\n\n"
 
-        # Generation in thread
        def sync_generate():
+            # wrap generate in try to catch resource errors
            return model.generate(
                **model_inputs,
                max_new_tokens=max_tokens,
@@ -184,26 +201,29 @@ async def generate_response_stream(messages, max_tokens=600, temperature=0.85):
                top_p=0.92,
                repetition_penalty=1.08
            )
-        generated_ids = await asyncio.to_thread(sync_generate)
+        try:
+            generated_ids = await asyncio.to_thread(sync_generate)
+        except RuntimeError as e:
+            logger.exception("Generation failed (possible OOM): %s", e)
+            err_payload = json.dumps({"choices":[{"delta":{"content": "Model generation failed due to resource limits."}}]})
+            yield f"data: {err_payload}\n\n"
+            yield "data: [DONE]\n\n"
+            return
 
-        input_len = model_inputs.input_ids.shape[1]
+        input_len = model_inputs["input_ids"].shape[1]
        new_tokens = generated_ids[0][input_len:]
        raw_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
 
-        # Cleaning & safety
        cleaned = safe_replace_providers(raw_response)
        forbidden = ["I am a human","I have a physical body","I am alive"]
        for fc in forbidden:
            if fc.lower() in cleaned.lower():
                cleaned = re.sub(re.escape(fc), "I am an AI — expressive and interactive.", cleaned, flags=re.IGNORECASE)
 
-        # If detailed requested, encourage numbered formatting if model didn't follow
        if want_detailed:
-            # simple heuristic: if no numbered lines present, add an instruction prefix
            if not re.search(r"^\s*\d+[\.\)]\s+", cleaned, re.M):
-                cleaned = "1) " + cleaned.replace("\n", "\n2) ")  # best-effort reformat
+                cleaned = "1) " + cleaned.replace("\n", "\n2) ")
 
-        # Format Answer tag
        cleaned = cleaned.replace("💡 **Answer:**", "\n\n---\n💡 **Answer:**")
 
        payload = json.dumps({"choices":[{"delta":{"content": cleaned}}]})
@@ -224,7 +244,8 @@ async def generate_response_stream(messages, max_tokens=600, temperature=0.85):
 
 @app.get("/api/status")
 def status():
-    return {"status":"online","mode":"Smart Override Enabled"}
+    ok = tokenizer is not None and model is not None
+    return {"status":"online" if ok else "degraded", "mode":"Smart Override Enabled", "model_loaded": ok}
 
 @app.post("/v1/chat/completions")
 async def chat_completions(request: Request):
@@ -236,9 +257,10 @@ async def chat_completions(request: Request):
        logger.exception(f"chat_completions endpoint error: {e}")
        return {"error": str(e)}
 
-# Mount gradio if create_ui returns Blocks (must not call .launch())
+# Mount gradio if create_ui returns Blocks
 try:
    demo = create_ui(lambda messages: "Use API")
+    # demo should be a Blocks or Component; mount safely
    app = gr.mount_gradio_app(app, demo, path="/")
    logger.info("Gradio mounted.")
 except Exception as e:
@@ -246,4 +268,4 @@ except Exception as e:
 
 if __name__ == "__main__":
    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=7860)
+    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))
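With the reworked /api/status handler above, a health probe can now tell a degraded server (model missing or still loading) from an online one. A quick check, assuming the default host and port from the uvicorn line in the diff:

import requests

info = requests.get("http://localhost:7860/api/status").json()
# While loading, or after a failed load:
#   {"status": "degraded", "mode": "Smart Override Enabled", "model_loaded": False}
# Once startup_event finishes successfully:
#   {"status": "online", "mode": "Smart Override Enabled", "model_loaded": True}
print(info["status"], info["model_loaded"])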
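For tokenizers that lack apply_chat_template, the new _build_prompt_from_messages fallback flattens the conversation into bracketed plain text. A self-contained demo of what it produces (the function body is copied from the diff; the sample messages are illustrative only):

def _build_prompt_from_messages(messages):
    # Fallback prompt builder from the diff: one plain-text block per chat turn.
    parts = []
    for m in messages:
        role = m.get("role", "user")
        content = m.get("content", "")
        if role == "system":
            parts.append(f"[SYSTEM]\n{content}\n")
        elif role == "user":
            parts.append(f"[USER]\n{content}\n")
        elif role == "assistant":
            parts.append(f"[ASSISTANT]\n{content}\n")
        else:
            parts.append(content)
    return "\n".join(parts)

print(_build_prompt_from_messages([
    {"role": "system", "content": "You are Nexari G1."},
    {"role": "user", "content": "Hi!"},
]))
# Output:
# [SYSTEM]
# You are Nexari G1.
#
# [USER]
# Hi!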