Nexari-Research committed on
Commit cc35d21 · verified · 1 Parent(s): abad315

Update app.py

Files changed (1)
  1. app.py +172 -421
app.py CHANGED
@@ -1,6 +1,4 @@
1
- # app.py - UPDATED (smart hybrid: fast-path + quality-checked fallback to planning)
2
- # Behavior: use behavior_model.route to fast-path; after fast attempt run quality checks;
3
- # if quality low -> fallback to planning route automatically (one fallback only per request).
4
  import re
5
  import json
6
  import asyncio
@@ -186,58 +184,19 @@ def extract_and_sanitize_plan(text: str, max_plan_chars: int = 240) -> (str, str
186
  return plan_label, cleaned_body
187
  return None, text
188
 
189
- # -------------------------
190
- # Heuristic quality check for fast-path responses
191
- # -------------------------
192
- _LOW_QUALITY_PHRASES = [
193
- "i'm here to help", "is there something specific", "i can help with that", "let me know",
194
- "do you want", "what would you like", "please clarify", "sorry, i don't", "i don't have"
195
- ]
196
-
197
- def _is_low_quality_text(text: str, min_words_hint: int) -> (bool, Dict[str,Any]):
198
- """
199
- Returns (is_low_quality, debug_info)
200
- Heuristics:
201
- - If word count < min_words_hint => low quality
202
- - If response starts with generic short filler phrases => low quality
203
- - If too short (<6 words) => low quality
204
- - If contains many placeholders like 'I don't know' or 'sorry' => low quality
205
- """
206
- t = (text or "").strip()
207
- wc = word_count(t)
208
- lower = t.lower()
209
-
210
- reasons = []
211
- if wc < max(6, min_words_hint // 2):
212
- reasons.append(f"word_count_too_small ({wc} < {max(6, min_words_hint // 2)})")
213
- if wc < min_words_hint:
214
- # not strict failure for very small min_words_hint (like 6), but flagged
215
- reasons.append(f"below_min_hint ({wc} < {min_words_hint})")
216
- for ph in _LOW_QUALITY_PHRASES:
217
- if lower.startswith(ph):
218
- reasons.append(f"starts_with_generic_phrase ({ph})")
219
- break
220
- # placeholder detection
221
- placeholders = ["i don't know", "i'm not sure", "i do not know", "can't help", "unable to"]
222
- for ph in placeholders:
223
- if ph in lower:
224
- reasons.append(f"contains_placeholder ({ph})")
225
- break
226
-
227
- # if many short sentences like "Okay. Sure." count as low quality
228
- sent_count = len(re.findall(r"[.!?]+", t)) or 1
229
- if wc < 12 and sent_count >= 2:
230
- reasons.append("fragmented_short_sentences")
231
-
232
- is_low = len(reasons) > 0
233
- debug = {"word_count": wc, "min_words_hint": min_words_hint, "reasons": reasons}
234
- return is_low, debug
235
 
236
  # -------------------------
237
- # Streaming generator with smart fallback:
238
- # - Fast-path tries one attempt
239
- # - If quality low -> fallback to planning route
240
- # - Avoid infinite loops with fallback_once flag
241
  # -------------------------
242
  async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600, temperature=0.85):
243
  try:
@@ -269,207 +228,46 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
269
  logger.exception("Flow analysis failed: %s", e)
270
  flow_context = {}
271
 
272
  # Log route decision
273
  route = flow_context.get("route", "planning")
274
- complexity_score = flow_context.get("complexity_score", None)
275
- logger.info("Flow route: %s (score=%s)", route, complexity_score)
276
-
277
- # allow fallback once from direct -> planning
278
- fallback_once = False
279
-
280
- # Helper to run the planning route (reusable)
281
- async def run_planning_route(messages_local, flow_context_local, last_user_msg_local, min_tokens_hint=None):
282
- """
283
- Runs the full planning pipeline (same as previous planning branch).
284
- Returns generated_text and meta dict.
285
- """
286
- # Emit Reasoning indicator BEFORE heavy planning so UI shows it during planning
287
- yield f"data: {json.dumps({'status': 'Reasoning (planner)...'})}\n\n"
288
- await asyncio.sleep(0.12)
289
-
290
- vibe_block = get_smart_context(last_user_msg_local)
291
- plan_req = plan_response_requirements(messages_local, last_user_msg_local, flow_context_local, vibe_block)
292
- min_words = plan_req["min_words"]
293
- strictness = plan_req["strictness"]
294
-
295
- # adjust tokens/temperature if strict
296
- nonlocal temperature, max_tokens
297
- if strictness:
298
- temperature = min(temperature + 0.05, 0.95)
299
- max_tokens = max(max_tokens, min_words // 2 + 120)
300
-
301
- strategy_data = get_thinking_strategy(is_complex=(intent=="coding_request" or min_words>50), detail=(min_words>50), min_words_hint=min_words)
302
- time_data = get_time_context()
303
-
304
- base_system_instruction = (
305
- "### SYSTEM IDENTITY ###\n"
306
- "You are Nexari G1, an expressive and helpful AI created by Piyush.\n"
307
- "### RULES ###\n"
308
- "1) If WEB_DATA is provided, prioritize it and cite sources.\n"
309
- "2) Avoid chain-of-thought exposure. If requested to provide a short 'Plan', keep it concise (max 2 lines) and label it '🧠 Plan:'.\n"
310
- "3) Use natural phrasing; follow emoji & verbosity guidance below.\n"
311
- )
312
-
313
- flow_desc = ""
314
- if flow_context_local:
315
- label = flow_context_local.get("flow_label","unknown")
316
- conf = round(float(flow_context_local.get("confidence", 0.0)), 2)
317
- expl = flow_context_local.get("explanation", "")
318
- flow_desc = f"\n[FLOW] Detected: {label} (confidence {conf}). {expl}\n"
319
-
320
- final_system_prompt = f"{base_system_instruction}\n{flow_desc}\n{vibe_block}\n{time_data}\n{strategy_data}"
321
-
322
- if messages_local and messages_local[0].get("role") == "system":
323
- messages_local[0]["content"] = final_system_prompt
324
- else:
325
- messages_local.insert(0, {"role":"system","content": final_system_prompt})
326
-
327
- # web search if needed
328
- tool_data_struct = None
329
- if intent == "internet_search":
330
- yield f"data: {json.dumps({'status': 'Searching the web...'})}\n\n"
331
- await asyncio.sleep(0)
332
- try:
333
- tool_data_struct = perform_web_search(last_user_msg_local)
334
- except Exception as e:
335
- logger.exception("Web search failed: %s", e)
336
- tool_data_struct = {"query": last_user_msg_local, "results": []}
337
-
338
- if tool_data_struct:
339
- web_block = "### WEB_DATA (from live search) ###\n"
340
- items = tool_data_struct.get("results", [])
341
- if items:
342
- lines = []
343
- for idx, it in enumerate(items, start=1):
344
- title = it.get("title","(no title)").strip()
345
- snippet = it.get("snippet","").replace("\n"," ").strip()
346
- url = it.get("url","")
347
- lines.append(f"{idx}. {title}\n {snippet}\n SOURCE: {url}")
348
- web_block += "\n".join(lines)
349
- web_block += "\n---\nINSTRUCTION: Use the WEB_DATA above to answer; cite relevant source numbers inline."
350
- else:
351
- web_block += "No results found."
352
- messages_local.insert(1, {"role":"assistant","content": web_block})
353
-
354
- if tokenizer is None or model is None:
355
- err = "Model not loaded. Check server logs."
356
- payload = json.dumps({"choices":[{"delta":{"content": err}}]})
357
- yield f"data: {payload}\n\n"
358
- yield "data: [DONE]\n\n"
359
- return None, {"error":"model_not_loaded"}
360
-
361
- try:
362
- if hasattr(tokenizer, "apply_chat_template"):
363
- text_prompt_local = tokenizer.apply_chat_template(messages_local, tokenize=False, add_generation_prompt=True)
364
- else:
365
- text_prompt_local = _build_prompt_from_messages(messages_local)
366
- except Exception:
367
- text_prompt_local = _build_prompt_from_messages(messages_local)
368
-
369
- # ---------- GENERATION STAGE ----------
370
- max_attempts_local = 2
371
- attempts_local = 0
372
- last_meta_local = {}
373
- generated_text_local = ""
374
- cleaned_local = ""
375
- while attempts_local < max_attempts_local:
376
- attempts_local += 1
377
- # Emit explicit generating label (after planning completed)
378
- yield f"data: {json.dumps({'status': f'Generating LLM ({attempts_local})...'})}\n\n"
379
- await asyncio.sleep(0.06)
380
-
381
- model_inputs_local = tokenizer(text_prompt_local, return_tensors="pt", truncation=True, max_length=4096).to(next(model.parameters()).device)
382
-
383
- def sync_generate_local():
384
- return model.generate(
385
- **model_inputs_local,
386
- max_new_tokens=max_tokens,
387
- temperature=temperature,
388
- do_sample=True,
389
- top_k=50,
390
- top_p=0.92,
391
- repetition_penalty=1.08
392
- )
393
- try:
394
- generated_ids_local = await asyncio.to_thread(sync_generate_local)
395
- except RuntimeError as e:
396
- logger.exception("Generation failed (possible OOM): %s", e)
397
- err_payload = json.dumps({"choices":[{"delta":{"content": "Model generation failed due to resource limits."}}]})
398
- yield f"data: {err_payload}\n\n"
399
- yield "data: [DONE]\n\n"
400
- return None, {"error":"generation_failed"}
401
-
402
- input_len_local = model_inputs_local["input_ids"].shape[1]
403
- new_tokens_local = generated_ids_local[0][input_len_local:]
404
- raw_response_local = tokenizer.decode(new_tokens_local, skip_special_tokens=True).strip()
405
- cleaned_local = safe_replace_providers(raw_response_local)
406
-
407
- forbidden = ["I am a human","I have a physical body","I am alive"]
408
- for fc in forbidden:
409
- if fc.lower() in cleaned_local.lower():
410
- cleaned_local = re.sub(re.escape(fc), "I am an AI — expressive and interactive.", cleaned_local, flags=re.IGNORECASE)
411
-
412
- plan_label_local, cleaned_body_local = extract_and_sanitize_plan(cleaned_local, max_plan_chars=240)
413
- wc_local = word_count(cleaned_body_local)
414
- last_meta_local = {"attempt": attempts_local, "word_count": wc_local, "raw_len": len(cleaned_body_local)}
415
-
416
- if wc_local >= min_words or attempts_local >= max_attempts_local or plan_req["strictness"] == 0:
417
- generated_text_local = cleaned_body_local
418
- if plan_label_local:
419
- generated_text_local = plan_label_local + "\n\n" + generated_text_local
420
- break
421
- else:
422
- expand_note_local = f"\n\nEXPAND: The user's request needs ~{min_words} words. Expand previous answer (concise style) and avoid chain-of-thought."
423
- if messages_local and messages_local[0].get("role") == "system":
424
- messages_local[0]["content"] = messages_local[0]["content"] + "\n" + expand_note_local
425
- else:
426
- messages_local.insert(0, {"role":"system","content": expand_note_local})
427
- temperature = min(temperature + 0.07, 0.98)
428
- try:
429
- if hasattr(tokenizer, "apply_chat_template"):
430
- text_prompt_local = tokenizer.apply_chat_template(messages_local, tokenize=False, add_generation_prompt=True)
431
- else:
432
- text_prompt_local = _build_prompt_from_messages(messages_local)
433
- except Exception:
434
- text_prompt_local = _build_prompt_from_messages(messages_local)
435
- await asyncio.sleep(0.02)
436
- continue
437
 
438
- if not generated_text_local:
439
- plan_label_local, cleaned_body_local = extract_and_sanitize_plan(cleaned_local, max_plan_chars=240)
440
- generated_text_local = (plan_label_local + "\n\n" if plan_label_local else "") + (cleaned_body_local or cleaned_local)
441
-
442
- generated_text_local = re.sub(r"\bPlan\s*:\s*$", "", generated_text_local, flags=re.IGNORECASE).strip()
443
- generated_text_local = generated_text_local.replace("I can help with that.", "I can help with that — let me explain. 🙂")
444
-
445
- meta_local = {"generation_attempts": attempts_local, "last_attempt_meta": last_meta_local, "route": "planning", "complexity_score": flow_context_local.get("complexity_score")}
446
- return generated_text_local, meta_local
447
-
448
- # If direct route -> take fast-path (skip heavy planning UI status) but perform quality check
449
  if route == "direct":
450
- # provide explicit SSE status with route info
451
- yield f"data: {json.dumps({'status': 'Routing: direct (fast-path) - Generating...'})}\n\n"
452
- # Build a compact system prompt to keep responses concise
453
  base_system_instruction = (
454
  "You are Nexari G1, an expressive and helpful AI created by Piyush.\n"
455
- "Respond concisely and directly for short/simple user requests. "
456
- "Avoid chain-of-thought. Keep reply helpful and to the point."
457
  )
458
-
459
- # Minimal strategy/time insertion to avoid heavy planning
460
  time_data = get_time_context()
461
- # Keep a concise strategy
462
- strategy_data = get_thinking_strategy(is_complex=False, detail=False, min_words_hint=12)
463
 
464
- final_system_prompt = f"{base_system_instruction}\n{time_data}\n{strategy_data}"
465
- # ensure system message is present
466
  if messages and messages[0].get("role") == "system":
467
  messages[0]["content"] = final_system_prompt
468
  else:
469
  messages.insert(0, {"role":"system","content": final_system_prompt})
470
 
471
- # For direct route we use only 1 attempt to minimize latency
472
- max_attempts = 1
 
473
  tool_data_struct = None
474
  if intent == "internet_search":
475
  yield f"data: {json.dumps({'status': 'Searching the web...'})}\n\n"
@@ -496,7 +294,6 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
496
  web_block += "No results found."
497
  messages.insert(1, {"role":"assistant","content": web_block})
498
 
499
- # Proceed to generation stage (fast path)
500
  if tokenizer is None or model is None:
501
  err = "Model not loaded. Check server logs."
502
  payload = json.dumps({"choices":[{"delta":{"content": err}}]})
@@ -504,6 +301,7 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
504
  yield "data: [DONE]\n\n"
505
  return
506
 
 
507
  try:
508
  if hasattr(tokenizer, "apply_chat_template"):
509
  text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
@@ -515,11 +313,11 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
515
  attempts = 0
516
  generated_text = ""
517
  last_meta = {}
518
- cleaned = ""
519
  while attempts < max_attempts:
520
  attempts += 1
521
- yield f"data: {json.dumps({'status': f'Generating LLM (fast-path) ({attempts})...'})}\n\n"
522
  await asyncio.sleep(0.04)
 
523
  model_inputs = tokenizer(text_prompt, return_tensors="pt", truncation=True, max_length=4096).to(next(model.parameters()).device)
524
 
525
  def sync_generate():
@@ -554,132 +352,58 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
554
  plan_label, cleaned_body = extract_and_sanitize_plan(cleaned, max_plan_chars=240)
555
  wc = word_count(cleaned_body)
556
  last_meta = {"attempt": attempts, "word_count": wc, "raw_len": len(cleaned_body)}
557
- generated_text = cleaned_body
558
- if plan_label:
559
- generated_text = plan_label + "\n\n" + generated_text
560
- # end fast attempt
561
-
562
- # Quality check for fast-path output
563
- # Use min_words_hint derived from simple vibe/context
564
- vibe_block_fast = get_smart_context(last_user_msg)
565
- min_words_hint_fast = plan_response_requirements(messages, last_user_msg, flow_context, vibe_block_fast)["min_words"]
566
- is_low_quality, debug_info = _is_low_quality_text(generated_text, min_words_hint_fast)
567
- logger.info("Fast-path quality check: low=%s debug=%s", is_low_quality, debug_info)
568
-
569
- if not is_low_quality:
570
- payload = json.dumps({
571
- "choices":[{"delta":{"content": generated_text}}],
572
- "generation_attempts": attempts,
573
- "last_attempt_meta": last_meta,
574
- "route": "direct",
575
- "complexity_score": complexity_score,
576
- "quality_debug": debug_info
577
- })
578
- yield f"data: {payload}\n\n"
579
- yield "data: [DONE]\n\n"
580
- return
581
- else:
582
- # fallback to planning route once
583
- fallback_once = True
584
- yield f"data: {json.dumps({'status': 'Fast result low-quality; falling back to planner...'})}\n\n"
585
- await asyncio.sleep(0.05)
586
- # prepare a new messages copy to avoid polluting original (remove prior system if present)
587
- messages_for_planning = [m.copy() for m in messages if m.get("role") != "system"]
588
- # re-insert the user's last message and preserved assistant web block if any
589
- # Insert original earlier context (we'll reconstruct system prompt in planning function)
590
- # Note: flow_context remains the same
591
- # Call planning route generator
592
- planning_gen = run_planning_route(messages_for_planning, flow_context, last_user_msg)
593
- # run the async generator and return its results
594
- # planning_gen is an async generator; iterate and yield any interim SSE from it
595
- planning_result_text = None
596
- planning_result_meta = None
597
- async for item in planning_gen:
598
- # item can be SSE strings from run_planning_route; yield them through
599
- yield item
600
- # run_planning_route returns via its final return - but since it's a generator
601
- # we arranged it to yield statuses and then return result via yielded payload below.
602
- # To keep implementation simple, re-run a synchronous planning helper to get final result:
603
- generated_text_planning, meta_planning = await _run_planning_sync(messages_for_planning, flow_context, last_user_msg)
604
- if generated_text_planning is None:
605
- # planning failed; fallback to fast text (even if low quality)
606
- payload = json.dumps({
607
- "choices":[{"delta":{"content": generated_text}}],
608
- "generation_attempts": attempts,
609
- "last_attempt_meta": last_meta,
610
- "route": "direct_fallback_failed",
611
- "complexity_score": complexity_score,
612
- "quality_debug": debug_info
613
- })
614
- yield f"data: {payload}\n\n"
615
- yield "data: [DONE]\n\n"
616
- return
617
  else:
618
- payload = json.dumps({
619
- "choices":[{"delta":{"content": generated_text_planning}}],
620
- "generation_attempts": meta_planning.get("generation_attempts"),
621
- "last_attempt_meta": meta_planning.get("last_attempt_meta"),
622
- "route": "planning_after_fast_fallback",
623
- "complexity_score": complexity_score,
624
- "quality_debug": debug_info
625
- })
626
- yield f"data: {payload}\n\n"
627
- yield "data: [DONE]\n\n"
628
- return
629
 
630
- # If not direct, or fallback not triggered, go full planning route:
631
- # We'll call a synchronous helper to produce planning response content and meta.
632
- planning_result_text, planning_meta = await _run_planning_sync(messages, flow_context, last_user_msg)
633
- if planning_result_text is None:
634
- err_payload = json.dumps({"choices":[{"delta":{"content":"Internal error: planning generation failed"}}]})
635
- yield f"data: {err_payload}\n\n"
 
 
636
  yield "data: [DONE]\n\n"
637
  return
638
- payload = json.dumps({
639
- "choices":[{"delta":{"content": planning_result_text}}],
640
- "generation_attempts": planning_meta.get("generation_attempts"),
641
- "last_attempt_meta": planning_meta.get("last_attempt_meta"),
642
- "route": "planning",
643
- "complexity_score": complexity_score
644
- })
645
- yield f"data: {payload}\n\n"
646
- yield "data: [DONE]\n\n"
647
- return
648
 
649
- except asyncio.CancelledError:
650
- logger.warning("Streaming cancelled.")
651
- return
652
- except Exception as e:
653
- logger.exception(f"Generator error: {e}")
654
- err_payload = json.dumps({"choices":[{"delta":{"content": f"Internal error: {e}"}}]})
655
- try:
656
- yield f"data: {err_payload}\n\n"
657
- yield "data: [DONE]\n\n"
658
- except Exception:
659
- return
660
 
661
- # -------------------------
662
- # Synchronous planning wrapper used by the async flow to avoid duplicating code.
663
- # We keep it as an async function that executes the planning generation synchronously
664
- # to return final text+meta for simplified control flow.
665
- # -------------------------
666
- async def _run_planning_sync(messages_local, flow_context_local, last_user_msg_local):
667
- """
668
- Runs the planning generator synchronously and returns (generated_text, meta_dict)
669
- This re-uses the planning logic but in a simpler callable form (non-streaming).
670
- """
671
- try:
672
- vibe_block = get_smart_context(last_user_msg_local)
673
- plan_req = plan_response_requirements(messages_local, last_user_msg_local, flow_context_local, vibe_block)
674
  min_words = plan_req["min_words"]
675
  strictness = plan_req["strictness"]
676
 
677
- # adjust tokens/temperature if strict
678
- temp_local = temperature
679
- max_tok_local = max_tokens
680
  if strictness:
681
- temp_local = min(temp_local + 0.05, 0.95)
682
- max_tok_local = max(max_tok_local, min_words // 2 + 120)
683
 
684
  strategy_data = get_thinking_strategy(is_complex=(intent=="coding_request" or min_words>50), detail=(min_words>50), min_words_hint=min_words)
685
  time_data = get_time_context()
@@ -694,27 +418,29 @@ async def _run_planning_sync(messages_local, flow_context_local, last_user_msg_l
694
  )
695
 
696
  flow_desc = ""
697
- if flow_context_local:
698
- label = flow_context_local.get("flow_label","unknown")
699
- conf = round(float(flow_context_local.get("confidence", 0.0)), 2)
700
- expl = flow_context_local.get("explanation", "")
701
  flow_desc = f"\n[FLOW] Detected: {label} (confidence {conf}). {expl}\n"
702
 
703
  final_system_prompt = f"{base_system_instruction}\n{flow_desc}\n{vibe_block}\n{time_data}\n{strategy_data}"
704
 
705
- if messages_local and messages_local[0].get("role") == "system":
706
- messages_local[0]["content"] = final_system_prompt
707
  else:
708
- messages_local.insert(0, {"role":"system","content": final_system_prompt})
709
 
710
  # web search if needed
711
  tool_data_struct = None
712
  if intent == "internet_search":
 
 
713
  try:
714
- tool_data_struct = perform_web_search(last_user_msg_local)
715
  except Exception as e:
716
  logger.exception("Web search failed: %s", e)
717
- tool_data_struct = {"query": last_user_msg_local, "results": []}
718
 
719
  if tool_data_struct:
720
  web_block = "### WEB_DATA (from live search) ###\n"
@@ -730,94 +456,119 @@ async def _run_planning_sync(messages_local, flow_context_local, last_user_msg_l
730
  web_block += "\n---\nINSTRUCTION: Use the WEB_DATA above to answer; cite relevant source numbers inline."
731
  else:
732
  web_block += "No results found."
733
- messages_local.insert(1, {"role":"assistant","content": web_block})
734
 
735
  if tokenizer is None or model is None:
736
- return None, {"error":"model_not_loaded"}
737
 
738
  try:
739
  if hasattr(tokenizer, "apply_chat_template"):
740
- text_prompt_local = tokenizer.apply_chat_template(messages_local, tokenize=False, add_generation_prompt=True)
741
  else:
742
- text_prompt_local = _build_prompt_from_messages(messages_local)
743
  except Exception:
744
- text_prompt_local = _build_prompt_from_messages(messages_local)
745
-
746
- # generation (planning)
747
- max_attempts_local = 2
748
- attempts_local = 0
749
- last_meta_local = {}
750
- generated_text_local = ""
751
- cleaned_local = ""
752
- while attempts_local < max_attempts_local:
753
- attempts_local += 1
754
- model_inputs_local = tokenizer(text_prompt_local, return_tensors="pt", truncation=True, max_length=4096).to(next(model.parameters()).device)
755
-
756
- def sync_generate_local():
 
 
757
  return model.generate(
758
- **model_inputs_local,
759
- max_new_tokens=max_tok_local,
760
- temperature=temp_local,
761
  do_sample=True,
762
  top_k=50,
763
  top_p=0.92,
764
  repetition_penalty=1.08
765
  )
766
  try:
767
- generated_ids_local = await asyncio.to_thread(sync_generate_local)
768
  except RuntimeError as e:
769
  logger.exception("Generation failed (possible OOM): %s", e)
770
- return None, {"error":"generation_failed"}
771
 
772
- input_len_local = model_inputs_local["input_ids"].shape[1]
773
- new_tokens_local = generated_ids_local[0][input_len_local:]
774
- raw_response_local = tokenizer.decode(new_tokens_local, skip_special_tokens=True).strip()
775
- cleaned_local = safe_replace_providers(raw_response_local)
776
 
777
  forbidden = ["I am a human","I have a physical body","I am alive"]
778
  for fc in forbidden:
779
- if fc.lower() in cleaned_local.lower():
780
- cleaned_local = re.sub(re.escape(fc), "I am an AI — expressive and interactive.", cleaned_local, flags=re.IGNORECASE)
781
 
782
- plan_label_local, cleaned_body_local = extract_and_sanitize_plan(cleaned_local, max_plan_chars=240)
783
- wc_local = word_count(cleaned_body_local)
784
- last_meta_local = {"attempt": attempts_local, "word_count": wc_local, "raw_len": len(cleaned_body_local)}
785
 
786
- if wc_local >= min_words or attempts_local >= max_attempts_local or plan_req["strictness"] == 0:
787
- generated_text_local = cleaned_body_local
788
- if plan_label_local:
789
- generated_text_local = plan_label_local + "\n\n" + generated_text_local
790
  break
791
  else:
792
- expand_note_local = f"\n\nEXPAND: The user's request needs ~{min_words} words. Expand previous answer (concise style) and avoid chain-of-thought."
793
- if messages_local and messages_local[0].get("role") == "system":
794
- messages_local[0]["content"] = messages_local[0]["content"] + "\n" + expand_note_local
795
  else:
796
- messages_local.insert(0, {"role":"system","content": expand_note_local})
797
- temp_local = min(temp_local + 0.07, 0.98)
798
  try:
799
  if hasattr(tokenizer, "apply_chat_template"):
800
- text_prompt_local = tokenizer.apply_chat_template(messages_local, tokenize=False, add_generation_prompt=True)
801
  else:
802
- text_prompt_local = _build_prompt_from_messages(messages_local)
803
  except Exception:
804
- text_prompt_local = _build_prompt_from_messages(messages_local)
805
  await asyncio.sleep(0.02)
806
  continue
807
 
808
- if not generated_text_local:
809
- plan_label_local, cleaned_body_local = extract_and_sanitize_plan(cleaned_local, max_plan_chars=240)
810
- generated_text_local = (plan_label_local + "\n\n" if plan_label_local else "") + (cleaned_body_local or cleaned_local)
811
 
812
- generated_text_local = re.sub(r"\bPlan\s*:\s*$", "", generated_text_local, flags=re.IGNORECASE).strip()
813
- generated_text_local = generated_text_local.replace("I can help with that.", "I can help with that — let me explain. 🙂")
814
 
815
- meta_local = {"generation_attempts": attempts_local, "last_attempt_meta": last_meta_local}
816
- return generated_text_local, meta_local
817
 
818
  except Exception as e:
819
- logger.exception("Planning sync error: %s", e)
820
- return None, {"error":"planning_exception"}
821
 
822
  # -------------------------
823
  # Endpoints
@@ -864,4 +615,4 @@ except Exception as e:
864
 
865
  if __name__ == "__main__":
866
  import uvicorn
867
- uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))
 
1
+ # app.py - UPDATED (adaptive direct-route for quality + planning-route unchanged)
 
 
2
  import re
3
  import json
4
  import asyncio
 
184
  return plan_label, cleaned_body
185
  return None, text
186
 
187
+ def _is_low_quality(reply: str) -> bool:
188
+ if not reply or not reply.strip():
189
+ return True
190
+ low_phrases = ["i can help with that", "i'm here to help", "let me know", "i don't have"]
191
+ reply_l = reply.lower()
192
+ if any(phrase in reply_l for phrase in low_phrases):
193
+ # if reply only contains such phrase or is very short -> low quality
194
+ if word_count(reply) < 8:
195
+ return True
196
+ return False
197
 
198
  # -------------------------
199
+ # Streaming generator with adaptive direct-route quality checks:
200
  # -------------------------
201
  async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600, temperature=0.85):
202
  try:
 
228
  logger.exception("Flow analysis failed: %s", e)
229
  flow_context = {}
230
 
231
+ # compute vibe + plan requirements BEFORE routing so direct route knows min_words
232
+ try:
233
+ vibe_block = get_smart_context(last_user_msg)
234
+ except Exception:
235
+ vibe_block = ""
236
+ plan_req = plan_response_requirements(messages, last_user_msg, flow_context, vibe_block)
237
+ min_words = plan_req.get("min_words", 30)
238
+
239
  # Log route decision
240
  route = flow_context.get("route", "planning")
241
+ complexity_score = float(flow_context.get("complexity_score", 0.0) or 0.0)
242
+ logger.info("Flow route: %s (score=%s) min_words=%s", route, complexity_score, min_words)
243
 
244
+ # ---------- DIRECT / FAST PATH (adaptive) ----------
245
  if route == "direct":
246
+ yield f"data: {json.dumps({'status': 'Routing: direct (fast-path) - generating (adaptive)...'})}\n\n"
247
+ # Compose a compact system prompt but include vibe/time/strategy hints
 
248
  base_system_instruction = (
249
  "You are Nexari G1, an expressive and helpful AI created by Piyush.\n"
250
+ "For short/simple user requests, prefer concise, accurate responses. Avoid chain-of-thought. "
251
+ "If user seems to expect a longer answer, expand within the allowed min_words guidance."
252
  )
 
 
253
  time_data = get_time_context()
254
+ # prefer lower randomness but allow expansion on retry if needed
255
+ strategy_data = get_thinking_strategy(is_complex=False, detail=False, min_words_hint=min_words)
256
 
257
+ final_system_prompt = f"{base_system_instruction}\n{vibe_block}\n{time_data}\n{strategy_data}"
 
258
  if messages and messages[0].get("role") == "system":
259
  messages[0]["content"] = final_system_prompt
260
  else:
261
  messages.insert(0, {"role":"system","content": final_system_prompt})
262
 
263
+ # Decide attempts adaptively: if complexity_score extremely low -> 1 attempt; otherwise allow 2
264
+ max_attempts = 1 if complexity_score <= 0.12 else 2
265
+
266
+ # Slightly reduce temperature for direct replies to increase stability
267
+ orig_temperature = temperature
268
+ temperature = min(temperature, 0.72)
269
+
270
+ # Web search still allowed if intent asks
271
  tool_data_struct = None
272
  if intent == "internet_search":
273
  yield f"data: {json.dumps({'status': 'Searching the web...'})}\n\n"
 
294
  web_block += "No results found."
295
  messages.insert(1, {"role":"assistant","content": web_block})
296
 
 
297
  if tokenizer is None or model is None:
298
  err = "Model not loaded. Check server logs."
299
  payload = json.dumps({"choices":[{"delta":{"content": err}}]})
 
301
  yield "data: [DONE]\n\n"
302
  return
303
 
304
+ # prepare prompt
305
  try:
306
  if hasattr(tokenizer, "apply_chat_template"):
307
  text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
313
  attempts = 0
314
  generated_text = ""
315
  last_meta = {}
 
316
  while attempts < max_attempts:
317
  attempts += 1
318
+ yield f"data: {json.dumps({'status': f'Generating LLM (direct) attempt {attempts}...'})}\n\n"
319
  await asyncio.sleep(0.04)
320
+
321
  model_inputs = tokenizer(text_prompt, return_tensors="pt", truncation=True, max_length=4096).to(next(model.parameters()).device)
322
 
323
  def sync_generate():
 
352
  plan_label, cleaned_body = extract_and_sanitize_plan(cleaned, max_plan_chars=240)
353
  wc = word_count(cleaned_body)
354
  last_meta = {"attempt": attempts, "word_count": wc, "raw_len": len(cleaned_body)}
355
+
356
+ # quality checks: length vs min_words and generic-lowness
357
+ low_quality = _is_low_quality(cleaned_body)
358
+ needs_expand = (wc < min_words) or low_quality
359
+
360
+ if not needs_expand or attempts >= max_attempts:
361
+ generated_text = cleaned_body
362
+ if plan_label:
363
+ generated_text = plan_label + "\n\n" + generated_text
364
+ break
365
  else:
366
+ # Prepare a concise expansion note and increase temperature a bit to allow more content
367
+ expand_note = f"\n\nEXPAND: The user's request expects around {min_words} words. Provide a fuller helpful answer without chain-of-thought. Keep it structured and concise."
368
+ if messages and messages[0].get("role") == "system":
369
+ messages[0]["content"] = messages[0]["content"] + "\n" + expand_note
370
+ else:
371
+ messages.insert(0, {"role":"system","content": expand_note})
372
+ # increase temperature to encourage more content on retry, but cap
373
+ temperature = min(orig_temperature + 0.08, 0.95)
374
+ # rebuild prompt for retry
375
+ try:
376
+ if hasattr(tokenizer, "apply_chat_template"):
377
+ text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
378
+ else:
379
+ text_prompt = _build_prompt_from_messages(messages)
380
+ except Exception:
381
+ text_prompt = _build_prompt_from_messages(messages)
382
+ await asyncio.sleep(0.02)
383
+ continue
384
 
385
+ payload = json.dumps({
386
+ "choices":[{"delta":{"content": generated_text}}],
387
+ "generation_attempts": attempts,
388
+ "last_attempt_meta": last_meta,
389
+ "route": "direct",
390
+ "complexity_score": complexity_score
391
+ })
392
+ yield f"data: {payload}\n\n"
393
  yield "data: [DONE]\n\n"
394
  return
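Illustratively, a client consuming this stream on the direct route would receive status frames, then a single content frame, then the terminator, roughly like the following (values are made up; the keys mirror the payload built above):

    data: {"status": "Routing: direct (fast-path) - generating (adaptive)..."}
    data: {"status": "Generating LLM (direct) attempt 1..."}
    data: {"choices": [{"delta": {"content": "..."}}], "generation_attempts": 1, "last_attempt_meta": {"attempt": 1, "word_count": 42, "raw_len": 230}, "route": "direct", "complexity_score": 0.08}
    data: [DONE]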
395
 
396
+ # ---------- PLANNING ROUTE (complex) ----------
397
+ yield f"data: {json.dumps({'status': 'Reasoning (planner)...'})}\n\n"
398
+ await asyncio.sleep(0.15)
399
 
400
+ # planning work (vibe_block and plan_req already computed)
401
  min_words = plan_req["min_words"]
402
  strictness = plan_req["strictness"]
403
 
404
  if strictness:
405
+ temperature = min(temperature + 0.05, 0.95)
406
+ max_tokens = max(max_tokens, min_words // 2 + 120)
407
 
408
  strategy_data = get_thinking_strategy(is_complex=(intent=="coding_request" or min_words>50), detail=(min_words>50), min_words_hint=min_words)
409
  time_data = get_time_context()
 
418
  )
419
 
420
  flow_desc = ""
421
+ if flow_context:
422
+ label = flow_context.get("flow_label","unknown")
423
+ conf = round(float(flow_context.get("confidence", 0.0)), 2)
424
+ expl = flow_context.get("explanation", "")
425
  flow_desc = f"\n[FLOW] Detected: {label} (confidence {conf}). {expl}\n"
426
 
427
  final_system_prompt = f"{base_system_instruction}\n{flow_desc}\n{vibe_block}\n{time_data}\n{strategy_data}"
428
 
429
+ if messages and messages[0].get("role") == "system":
430
+ messages[0]["content"] = final_system_prompt
431
  else:
432
+ messages.insert(0, {"role":"system","content": final_system_prompt})
433
 
434
  # web search if needed
435
  tool_data_struct = None
436
  if intent == "internet_search":
437
+ yield f"data: {json.dumps({'status': 'Searching the web...'})}\n\n"
438
+ await asyncio.sleep(0)
439
  try:
440
+ tool_data_struct = perform_web_search(last_user_msg)
441
  except Exception as e:
442
  logger.exception("Web search failed: %s", e)
443
+ tool_data_struct = {"query": last_user_msg, "results": []}
444
 
445
  if tool_data_struct:
446
  web_block = "### WEB_DATA (from live search) ###\n"
 
456
  web_block += "\n---\nINSTRUCTION: Use the WEB_DATA above to answer; cite relevant source numbers inline."
457
  else:
458
  web_block += "No results found."
459
+ messages.insert(1, {"role":"assistant","content": web_block})
460
 
461
  if tokenizer is None or model is None:
462
+ err = "Model not loaded. Check server logs."
463
+ payload = json.dumps({"choices":[{"delta":{"content": err}}]})
464
+ yield f"data: {payload}\n\n"
465
+ yield "data: [DONE]\n\n"
466
+ return
467
 
468
  try:
469
  if hasattr(tokenizer, "apply_chat_template"):
470
+ text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
471
  else:
472
+ text_prompt = _build_prompt_from_messages(messages)
473
  except Exception:
474
+ text_prompt = _build_prompt_from_messages(messages)
475
+
476
+ # ---------- GENERATION STAGE ----------
477
+ max_attempts = 2
478
+ attempts = 0
479
+ last_meta = {}
480
+ generated_text = ""
481
+ while attempts < max_attempts:
482
+ attempts += 1
483
+ yield f"data: {json.dumps({'status': f'Generating LLM ({attempts})...'})}\n\n"
484
+ await asyncio.sleep(0.06)
485
+
486
+ model_inputs = tokenizer(text_prompt, return_tensors="pt", truncation=True, max_length=4096).to(next(model.parameters()).device)
487
+
488
+ def sync_generate():
489
  return model.generate(
490
+ **model_inputs,
491
+ max_new_tokens=max_tokens,
492
+ temperature=temperature,
493
  do_sample=True,
494
  top_k=50,
495
  top_p=0.92,
496
  repetition_penalty=1.08
497
  )
498
  try:
499
+ generated_ids = await asyncio.to_thread(sync_generate)
500
  except RuntimeError as e:
501
  logger.exception("Generation failed (possible OOM): %s", e)
502
+ err_payload = json.dumps({"choices":[{"delta":{"content": "Model generation failed due to resource limits."}}]})
503
+ yield f"data: {err_payload}\n\n"
504
+ yield "data: [DONE]\n\n"
505
+ return
506
 
507
+ input_len = model_inputs["input_ids"].shape[1]
508
+ new_tokens = generated_ids[0][input_len:]
509
+ raw_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
510
+ cleaned = safe_replace_providers(raw_response)
511
 
512
  forbidden = ["I am a human","I have a physical body","I am alive"]
513
  for fc in forbidden:
514
+ if fc.lower() in cleaned.lower():
515
+ cleaned = re.sub(re.escape(fc), "I am an AI — expressive and interactive.", cleaned, flags=re.IGNORECASE)
516
 
517
+ plan_label, cleaned_body = extract_and_sanitize_plan(cleaned, max_plan_chars=240)
518
+ wc = word_count(cleaned_body)
519
+ last_meta = {"attempt": attempts, "word_count": wc, "raw_len": len(cleaned_body)}
520
 
521
+ if wc >= min_words or attempts >= max_attempts or plan_req["strictness"] == 0:
522
+ generated_text = cleaned_body
523
+ if plan_label:
524
+ generated_text = plan_label + "\n\n" + generated_text
525
  break
526
  else:
527
+ expand_note = f"\n\nEXPAND: The user's request needs ~{min_words} words. Expand previous answer (concise style) and avoid chain-of-thought."
528
+ if messages and messages[0].get("role") == "system":
529
+ messages[0]["content"] = messages[0]["content"] + "\n" + expand_note
530
  else:
531
+ messages.insert(0, {"role":"system","content": expand_note})
532
+ temperature = min(temperature + 0.07, 0.98)
533
  try:
534
  if hasattr(tokenizer, "apply_chat_template"):
535
+ text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
536
  else:
537
+ text_prompt = _build_prompt_from_messages(messages)
538
  except Exception:
539
+ text_prompt = _build_prompt_from_messages(messages)
540
  await asyncio.sleep(0.02)
541
  continue
542
 
543
+ if not generated_text:
544
+ plan_label, cleaned_body = extract_and_sanitize_plan(cleaned, max_plan_chars=240)
545
+ generated_text = (plan_label + "\n\n" if plan_label else "") + (cleaned_body or cleaned)
546
 
547
+ generated_text = re.sub(r"\bPlan\s*:\s*$", "", generated_text, flags=re.IGNORECASE).strip()
548
+ generated_text = generated_text.replace("I can help with that.", "I can help with that — let me explain. 🙂")
549
 
550
+ payload = json.dumps({
551
+ "choices":[{"delta":{"content": generated_text}}],
552
+ "generation_attempts": attempts,
553
+ "last_attempt_meta": last_meta,
554
+ "route": route,
555
+ "complexity_score": complexity_score
556
+ })
557
+ yield f"data: {payload}\n\n"
558
+ yield "data: [DONE]\n\n"
559
+ return
560
 
561
+ except asyncio.CancelledError:
562
+ logger.warning("Streaming cancelled.")
563
+ return
564
  except Exception as e:
565
+ logger.exception(f"Generator error: {e}")
566
+ err_payload = json.dumps({"choices":[{"delta":{"content": f"Internal error: {e}"}}]})
567
+ try:
568
+ yield f"data: {err_payload}\n\n"
569
+ yield "data: [DONE]\n\n"
570
+ except Exception:
571
+ return
572
 
573
  # -------------------------
574
  # Endpoints
 
615
 
616
  if __name__ == "__main__":
617
  import uvicorn
618
+ uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))
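A minimal client sketch for consuming the SSE stream end to end (the actual endpoint path is defined in the Endpoints section not shown in this diff; the URL below is only a placeholder assumption):

    import json
    import requests  # assumes the requests package is available

    # Placeholder URL: substitute the route actually registered in app.py.
    url = "http://localhost:7860/v1/chat/completions"
    body = {"messages": [{"role": "user", "content": "Explain binary search briefly."}]}

    with requests.post(url, json=body, stream=True) as resp:
        for raw in resp.iter_lines(decode_unicode=True):
            if not raw or not raw.startswith("data: "):
                continue
            data = raw[len("data: "):]
            if data == "[DONE]":
                break
            chunk = json.loads(data)
            delta = (chunk.get("choices") or [{}])[0].get("delta", {})
            if delta.get("content"):
                print(delta["content"], end="", flush=True)
            elif "status" in chunk:
                print(f"[status] {chunk['status']}")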