Nexari-Research committed on
Commit cc35d21 · verified · 1 Parent(s): abad315

Update app.py

Files changed (1)
  1. app.py +172 -421
app.py CHANGED
@@ -1,6 +1,4 @@
1
- # app.py - UPDATED (smart hybrid: fast-path + quality-checked fallback to planning)
2
- # Behavior: use behavior_model.route to fast-path; after fast attempt run quality checks;
3
- # if quality low -> fallback to planning route automatically (one fallback only per request).
4
  import re
5
  import json
6
  import asyncio
@@ -186,58 +184,19 @@ def extract_and_sanitize_plan(text: str, max_plan_chars: int = 240) -> (str, str
186
  return plan_label, cleaned_body
187
  return None, text
188
 
189
- # -------------------------
190
- # Heuristic quality check for fast-path responses
191
- # -------------------------
192
- _LOW_QUALITY_PHRASES = [
193
- "i'm here to help", "is there something specific", "i can help with that", "let me know",
194
- "do you want", "what would you like", "please clarify", "sorry, i don't", "i don't have"
195
- ]
196
-
197
- def _is_low_quality_text(text: str, min_words_hint: int) -> (bool, Dict[str,Any]):
198
- """
199
- Returns (is_low_quality, debug_info)
200
- Heuristics:
201
- - If word count < min_words_hint => low quality
202
- - If response starts with generic short filler phrases => low quality
203
- - If too short (<6 words) => low quality
204
- - If contains many placeholders like 'I don't know' or 'sorry' => low quality
205
- """
206
- t = (text or "").strip()
207
- wc = word_count(t)
208
- lower = t.lower()
209
-
210
- reasons = []
211
- if wc < max(6, min_words_hint // 2):
212
- reasons.append(f"word_count_too_small ({wc} < {max(6, min_words_hint // 2)})")
213
- if wc < min_words_hint:
214
- # not strict failure for very small min_words_hint (like 6), but flagged
215
- reasons.append(f"below_min_hint ({wc} < {min_words_hint})")
216
- for ph in _LOW_QUALITY_PHRASES:
217
- if lower.startswith(ph):
218
- reasons.append(f"starts_with_generic_phrase ({ph})")
219
- break
220
- # placeholder detection
221
- placeholders = ["i don't know", "i'm not sure", "i do not know", "can't help", "unable to"]
222
- for ph in placeholders:
223
- if ph in lower:
224
- reasons.append(f"contains_placeholder ({ph})")
225
- break
226
-
227
- # if many short sentences like "Okay. Sure." count as low quality
228
- sent_count = len(re.findall(r"[.!?]+", t)) or 1
229
- if wc < 12 and sent_count >= 2:
230
- reasons.append("fragmented_short_sentences")
231
-
232
- is_low = len(reasons) > 0
233
- debug = {"word_count": wc, "min_words_hint": min_words_hint, "reasons": reasons}
234
- return is_low, debug
235
 
236
  # -------------------------
237
- # Streaming generator with smart fallback:
238
- # - Fast-path tries one attempt
239
- # - If quality low -> fallback to planning route
240
- # - Avoid infinite loops with fallback_once flag
241
  # -------------------------
242
  async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600, temperature=0.85):
243
  try:
@@ -269,207 +228,46 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
269
  logger.exception("Flow analysis failed: %s", e)
270
  flow_context = {}
271
 
272
  # Log route decision
273
  route = flow_context.get("route", "planning")
274
- complexity_score = flow_context.get("complexity_score", None)
275
- logger.info("Flow route: %s (score=%s)", route, complexity_score)
276
-
277
- # allow fallback once from direct -> planning
278
- fallback_once = False
279
-
280
- # Helper to run the planning route (reusable)
281
- async def run_planning_route(messages_local, flow_context_local, last_user_msg_local, min_tokens_hint=None):
282
- """
283
- Runs the full planning pipeline (same as previous planning branch).
284
- Returns generated_text and meta dict.
285
- """
286
- # Emit Reasoning indicator BEFORE heavy planning so UI shows it during planning
287
- yield f"data: {json.dumps({'status': 'Reasoning (planner)...'})}\n\n"
288
- await asyncio.sleep(0.12)
289
-
290
- vibe_block = get_smart_context(last_user_msg_local)
291
- plan_req = plan_response_requirements(messages_local, last_user_msg_local, flow_context_local, vibe_block)
292
- min_words = plan_req["min_words"]
293
- strictness = plan_req["strictness"]
294
-
295
- # adjust tokens/temperature if strict
296
- nonlocal temperature, max_tokens
297
- if strictness:
298
- temperature = min(temperature + 0.05, 0.95)
299
- max_tokens = max(max_tokens, min_words // 2 + 120)
300
-
301
- strategy_data = get_thinking_strategy(is_complex=(intent=="coding_request" or min_words>50), detail=(min_words>50), min_words_hint=min_words)
302
- time_data = get_time_context()
303
-
304
- base_system_instruction = (
305
- "### SYSTEM IDENTITY ###\n"
306
- "You are Nexari G1, an expressive and helpful AI created by Piyush.\n"
307
- "### RULES ###\n"
308
- "1) If WEB_DATA is provided, prioritize it and cite sources.\n"
309
- "2) Avoid chain-of-thought exposure. If requested to provide a short 'Plan', keep it concise (max 2 lines) and label it '🧠 Plan:'.\n"
310
- "3) Use natural phrasing; follow emoji & verbosity guidance below.\n"
311
- )
312
-
313
- flow_desc = ""
314
- if flow_context_local:
315
- label = flow_context_local.get("flow_label","unknown")
316
- conf = round(float(flow_context_local.get("confidence", 0.0)), 2)
317
- expl = flow_context_local.get("explanation", "")
318
- flow_desc = f"\n[FLOW] Detected: {label} (confidence {conf}). {expl}\n"
319
-
320
- final_system_prompt = f"{base_system_instruction}\n{flow_desc}\n{vibe_block}\n{time_data}\n{strategy_data}"
321
-
322
- if messages_local and messages_local[0].get("role") == "system":
323
- messages_local[0]["content"] = final_system_prompt
324
- else:
325
- messages_local.insert(0, {"role":"system","content": final_system_prompt})
326
-
327
- # web search if needed
328
- tool_data_struct = None
329
- if intent == "internet_search":
330
- yield f"data: {json.dumps({'status': 'Searching the web...'})}\n\n"
331
- await asyncio.sleep(0)
332
- try:
333
- tool_data_struct = perform_web_search(last_user_msg_local)
334
- except Exception as e:
335
- logger.exception("Web search failed: %s", e)
336
- tool_data_struct = {"query": last_user_msg_local, "results": []}
337
-
338
- if tool_data_struct:
339
- web_block = "### WEB_DATA (from live search) ###\n"
340
- items = tool_data_struct.get("results", [])
341
- if items:
342
- lines = []
343
- for idx, it in enumerate(items, start=1):
344
- title = it.get("title","(no title)").strip()
345
- snippet = it.get("snippet","").replace("\n"," ").strip()
346
- url = it.get("url","")
347
- lines.append(f"{idx}. {title}\n {snippet}\n SOURCE: {url}")
348
- web_block += "\n".join(lines)
349
- web_block += "\n---\nINSTRUCTION: Use the WEB_DATA above to answer; cite relevant source numbers inline."
350
- else:
351
- web_block += "No results found."
352
- messages_local.insert(1, {"role":"assistant","content": web_block})
353
-
354
- if tokenizer is None or model is None:
355
- err = "Model not loaded. Check server logs."
356
- payload = json.dumps({"choices":[{"delta":{"content": err}}]})
357
- yield f"data: {payload}\n\n"
358
- yield "data: [DONE]\n\n"
359
- return None, {"error":"model_not_loaded"}
360
-
361
- try:
362
- if hasattr(tokenizer, "apply_chat_template"):
363
- text_prompt_local = tokenizer.apply_chat_template(messages_local, tokenize=False, add_generation_prompt=True)
364
- else:
365
- text_prompt_local = _build_prompt_from_messages(messages_local)
366
- except Exception:
367
- text_prompt_local = _build_prompt_from_messages(messages_local)
368
-
369
- # ---------- GENERATION STAGE ----------
370
- max_attempts_local = 2
371
- attempts_local = 0
372
- last_meta_local = {}
373
- generated_text_local = ""
374
- cleaned_local = ""
375
- while attempts_local < max_attempts_local:
376
- attempts_local += 1
377
- # Emit explicit generating label (after planning completed)
378
- yield f"data: {json.dumps({'status': f'Generating LLM ({attempts_local})...'})}\n\n"
379
- await asyncio.sleep(0.06)
380
-
381
- model_inputs_local = tokenizer(text_prompt_local, return_tensors="pt", truncation=True, max_length=4096).to(next(model.parameters()).device)
382
-
383
- def sync_generate_local():
384
- return model.generate(
385
- **model_inputs_local,
386
- max_new_tokens=max_tokens,
387
- temperature=temperature,
388
- do_sample=True,
389
- top_k=50,
390
- top_p=0.92,
391
- repetition_penalty=1.08
392
- )
393
- try:
394
- generated_ids_local = await asyncio.to_thread(sync_generate_local)
395
- except RuntimeError as e:
396
- logger.exception("Generation failed (possible OOM): %s", e)
397
- err_payload = json.dumps({"choices":[{"delta":{"content": "Model generation failed due to resource limits."}}]})
398
- yield f"data: {err_payload}\n\n"
399
- yield "data: [DONE]\n\n"
400
- return None, {"error":"generation_failed"}
401
-
402
- input_len_local = model_inputs_local["input_ids"].shape[1]
403
- new_tokens_local = generated_ids_local[0][input_len_local:]
404
- raw_response_local = tokenizer.decode(new_tokens_local, skip_special_tokens=True).strip()
405
- cleaned_local = safe_replace_providers(raw_response_local)
406
-
407
- forbidden = ["I am a human","I have a physical body","I am alive"]
408
- for fc in forbidden:
409
- if fc.lower() in cleaned_local.lower():
410
- cleaned_local = re.sub(re.escape(fc), "I am an AI — expressive and interactive.", cleaned_local, flags=re.IGNORECASE)
411
-
412
- plan_label_local, cleaned_body_local = extract_and_sanitize_plan(cleaned_local, max_plan_chars=240)
413
- wc_local = word_count(cleaned_body_local)
414
- last_meta_local = {"attempt": attempts_local, "word_count": wc_local, "raw_len": len(cleaned_body_local)}
415
-
416
- if wc_local >= min_words or attempts_local >= max_attempts_local or plan_req["strictness"] == 0:
417
- generated_text_local = cleaned_body_local
418
- if plan_label_local:
419
- generated_text_local = plan_label_local + "\n\n" + generated_text_local
420
- break
421
- else:
422
- expand_note_local = f"\n\nEXPAND: The user's request needs ~{min_words} words. Expand previous answer (concise style) and avoid chain-of-thought."
423
- if messages_local and messages_local[0].get("role") == "system":
424
- messages_local[0]["content"] = messages_local[0]["content"] + "\n" + expand_note_local
425
- else:
426
- messages_local.insert(0, {"role":"system","content": expand_note_local})
427
- temperature = min(temperature + 0.07, 0.98)
428
- try:
429
- if hasattr(tokenizer, "apply_chat_template"):
430
- text_prompt_local = tokenizer.apply_chat_template(messages_local, tokenize=False, add_generation_prompt=True)
431
- else:
432
- text_prompt_local = _build_prompt_from_messages(messages_local)
433
- except Exception:
434
- text_prompt_local = _build_prompt_from_messages(messages_local)
435
- await asyncio.sleep(0.02)
436
- continue
437
 
438
- if not generated_text_local:
439
- plan_label_local, cleaned_body_local = extract_and_sanitize_plan(cleaned_local, max_plan_chars=240)
440
- generated_text_local = (plan_label_local + "\n\n" if plan_label_local else "") + (cleaned_body_local or cleaned_local)
441
-
442
- generated_text_local = re.sub(r"\bPlan\s*:\s*$", "", generated_text_local, flags=re.IGNORECASE).strip()
443
- generated_text_local = generated_text_local.replace("I can help with that.", "I can help with that — let me explain. 🙂")
444
-
445
- meta_local = {"generation_attempts": attempts_local, "last_attempt_meta": last_meta_local, "route": "planning", "complexity_score": flow_context_local.get("complexity_score")}
446
- return generated_text_local, meta_local
447
-
448
- # If direct route -> take fast-path (skip heavy planning UI status) but perform quality check
449
  if route == "direct":
450
- # provide explicit SSE status with route info
451
- yield f"data: {json.dumps({'status': 'Routing: direct (fast-path) - Generating...'})}\n\n"
452
- # Build a compact system prompt to keep responses concise
453
  base_system_instruction = (
454
  "You are Nexari G1, an expressive and helpful AI created by Piyush.\n"
455
- "Respond concisely and directly for short/simple user requests. "
456
- "Avoid chain-of-thought. Keep reply helpful and to the point."
457
  )
458
-
459
- # Minimal strategy/time insertion to avoid heavy planning
460
  time_data = get_time_context()
461
- # Keep a concise strategy
462
- strategy_data = get_thinking_strategy(is_complex=False, detail=False, min_words_hint=12)
463
 
464
- final_system_prompt = f"{base_system_instruction}\n{time_data}\n{strategy_data}"
465
- # ensure system message is present
466
  if messages and messages[0].get("role") == "system":
467
  messages[0]["content"] = final_system_prompt
468
  else:
469
  messages.insert(0, {"role":"system","content": final_system_prompt})
470
 
471
- # For direct route we use only 1 attempt to minimize latency
472
- max_attempts = 1
 
473
  tool_data_struct = None
474
  if intent == "internet_search":
475
  yield f"data: {json.dumps({'status': 'Searching the web...'})}\n\n"
@@ -496,7 +294,6 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
496
  web_block += "No results found."
497
  messages.insert(1, {"role":"assistant","content": web_block})
498
 
499
- # Proceed to generation stage (fast path)
500
  if tokenizer is None or model is None:
501
  err = "Model not loaded. Check server logs."
502
  payload = json.dumps({"choices":[{"delta":{"content": err}}]})
@@ -504,6 +301,7 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
504
  yield "data: [DONE]\n\n"
505
  return
506
 
 
507
  try:
508
  if hasattr(tokenizer, "apply_chat_template"):
509
  text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
@@ -515,11 +313,11 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
515
  attempts = 0
516
  generated_text = ""
517
  last_meta = {}
518
- cleaned = ""
519
  while attempts < max_attempts:
520
  attempts += 1
521
- yield f"data: {json.dumps({'status': f'Generating LLM (fast-path) ({attempts})...'})}\n\n"
522
  await asyncio.sleep(0.04)
 
523
  model_inputs = tokenizer(text_prompt, return_tensors="pt", truncation=True, max_length=4096).to(next(model.parameters()).device)
524
 
525
  def sync_generate():
@@ -554,132 +352,58 @@ async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600
554
  plan_label, cleaned_body = extract_and_sanitize_plan(cleaned, max_plan_chars=240)
555
  wc = word_count(cleaned_body)
556
  last_meta = {"attempt": attempts, "word_count": wc, "raw_len": len(cleaned_body)}
557
- generated_text = cleaned_body
558
- if plan_label:
559
- generated_text = plan_label + "\n\n" + generated_text
560
- # end fast attempt
561
-
562
- # Quality check for fast-path output
563
- # Use min_words_hint derived from simple vibe/context
564
- vibe_block_fast = get_smart_context(last_user_msg)
565
- min_words_hint_fast = plan_response_requirements(messages, last_user_msg, flow_context, vibe_block_fast)["min_words"]
566
- is_low_quality, debug_info = _is_low_quality_text(generated_text, min_words_hint_fast)
567
- logger.info("Fast-path quality check: low=%s debug=%s", is_low_quality, debug_info)
568
-
569
- if not is_low_quality:
570
- payload = json.dumps({
571
- "choices":[{"delta":{"content": generated_text}}],
572
- "generation_attempts": attempts,
573
- "last_attempt_meta": last_meta,
574
- "route": "direct",
575
- "complexity_score": complexity_score,
576
- "quality_debug": debug_info
577
- })
578
- yield f"data: {payload}\n\n"
579
- yield "data: [DONE]\n\n"
580
- return
581
- else:
582
- # fallback to planning route once
583
- fallback_once = True
584
- yield f"data: {json.dumps({'status': 'Fast result low-quality; falling back to planner...'})}\n\n"
585
- await asyncio.sleep(0.05)
586
- # prepare a new messages copy to avoid polluting original (remove prior system if present)
587
- messages_for_planning = [m.copy() for m in messages if m.get("role") != "system"]
588
- # re-insert the user's last message and preserved assistant web block if any
589
- # Insert original earlier context (we'll reconstruct system prompt in planning function)
590
- # Note: flow_context remains the same
591
- # Call planning route generator
592
- planning_gen = run_planning_route(messages_for_planning, flow_context, last_user_msg)
593
- # run the async generator and return its results
594
- # planning_gen is an async generator; iterate and yield any interim SSE from it
595
- planning_result_text = None
596
- planning_result_meta = None
597
- async for item in planning_gen:
598
- # item can be SSE strings from run_planning_route; yield them through
599
- yield item
600
- # run_planning_route returns via its final return - but since it's a generator
601
- # we arranged it to yield statuses and then return result via yielded payload below.
602
- # To keep implementation simple, re-run a synchronous planning helper to get final result:
603
- generated_text_planning, meta_planning = await _run_planning_sync(messages_for_planning, flow_context, last_user_msg)
604
- if generated_text_planning is None:
605
- # planning failed; fallback to fast text (even if low quality)
606
- payload = json.dumps({
607
- "choices":[{"delta":{"content": generated_text}}],
608
- "generation_attempts": attempts,
609
- "last_attempt_meta": last_meta,
610
- "route": "direct_fallback_failed",
611
- "complexity_score": complexity_score,
612
- "quality_debug": debug_info
613
- })
614
- yield f"data: {payload}\n\n"
615
- yield "data: [DONE]\n\n"
616
- return
617
  else:
618
- payload = json.dumps({
619
- "choices":[{"delta":{"content": generated_text_planning}}],
620
- "generation_attempts": meta_planning.get("generation_attempts"),
621
- "last_attempt_meta": meta_planning.get("last_attempt_meta"),
622
- "route": "planning_after_fast_fallback",
623
- "complexity_score": complexity_score,
624
- "quality_debug": debug_info
625
- })
626
- yield f"data: {payload}\n\n"
627
- yield "data: [DONE]\n\n"
628
- return
629
 
630
- # If not direct, or fallback not triggered, go full planning route:
631
- # We'll call a synchronous helper to produce planning response content and meta.
632
- planning_result_text, planning_meta = await _run_planning_sync(messages, flow_context, last_user_msg)
633
- if planning_result_text is None:
634
- err_payload = json.dumps({"choices":[{"delta":{"content":"Internal error: planning generation failed"}}]})
635
- yield f"data: {err_payload}\n\n"
 
 
636
  yield "data: [DONE]\n\n"
637
  return
638
- payload = json.dumps({
639
- "choices":[{"delta":{"content": planning_result_text}}],
640
- "generation_attempts": planning_meta.get("generation_attempts"),
641
- "last_attempt_meta": planning_meta.get("last_attempt_meta"),
642
- "route": "planning",
643
- "complexity_score": complexity_score
644
- })
645
- yield f"data: {payload}\n\n"
646
- yield "data: [DONE]\n\n"
647
- return
648
 
649
- except asyncio.CancelledError:
650
- logger.warning("Streaming cancelled.")
651
- return
652
- except Exception as e:
653
- logger.exception(f"Generator error: {e}")
654
- err_payload = json.dumps({"choices":[{"delta":{"content": f"Internal error: {e}"}}]})
655
- try:
656
- yield f"data: {err_payload}\n\n"
657
- yield "data: [DONE]\n\n"
658
- except Exception:
659
- return
660
 
661
- # -------------------------
662
- # Synchronous planning wrapper used by the async flow to avoid duplicating code.
663
- # We keep it as an async function that executes the planning generation synchronously
664
- # to return final text+meta for simplified control flow.
665
- # -------------------------
666
- async def _run_planning_sync(messages_local, flow_context_local, last_user_msg_local):
667
- """
668
- Runs the planning generator synchronously and returns (generated_text, meta_dict)
669
- This re-uses the planning logic but in a simpler callable form (non-streaming).
670
- """
671
- try:
672
- vibe_block = get_smart_context(last_user_msg_local)
673
- plan_req = plan_response_requirements(messages_local, last_user_msg_local, flow_context_local, vibe_block)
674
  min_words = plan_req["min_words"]
675
  strictness = plan_req["strictness"]
676
 
677
- # adjust tokens/temperature if strict
678
- temp_local = temperature
679
- max_tok_local = max_tokens
680
  if strictness:
681
- temp_local = min(temp_local + 0.05, 0.95)
682
- max_tok_local = max(max_tok_local, min_words // 2 + 120)
683
 
684
  strategy_data = get_thinking_strategy(is_complex=(intent=="coding_request" or min_words>50), detail=(min_words>50), min_words_hint=min_words)
685
  time_data = get_time_context()
@@ -694,27 +418,29 @@ async def _run_planning_sync(messages_local, flow_context_local, last_user_msg_l
694
  )
695
 
696
  flow_desc = ""
697
- if flow_context_local:
698
- label = flow_context_local.get("flow_label","unknown")
699
- conf = round(float(flow_context_local.get("confidence", 0.0)), 2)
700
- expl = flow_context_local.get("explanation", "")
701
  flow_desc = f"\n[FLOW] Detected: {label} (confidence {conf}). {expl}\n"
702
 
703
  final_system_prompt = f"{base_system_instruction}\n{flow_desc}\n{vibe_block}\n{time_data}\n{strategy_data}"
704
 
705
- if messages_local and messages_local[0].get("role") == "system":
706
- messages_local[0]["content"] = final_system_prompt
707
  else:
708
- messages_local.insert(0, {"role":"system","content": final_system_prompt})
709
 
710
  # web search if needed
711
  tool_data_struct = None
712
  if intent == "internet_search":
 
 
713
  try:
714
- tool_data_struct = perform_web_search(last_user_msg_local)
715
  except Exception as e:
716
  logger.exception("Web search failed: %s", e)
717
- tool_data_struct = {"query": last_user_msg_local, "results": []}
718
 
719
  if tool_data_struct:
720
  web_block = "### WEB_DATA (from live search) ###\n"
@@ -730,94 +456,119 @@ async def _run_planning_sync(messages_local, flow_context_local, last_user_msg_l
730
  web_block += "\n---\nINSTRUCTION: Use the WEB_DATA above to answer; cite relevant source numbers inline."
731
  else:
732
  web_block += "No results found."
733
- messages_local.insert(1, {"role":"assistant","content": web_block})
734
 
735
  if tokenizer is None or model is None:
736
- return None, {"error":"model_not_loaded"}
737
 
738
  try:
739
  if hasattr(tokenizer, "apply_chat_template"):
740
- text_prompt_local = tokenizer.apply_chat_template(messages_local, tokenize=False, add_generation_prompt=True)
741
  else:
742
- text_prompt_local = _build_prompt_from_messages(messages_local)
743
  except Exception:
744
- text_prompt_local = _build_prompt_from_messages(messages_local)
745
-
746
- # generation (planning)
747
- max_attempts_local = 2
748
- attempts_local = 0
749
- last_meta_local = {}
750
- generated_text_local = ""
751
- cleaned_local = ""
752
- while attempts_local < max_attempts_local:
753
- attempts_local += 1
754
- model_inputs_local = tokenizer(text_prompt_local, return_tensors="pt", truncation=True, max_length=4096).to(next(model.parameters()).device)
755
-
756
- def sync_generate_local():
 
 
757
  return model.generate(
758
- **model_inputs_local,
759
- max_new_tokens=max_tok_local,
760
- temperature=temp_local,
761
  do_sample=True,
762
  top_k=50,
763
  top_p=0.92,
764
  repetition_penalty=1.08
765
  )
766
  try:
767
- generated_ids_local = await asyncio.to_thread(sync_generate_local)
768
  except RuntimeError as e:
769
  logger.exception("Generation failed (possible OOM): %s", e)
770
- return None, {"error":"generation_failed"}
771
 
772
- input_len_local = model_inputs_local["input_ids"].shape[1]
773
- new_tokens_local = generated_ids_local[0][input_len_local:]
774
- raw_response_local = tokenizer.decode(new_tokens_local, skip_special_tokens=True).strip()
775
- cleaned_local = safe_replace_providers(raw_response_local)
776
 
777
  forbidden = ["I am a human","I have a physical body","I am alive"]
778
  for fc in forbidden:
779
- if fc.lower() in cleaned_local.lower():
780
- cleaned_local = re.sub(re.escape(fc), "I am an AI — expressive and interactive.", cleaned_local, flags=re.IGNORECASE)
781
 
782
- plan_label_local, cleaned_body_local = extract_and_sanitize_plan(cleaned_local, max_plan_chars=240)
783
- wc_local = word_count(cleaned_body_local)
784
- last_meta_local = {"attempt": attempts_local, "word_count": wc_local, "raw_len": len(cleaned_body_local)}
785
 
786
- if wc_local >= min_words or attempts_local >= max_attempts_local or plan_req["strictness"] == 0:
787
- generated_text_local = cleaned_body_local
788
- if plan_label_local:
789
- generated_text_local = plan_label_local + "\n\n" + generated_text_local
790
  break
791
  else:
792
- expand_note_local = f"\n\nEXPAND: The user's request needs ~{min_words} words. Expand previous answer (concise style) and avoid chain-of-thought."
793
- if messages_local and messages_local[0].get("role") == "system":
794
- messages_local[0]["content"] = messages_local[0]["content"] + "\n" + expand_note_local
795
  else:
796
- messages_local.insert(0, {"role":"system","content": expand_note_local})
797
- temp_local = min(temp_local + 0.07, 0.98)
798
  try:
799
  if hasattr(tokenizer, "apply_chat_template"):
800
- text_prompt_local = tokenizer.apply_chat_template(messages_local, tokenize=False, add_generation_prompt=True)
801
  else:
802
- text_prompt_local = _build_prompt_from_messages(messages_local)
803
  except Exception:
804
- text_prompt_local = _build_prompt_from_messages(messages_local)
805
  await asyncio.sleep(0.02)
806
  continue
807
 
808
- if not generated_text_local:
809
- plan_label_local, cleaned_body_local = extract_and_sanitize_plan(cleaned_local, max_plan_chars=240)
810
- generated_text_local = (plan_label_local + "\n\n" if plan_label_local else "") + (cleaned_body_local or cleaned_local)
811
 
812
- generated_text_local = re.sub(r"\bPlan\s*:\s*$", "", generated_text_local, flags=re.IGNORECASE).strip()
813
- generated_text_local = generated_text_local.replace("I can help with that.", "I can help with that — let me explain. 🙂")
814
 
815
- meta_local = {"generation_attempts": attempts_local, "last_attempt_meta": last_meta_local}
816
- return generated_text_local, meta_local
817
 
818
  except Exception as e:
819
- logger.exception("Planning sync error: %s", e)
820
- return None, {"error":"planning_exception"}
821
 
822
  # -------------------------
823
  # Endpoints
@@ -864,4 +615,4 @@ except Exception as e:
864
 
865
  if __name__ == "__main__":
866
  import uvicorn
867
- uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))
 
1
+ # app.py - UPDATED (adaptive direct-route for quality + planning-route unchanged)
 
 
2
  import re
3
  import json
4
  import asyncio
 
184
  return plan_label, cleaned_body
185
  return None, text
186
 
187
+ def _is_low_quality(reply: str) -> bool:
188
+ if not reply or not reply.strip():
189
+ return True
190
+ low_phrases = ["i can help with that", "i'm here to help", "let me know", "i don't have"]
191
+ reply_l = reply.lower()
192
+ if any(phrase in reply_l for phrase in low_phrases):
193
+ # if reply only contains such phrase or is very short -> low quality
194
+ if word_count(reply) < 8:
195
+ return True
196
+ return False
197
 
198
  # -------------------------
199
+ # Streaming generator with adaptive direct-route quality checks:
200
  # -------------------------
201
  async def generate_response_stream(messages: List[Dict[str,str]], max_tokens=600, temperature=0.85):
202
  try:
 
228
  logger.exception("Flow analysis failed: %s", e)
229
  flow_context = {}
230
 
231
+ # compute vibe + plan requirements BEFORE routing so direct route knows min_words
232
+ try:
233
+ vibe_block = get_smart_context(last_user_msg)
234
+ except Exception:
235
+ vibe_block = ""
236
+ plan_req = plan_response_requirements(messages, last_user_msg, flow_context, vibe_block)
237
+ min_words = plan_req.get("min_words", 30)
238
+
239
  # Log route decision
240
  route = flow_context.get("route", "planning")
241
+ complexity_score = float(flow_context.get("complexity_score", 0.0) or 0.0)
242
+ logger.info("Flow route: %s (score=%s) min_words=%s", route, complexity_score, min_words)
243
 
244
+ # ---------- DIRECT / FAST PATH (adaptive) ----------
245
  if route == "direct":
246
+ yield f"data: {json.dumps({'status': 'Routing: direct (fast-path) - generating (adaptive)...'})}\n\n"
247
+ # Compose a compact system prompt but include vibe/time/strategy hints
 
248
  base_system_instruction = (
249
  "You are Nexari G1, an expressive and helpful AI created by Piyush.\n"
250
+ "For short/simple user requests, prefer concise, accurate responses. Avoid chain-of-thought. "
251
+ "If user seems to expect a longer answer, expand within the allowed min_words guidance."
252
  )
 
 
253
  time_data = get_time_context()
254
+ # prefer lower randomness but allow expansion on retry if needed
255
+ strategy_data = get_thinking_strategy(is_complex=False, detail=False, min_words_hint=min_words)
256
 
257
+ final_system_prompt = f"{base_system_instruction}\n{vibe_block}\n{time_data}\n{strategy_data}"
 
258
  if messages and messages[0].get("role") == "system":
259
  messages[0]["content"] = final_system_prompt
260
  else:
261
  messages.insert(0, {"role":"system","content": final_system_prompt})
262
 
263
+ # Decide attempts adaptively: if complexity_score extremely low -> 1 attempt; otherwise allow 2
264
+ max_attempts = 1 if complexity_score <= 0.12 else 2
265
+
266
+ # Slightly reduce temperature for direct replies to increase stability
267
+ orig_temperature = temperature
268
+ temperature = min(temperature, 0.72)
269
+
270
+ # Web search still allowed if intent asks
271
  tool_data_struct = None
272
  if intent == "internet_search":
273
  yield f"data: {json.dumps({'status': 'Searching the web...'})}\n\n"
 
294
  web_block += "No results found."
295
  messages.insert(1, {"role":"assistant","content": web_block})
296
 
 
297
  if tokenizer is None or model is None:
298
  err = "Model not loaded. Check server logs."
299
  payload = json.dumps({"choices":[{"delta":{"content": err}}]})
 
301
  yield "data: [DONE]\n\n"
302
  return
303
 
304
+ # prepare prompt
305
  try:
306
  if hasattr(tokenizer, "apply_chat_template"):
307
  text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
313
  attempts = 0
314
  generated_text = ""
315
  last_meta = {}
 
316
  while attempts < max_attempts:
317
  attempts += 1
318
+ yield f"data: {json.dumps({'status': f'Generating LLM (direct) attempt {attempts}...'})}\n\n"
319
  await asyncio.sleep(0.04)
320
+
321
  model_inputs = tokenizer(text_prompt, return_tensors="pt", truncation=True, max_length=4096).to(next(model.parameters()).device)
322
 
323
  def sync_generate():
 
352
  plan_label, cleaned_body = extract_and_sanitize_plan(cleaned, max_plan_chars=240)
353
  wc = word_count(cleaned_body)
354
  last_meta = {"attempt": attempts, "word_count": wc, "raw_len": len(cleaned_body)}
355
+
356
+ # quality checks: length vs min_words and generic-lowness
357
+ low_quality = _is_low_quality(cleaned_body)
358
+ needs_expand = (wc < min_words) or low_quality
359
+
360
+ if not needs_expand or attempts >= max_attempts:
361
+ generated_text = cleaned_body
362
+ if plan_label:
363
+ generated_text = plan_label + "\n\n" + generated_text
364
+ break
365
  else:
366
+ # Prepare a concise expansion note and increase temperature a bit to allow more content
367
+ expand_note = f"\n\nEXPAND: The user's request expects around {min_words} words. Provide a fuller helpful answer without chain-of-thought. Keep it structured and concise."
368
+ if messages and messages[0].get("role") == "system":
369
+ messages[0]["content"] = messages[0]["content"] + "\n" + expand_note
370
+ else:
371
+ messages.insert(0, {"role":"system","content": expand_note})
372
+ # increase temperature to encourage more content on retry, but cap
373
+ temperature = min(orig_temperature + 0.08, 0.95)
374
+ # rebuild prompt for retry
375
+ try:
376
+ if hasattr(tokenizer, "apply_chat_template"):
377
+ text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
378
+ else:
379
+ text_prompt = _build_prompt_from_messages(messages)
380
+ except Exception:
381
+ text_prompt = _build_prompt_from_messages(messages)
382
+ await asyncio.sleep(0.02)
383
+ continue
384
 
385
+ payload = json.dumps({
386
+ "choices":[{"delta":{"content": generated_text}}],
387
+ "generation_attempts": attempts,
388
+ "last_attempt_meta": last_meta,
389
+ "route": "direct",
390
+ "complexity_score": complexity_score
391
+ })
392
+ yield f"data: {payload}\n\n"
393
  yield "data: [DONE]\n\n"
394
  return
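Illustratively, a client consuming this stream on the direct route would receive status frames, then a single content frame, then the terminator, roughly like the following (values are made up; the keys mirror the payload built above):

    data: {"status": "Routing: direct (fast-path) - generating (adaptive)..."}
    data: {"status": "Generating LLM (direct) attempt 1..."}
    data: {"choices": [{"delta": {"content": "..."}}], "generation_attempts": 1, "last_attempt_meta": {"attempt": 1, "word_count": 42, "raw_len": 230}, "route": "direct", "complexity_score": 0.08}
    data: [DONE]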
395
 
396
+ # ---------- PLANNING ROUTE (complex) ----------
397
+ yield f"data: {json.dumps({'status': 'Reasoning (planner)...'})}\n\n"
398
+ await asyncio.sleep(0.15)
399
 
400
+ # planning work (vibe_block and plan_req already computed)
401
  min_words = plan_req["min_words"]
402
  strictness = plan_req["strictness"]
403
 
404
  if strictness:
405
+ temperature = min(temperature + 0.05, 0.95)
406
+ max_tokens = max(max_tokens, min_words // 2 + 120)
407
 
408
  strategy_data = get_thinking_strategy(is_complex=(intent=="coding_request" or min_words>50), detail=(min_words>50), min_words_hint=min_words)
409
  time_data = get_time_context()
 
418
  )
419
 
420
  flow_desc = ""
421
+ if flow_context:
422
+ label = flow_context.get("flow_label","unknown")
423
+ conf = round(float(flow_context.get("confidence", 0.0)), 2)
424
+ expl = flow_context.get("explanation", "")
425
  flow_desc = f"\n[FLOW] Detected: {label} (confidence {conf}). {expl}\n"
426
 
427
  final_system_prompt = f"{base_system_instruction}\n{flow_desc}\n{vibe_block}\n{time_data}\n{strategy_data}"
428
 
429
+ if messages and messages[0].get("role") == "system":
430
+ messages[0]["content"] = final_system_prompt
431
  else:
432
+ messages.insert(0, {"role":"system","content": final_system_prompt})
433
 
434
  # web search if needed
435
  tool_data_struct = None
436
  if intent == "internet_search":
437
+ yield f"data: {json.dumps({'status': 'Searching the web...'})}\n\n"
438
+ await asyncio.sleep(0)
439
  try:
440
+ tool_data_struct = perform_web_search(last_user_msg)
441
  except Exception as e:
442
  logger.exception("Web search failed: %s", e)
443
+ tool_data_struct = {"query": last_user_msg, "results": []}
444
 
445
  if tool_data_struct:
446
  web_block = "### WEB_DATA (from live search) ###\n"
 
456
  web_block += "\n---\nINSTRUCTION: Use the WEB_DATA above to answer; cite relevant source numbers inline."
457
  else:
458
  web_block += "No results found."
459
+ messages.insert(1, {"role":"assistant","content": web_block})
460
 
461
  if tokenizer is None or model is None:
462
+ err = "Model not loaded. Check server logs."
463
+ payload = json.dumps({"choices":[{"delta":{"content": err}}]})
464
+ yield f"data: {payload}\n\n"
465
+ yield "data: [DONE]\n\n"
466
+ return
467
 
468
  try:
469
  if hasattr(tokenizer, "apply_chat_template"):
470
+ text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
471
  else:
472
+ text_prompt = _build_prompt_from_messages(messages)
473
  except Exception:
474
+ text_prompt = _build_prompt_from_messages(messages)
475
+
476
+ # ---------- GENERATION STAGE ----------
477
+ max_attempts = 2
478
+ attempts = 0
479
+ last_meta = {}
480
+ generated_text = ""
481
+ while attempts < max_attempts:
482
+ attempts += 1
483
+ yield f"data: {json.dumps({'status': f'Generating LLM ({attempts})...'})}\n\n"
484
+ await asyncio.sleep(0.06)
485
+
486
+ model_inputs = tokenizer(text_prompt, return_tensors="pt", truncation=True, max_length=4096).to(next(model.parameters()).device)
487
+
488
+ def sync_generate():
489
  return model.generate(
490
+ **model_inputs,
491
+ max_new_tokens=max_tokens,
492
+ temperature=temperature,
493
  do_sample=True,
494
  top_k=50,
495
  top_p=0.92,
496
  repetition_penalty=1.08
497
  )
498
  try:
499
+ generated_ids = await asyncio.to_thread(sync_generate)
500
  except RuntimeError as e:
501
  logger.exception("Generation failed (possible OOM): %s", e)
502
+ err_payload = json.dumps({"choices":[{"delta":{"content": "Model generation failed due to resource limits."}}]})
503
+ yield f"data: {err_payload}\n\n"
504
+ yield "data: [DONE]\n\n"
505
+ return
506
 
507
+ input_len = model_inputs["input_ids"].shape[1]
508
+ new_tokens = generated_ids[0][input_len:]
509
+ raw_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
510
+ cleaned = safe_replace_providers(raw_response)
511
 
512
  forbidden = ["I am a human","I have a physical body","I am alive"]
513
  for fc in forbidden:
514
+ if fc.lower() in cleaned.lower():
515
+ cleaned = re.sub(re.escape(fc), "I am an AI — expressive and interactive.", cleaned, flags=re.IGNORECASE)
516
 
517
+ plan_label, cleaned_body = extract_and_sanitize_plan(cleaned, max_plan_chars=240)
518
+ wc = word_count(cleaned_body)
519
+ last_meta = {"attempt": attempts, "word_count": wc, "raw_len": len(cleaned_body)}
520
 
521
+ if wc >= min_words or attempts >= max_attempts or plan_req["strictness"] == 0:
522
+ generated_text = cleaned_body
523
+ if plan_label:
524
+ generated_text = plan_label + "\n\n" + generated_text
525
  break
526
  else:
527
+ expand_note = f"\n\nEXPAND: The user's request needs ~{min_words} words. Expand previous answer (concise style) and avoid chain-of-thought."
528
+ if messages and messages[0].get("role") == "system":
529
+ messages[0]["content"] = messages[0]["content"] + "\n" + expand_note
530
  else:
531
+ messages.insert(0, {"role":"system","content": expand_note})
532
+ temperature = min(temperature + 0.07, 0.98)
533
  try:
534
  if hasattr(tokenizer, "apply_chat_template"):
535
+ text_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
536
  else:
537
+ text_prompt = _build_prompt_from_messages(messages)
538
  except Exception:
539
+ text_prompt = _build_prompt_from_messages(messages)
540
  await asyncio.sleep(0.02)
541
  continue
542
 
543
+ if not generated_text:
544
+ plan_label, cleaned_body = extract_and_sanitize_plan(cleaned, max_plan_chars=240)
545
+ generated_text = (plan_label + "\n\n" if plan_label else "") + (cleaned_body or cleaned)
546
 
547
+ generated_text = re.sub(r"\bPlan\s*:\s*$", "", generated_text, flags=re.IGNORECASE).strip()
548
+ generated_text = generated_text.replace("I can help with that.", "I can help with that — let me explain. 🙂")
549
 
550
+ payload = json.dumps({
551
+ "choices":[{"delta":{"content": generated_text}}],
552
+ "generation_attempts": attempts,
553
+ "last_attempt_meta": last_meta,
554
+ "route": route,
555
+ "complexity_score": complexity_score
556
+ })
557
+ yield f"data: {payload}\n\n"
558
+ yield "data: [DONE]\n\n"
559
+ return
560
 
561
+ except asyncio.CancelledError:
562
+ logger.warning("Streaming cancelled.")
563
+ return
564
  except Exception as e:
565
+ logger.exception(f"Generator error: {e}")
566
+ err_payload = json.dumps({"choices":[{"delta":{"content": f"Internal error: {e}"}}]})
567
+ try:
568
+ yield f"data: {err_payload}\n\n"
569
+ yield "data: [DONE]\n\n"
570
+ except Exception:
571
+ return
572
 
573
  # -------------------------
574
  # Endpoints
 
615
 
616
  if __name__ == "__main__":
617
  import uvicorn
618
+ uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))
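A minimal client sketch for consuming the SSE stream end to end (the actual endpoint path is defined in the Endpoints section not shown in this diff; the URL below is only a placeholder assumption):

    import json
    import requests  # assumes the requests package is available

    # Placeholder URL: substitute the route actually registered in app.py.
    url = "http://localhost:7860/v1/chat/completions"
    body = {"messages": [{"role": "user", "content": "Explain binary search briefly."}]}

    with requests.post(url, json=body, stream=True) as resp:
        for raw in resp.iter_lines(decode_unicode=True):
            if not raw or not raw.startswith("data: "):
                continue
            data = raw[len("data: "):]
            if data == "[DONE]":
                break
            chunk = json.loads(data)
            delta = (chunk.get("choices") or [{}])[0].get("delta", {})
            if delta.get("content"):
                print(delta["content"], end="", flush=True)
            elif "status" in chunk:
                print(f"[status] {chunk['status']}")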