Bi Yoo committed on
Commit
57dc0e4
·
1 Parent(s): 086f11b

update model

Browse files
Files changed (2) hide show
  1. app.py +7 -3
  2. config.py +4 -4
app.py CHANGED
@@ -395,8 +395,9 @@ def initialize_llm():
395
  if LLM_PROVIDER == "huggingface":
396
  # Will use requests for HF Inference API
397
  if not HUGGINGFACE_API_KEY:
398
- raise ValueError("HUGGINGFACE_API_KEY not set in environment variables")
399
- print(f"Initialized HuggingFace Inference API with model: {HUGGINGFACE_MODEL}")
 
400
  elif LLM_PROVIDER == "local":
401
  ensure_llama_cpp_installed()
402
  try:
@@ -432,8 +433,11 @@ def initialize_llm():
432
  model_path=local_model_path,
433
  n_ctx=LOCAL_MODEL_CONTEXT_LENGTH,
434
  n_threads=LOCAL_MODEL_THREADS,
 
435
  n_batch=LOCAL_MODEL_BATCH_SIZE,
436
  n_gpu_layers=0,
 
 
437
  verbose=True, # Enable to see prompt formatting
438
  )
439
  except Exception as load_err:
@@ -525,7 +529,7 @@ def generate_response_local(system_prompt: str, user_prompt: str) -> str:
525
  temperature=0.3,
526
  top_p=0.7,
527
  repeat_penalty=1.3,
528
- stop=["<end_of_turn>", "</s>", "Question:", "Context:"],
529
  )
530
  except Exception as err:
531
  raise HTTPException(status_code=500, detail=f"Local model error: {err}") from err
 
395
  if LLM_PROVIDER == "huggingface":
396
  # Will use requests for HF Inference API
397
  if not HUGGINGFACE_API_KEY:
398
+ print("WARNING: HUGGINGFACE_API_KEY not set - HuggingFace provider will fail at runtime")
399
+ else:
400
+ print(f"Initialized HuggingFace Inference API with model: {HUGGINGFACE_MODEL}")
401
  elif LLM_PROVIDER == "local":
402
  ensure_llama_cpp_installed()
403
  try:
 
433
  model_path=local_model_path,
434
  n_ctx=LOCAL_MODEL_CONTEXT_LENGTH,
435
  n_threads=LOCAL_MODEL_THREADS,
436
+ n_threads_batch=LOCAL_MODEL_THREADS, # Use all threads for batch processing
437
  n_batch=LOCAL_MODEL_BATCH_SIZE,
438
  n_gpu_layers=0,
439
+ use_mmap=True, # Memory-mapped file loading (faster, less RAM)
440
+ use_mlock=False, # Don't lock memory (not needed for HF Spaces)
441
  verbose=True, # Enable to see prompt formatting
442
  )
443
  except Exception as load_err:
 
529
  temperature=0.3,
530
  top_p=0.7,
531
  repeat_penalty=1.3,
532
+ stop=["<|im_end|>", "<|endoftext|>", "<think>"], # Qwen3 stop tokens + thinking
533
  )
534
  except Exception as err:
535
  raise HTTPException(status_code=500, detail=f"Local model error: {err}") from err
config.py CHANGED
@@ -15,11 +15,11 @@ HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY", "")
15
  HUGGINGFACE_MODEL = "google/gemma-2-2b-it"
16
 
17
  # Local model configuration (for quantized models hosted within the Space)
18
- LOCAL_MODEL_REPO = os.getenv("LOCAL_MODEL_REPO", "bartowski/Qwen_Qwen3-1.7B-GGUF")
19
- LOCAL_MODEL_FILENAME = os.getenv("LOCAL_MODEL_FILENAME", "Qwen_Qwen3-1.7B-Q4_K_M.gguf")
20
  LOCAL_MODEL_CONTEXT_LENGTH = int(os.getenv("LOCAL_MODEL_CONTEXT_LENGTH", "2048"))
21
- LOCAL_MODEL_THREADS = int(os.getenv("LOCAL_MODEL_THREADS", str(os.cpu_count() or 4)))
22
- LOCAL_MODEL_BATCH_SIZE = int(os.getenv("LOCAL_MODEL_BATCH_SIZE", "256"))
23
  LOCAL_MODEL_MAX_OUTPUT_TOKENS = int(os.getenv("LOCAL_MODEL_MAX_OUTPUT_TOKENS", "200"))
24
  LOCAL_MODEL_HF_TOKEN = os.getenv("LOCAL_MODEL_HF_TOKEN", HUGGINGFACE_API_KEY or "")
25
 
 
15
  HUGGINGFACE_MODEL = "google/gemma-2-2b-it"
16
 
17
  # Local model configuration (for quantized models hosted within the Space)
18
+ LOCAL_MODEL_REPO = os.getenv("LOCAL_MODEL_REPO", "bartowski/Qwen_Qwen3-4B-Instruct-2507-GGUF")
19
+ LOCAL_MODEL_FILENAME = os.getenv("LOCAL_MODEL_FILENAME", "Qwen_Qwen3-4B-Instruct-2507-Q4_K_M.gguf") # Q4_K_M (2.50GB, recommended)
20
  LOCAL_MODEL_CONTEXT_LENGTH = int(os.getenv("LOCAL_MODEL_CONTEXT_LENGTH", "2048"))
21
+ LOCAL_MODEL_THREADS = int(os.getenv("LOCAL_MODEL_THREADS", str(os.cpu_count() or 2))) # HF Spaces has 2 vCPUs
22
+ LOCAL_MODEL_BATCH_SIZE = int(os.getenv("LOCAL_MODEL_BATCH_SIZE", "512")) # Increased for better throughput
23
  LOCAL_MODEL_MAX_OUTPUT_TOKENS = int(os.getenv("LOCAL_MODEL_MAX_OUTPUT_TOKENS", "200"))
24
  LOCAL_MODEL_HF_TOKEN = os.getenv("LOCAL_MODEL_HF_TOKEN", HUGGINGFACE_API_KEY or "")
25