Commit 57dc0e4 · 1 Parent(s): 086f11b
Bi Yoo committed

update model
app.py
CHANGED
@@ -395,8 +395,9 @@ def initialize_llm():
     if LLM_PROVIDER == "huggingface":
         # Will use requests for HF Inference API
         if not HUGGINGFACE_API_KEY:
-            …
-            …
+            print("WARNING: HUGGINGFACE_API_KEY not set - HuggingFace provider will fail at runtime")
+        else:
+            print(f"Initialized HuggingFace Inference API with model: {HUGGINGFACE_MODEL}")
     elif LLM_PROVIDER == "local":
         ensure_llama_cpp_installed()
         try:
@@ -432,8 +433,11 @@ def initialize_llm():
                 model_path=local_model_path,
                 n_ctx=LOCAL_MODEL_CONTEXT_LENGTH,
                 n_threads=LOCAL_MODEL_THREADS,
+                n_threads_batch=LOCAL_MODEL_THREADS,  # Use all threads for batch processing
                 n_batch=LOCAL_MODEL_BATCH_SIZE,
                 n_gpu_layers=0,
+                use_mmap=True,  # Memory-mapped file loading (faster, less RAM)
+                use_mlock=False,  # Don't lock memory (not needed for HF Spaces)
                 verbose=True,  # Enable to see prompt formatting
             )
         except Exception as load_err:
@@ -525,7 +529,7 @@ def generate_response_local(system_prompt: str, user_prompt: str) -> str:
             temperature=0.3,
             top_p=0.7,
             repeat_penalty=1.3,
-            stop=["…
+            stop=["<|im_end|>", "<|endoftext|>", "<think>"],  # Qwen3 stop tokens + thinking
         )
     except Exception as err:
         raise HTTPException(status_code=500, detail=f"Local model error: {err}") from err
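For context, a minimal, self-contained sketch of how the new Llama() flags and the stop list combine at runtime. It assumes llama-cpp-python, which ensure_llama_cpp_installed() and the keyword names imply; the literal path and numbers are stand-ins for local_model_path and the LOCAL_MODEL_* settings from config.py.

# Sketch only: hard-coded values mirror the defaults added in this commit.
from llama_cpp import Llama

llm = Llama(
    model_path="Qwen_Qwen3-4B-Instruct-2507-Q4_K_M.gguf",  # stand-in for local_model_path
    n_ctx=2048,        # LOCAL_MODEL_CONTEXT_LENGTH
    n_threads=2,       # LOCAL_MODEL_THREADS (free HF Spaces: 2 vCPUs)
    n_threads_batch=2, # same thread count for prompt (batch) processing
    n_batch=512,       # LOCAL_MODEL_BATCH_SIZE
    n_gpu_layers=0,    # CPU-only Space
    use_mmap=True,     # map the GGUF from disk instead of copying it into RAM
    use_mlock=False,   # don't pin pages in memory
    verbose=True,
)

result = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Say hello in one sentence."},
    ],
    max_tokens=200,    # LOCAL_MODEL_MAX_OUTPUT_TOKENS
    temperature=0.3,
    top_p=0.7,
    repeat_penalty=1.3,
    stop=["<|im_end|>", "<|endoftext|>", "<think>"],  # end-of-turn markers + thinking guard
)
print(result["choices"][0]["message"]["content"])

The stop list leans on Qwen's ChatML-style template, where each turn ends with <|im_end|>; stopping on <think> simply truncates output the moment a thinking block opens, a blunt but effective guard for an instruct-tuned model.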
config.py
CHANGED
@@ -15,11 +15,11 @@ HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY", "")
 HUGGINGFACE_MODEL = "google/gemma-2-2b-it"
 
 # Local model configuration (for quantized models hosted within the Space)
-LOCAL_MODEL_REPO = os.getenv("LOCAL_MODEL_REPO", "bartowski/Qwen_Qwen3-…
-LOCAL_MODEL_FILENAME = os.getenv("LOCAL_MODEL_FILENAME", "Qwen_Qwen3-…
+LOCAL_MODEL_REPO = os.getenv("LOCAL_MODEL_REPO", "bartowski/Qwen_Qwen3-4B-Instruct-2507-GGUF")
+LOCAL_MODEL_FILENAME = os.getenv("LOCAL_MODEL_FILENAME", "Qwen_Qwen3-4B-Instruct-2507-Q4_K_M.gguf")  # Q4_K_M (2.50GB, recommended)
 LOCAL_MODEL_CONTEXT_LENGTH = int(os.getenv("LOCAL_MODEL_CONTEXT_LENGTH", "2048"))
-LOCAL_MODEL_THREADS = int(os.getenv("LOCAL_MODEL_THREADS", str(os.cpu_count() or …
-LOCAL_MODEL_BATCH_SIZE = int(os.getenv("LOCAL_MODEL_BATCH_SIZE", "…
+LOCAL_MODEL_THREADS = int(os.getenv("LOCAL_MODEL_THREADS", str(os.cpu_count() or 2)))  # HF Spaces has 2 vCPUs
+LOCAL_MODEL_BATCH_SIZE = int(os.getenv("LOCAL_MODEL_BATCH_SIZE", "512"))  # Increased for better throughput
 LOCAL_MODEL_MAX_OUTPUT_TOKENS = int(os.getenv("LOCAL_MODEL_MAX_OUTPUT_TOKENS", "200"))
 LOCAL_MODEL_HF_TOKEN = os.getenv("LOCAL_MODEL_HF_TOKEN", HUGGINGFACE_API_KEY or "")
 
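For reference, the repo and filename pair resolves to a downloadable GGUF. A minimal sketch of the fetch step with huggingface_hub's hf_hub_download follows; the actual download code lives elsewhere in app.py and may differ.

# Sketch only: resolve the GGUF named in config.py to a local file path.
import os

from huggingface_hub import hf_hub_download

local_model_path = hf_hub_download(
    repo_id=os.getenv("LOCAL_MODEL_REPO", "bartowski/Qwen_Qwen3-4B-Instruct-2507-GGUF"),
    filename=os.getenv("LOCAL_MODEL_FILENAME", "Qwen_Qwen3-4B-Instruct-2507-Q4_K_M.gguf"),
    token=os.getenv("LOCAL_MODEL_HF_TOKEN") or None,  # None allows anonymous access
)
print(local_model_path)  # cached under ~/.cache/huggingface/hub by default

At roughly 2.5 GB, the Q4_K_M quant fits easily in a free Space's 16 GB of RAM, and with use_mmap=True in app.py most of the file is paged in on demand rather than copied up front.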