pasxalisag committed
Commit ebf3517 · verified · 1 Parent(s): cc9304c

Upload 6 files

Files changed (4)
  1. .gitattributes +32 -35
  2. Dockerfile +47 -40
  3. app.py +623 -630
  4. requirements.txt +2 -1
.gitattributes CHANGED
@@ -1,35 +1,32 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pkl filter=lfs diff=lfs merge=lfs -text
21
+ *.pt filter=lfs diff=lfs merge=lfs -text
22
+ *.pth filter=lfs diff=lfs merge=lfs -text
23
+ *.rar filter=lfs diff=lfs merge=lfs -text
24
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
25
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
26
+ *.tar filter=lfs diff=lfs merge=lfs -text
27
+ *.tflite filter=lfs diff=lfs merge=lfs -text
28
+ *.tgz filter=lfs diff=lfs merge=lfs -text
29
+ *.wasm filter=lfs diff=lfs merge=lfs -text
30
+ *.xz filter=lfs diff=lfs merge=lfs -text
31
+ *.zip filter=lfs diff=lfs merge=lfs -text
32
+ *.zst filter=lfs diff=lfs merge=lfs -text
 
 
 
Dockerfile CHANGED
@@ -1,40 +1,47 @@
1
- # Use Python 3.10 slim
2
- FROM python:3.10-slim
3
-
4
- WORKDIR /app
5
-
6
- # Install system dependencies
7
- RUN apt-get update && apt-get install -y \
8
- build-essential \
9
- git \
10
- curl \
11
- wget \
12
- cmake \
13
- libopenblas-dev \
14
- libomp-dev \
15
- && rm -rf /var/lib/apt/lists/*
16
-
17
- # Copy requirements and install Python dependencies
18
- COPY requirements.txt .
19
- RUN pip install --no-cache-dir -r requirements.txt
20
-
21
- # Download spaCy model
22
- RUN python -m spacy download en_core_web_sm
23
-
24
- # Copy all application files
25
- COPY . .
26
-
27
- # Create persistent directories for Hugging Face
28
- RUN mkdir -p /data/artifacts && mkdir -p /cache/models
29
-
30
- # Set environment variables
31
- ENV ARTIFACT_DIR=/data/artifacts \
32
- HF_HOME=/cache/models \
33
- TRANSFORMERS_CACHE=/cache/models \
34
- PYTHONUNBUFFERED=1
35
-
36
- # Expose Gradio port
37
- EXPOSE 7860
38
-
39
- # Run the application
40
- CMD ["python", "app.py"]
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # Install system dependencies for FAISS, PyTorch, and spaCy
6
+ RUN apt-get update && apt-get install -y \
7
+ build-essential \
8
+ git \
9
+ curl \
10
+ wget \
11
+ cmake \
12
+ libopenblas-dev \
13
+ libomp-dev \
14
+ && rm -rf /var/lib/apt/lists/*
15
+
16
+ # Copy requirements first for better caching
17
+ COPY requirements.txt .
18
+
19
+ # Install Python dependencies
20
+ RUN pip install --no-cache-dir -r requirements.txt \
21
+ && python -m spacy download en_core_web_sm
22
+
23
+ # Copy application code
24
+ COPY . .
25
+
26
+ # Create persistent storage directories for Hugging Face Spaces
27
+ RUN mkdir -p /data/artifacts \
28
+ && mkdir -p /cache/models \
29
+ && ln -sf /data/artifacts artifacts # Symlink to persistent storage
30
+
31
+ # Set environment variables for Hugging Face Spaces
32
+ ENV ARTIFACT_DIR=/data/artifacts \
33
+ HF_HOME=/cache/models \
34
+ TRANSFORMERS_CACHE=/cache/models \
35
+ GRADIO_SERVER_NAME=0.0.0.0 \
36
+ GRADIO_SERVER_PORT=7860 \
37
+ PYTHONUNBUFFERED=1
38
+
39
+ # Expose port for Gradio
40
+ EXPOSE 7860
41
+
42
+ # Health check
43
+ HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
44
+ CMD python -c "import requests; requests.get('http://localhost:7860', timeout=10)"
45
+
46
+ # Run the application with pre-build step
47
+ CMD ["sh", "-c", "python build_hf.py && python app.py"]
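
Note on the HEALTHCHECK above: the inline probe imports the third-party `requests` package, which does not appear in the requirements.txt hunks shown in this commit, so it only works if some other dependency pulls `requests` into the image. A standard-library probe avoids that assumption; the snippet below is a hypothetical alternative body for such a check (exit code 0 when the Gradio server answers, non-zero otherwise), not part of the committed Dockerfile:

import sys
import urllib.request

try:
    # Same endpoint and timeout as the HEALTHCHECK in the Dockerfile above
    urllib.request.urlopen("http://localhost:7860", timeout=10)
except Exception:
    sys.exit(1)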
app.py CHANGED
@@ -1,630 +1,623 @@
1
- """
2
- Codey Bryant 3.0 — SOTA RAG for Hugging Face Spaces
3
- Maintains EXACT same architecture: HyDE + Query Rewriting + Multi-Query + Answer-Space Retrieval
4
- """
5
-
6
- import os
7
- import sys
8
- import logging
9
- from dataclasses import dataclass
10
- from typing import List, Dict, Tuple, Optional, Iterator
11
- from functools import lru_cache
12
- from threading import Thread
13
- import warnings
14
-
15
- # Configure logging for Hugging Face Spaces
16
- logging.basicConfig(
17
- level=logging.INFO,
18
- format='%(asctime)s - %(levelname)s - %(message)s',
19
- handlers=[
20
- logging.StreamHandler(sys.stdout),
21
- logging.FileHandler('/data/app.log')
22
- ]
23
- )
24
- logger = logging.getLogger(__name__)
25
- warnings.filterwarnings("ignore")
26
-
27
- # Import core dependencies
28
- import numpy as np
29
- import torch
30
- from datasets import load_dataset, Dataset
31
- from sentence_transformers import SentenceTransformer
32
- from rank_bm25 import BM25Okapi
33
- from sklearn.cluster import MiniBatchKMeans
34
- import spacy
35
- from transformers import (
36
- AutoTokenizer,
37
- AutoModelForCausalLM,
38
- GenerationConfig,
39
- TextIteratorStreamer,
40
- BitsAndBytesConfig,
41
- )
42
- import gradio as gr
43
- import pickle
44
- import json
45
-
46
- # Try to import FAISS
47
- try:
48
- import faiss
49
- FAISS_AVAILABLE = True
50
- except ImportError:
51
- FAISS_AVAILABLE = False
52
- logger.warning("FAISS not available, using numpy fallback")
53
-
54
- # Environment setup for Hugging Face Spaces
55
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
56
- os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
57
-
58
- # Use persistent storage for Hugging Face Spaces
59
- ARTIFACT_DIR = os.environ.get("ARTIFACT_DIR", "/data/artifacts")
60
- os.makedirs(ARTIFACT_DIR, exist_ok=True)
61
-
62
- # Paths for artifacts
63
- LLM_ARTIFACT_PATH = os.path.join(ARTIFACT_DIR, "llm_model")
64
- EMBED_ARTIFACT_PATH = os.path.join(ARTIFACT_DIR, "embed_model")
65
- BM25_ARTIFACT_PATH = os.path.join(ARTIFACT_DIR, "bm25.pkl")
66
- CORPUS_DATA_PATH = os.path.join(ARTIFACT_DIR, "corpus_data.json")
67
- CORPUS_EMBED_PATH = os.path.join(ARTIFACT_DIR, "corpus_embeddings.npy")
68
- ANSWER_EMBED_PATH = os.path.join(ARTIFACT_DIR, "answer_embeddings.npy")
69
- FAISS_INDEX_PATH = os.path.join(ARTIFACT_DIR, "faiss_index.bin")
70
-
71
- # Device configuration
72
- DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
73
- if torch.cuda.is_available():
74
- torch.backends.cuda.matmul.allow_tf32 = True
75
- torch.backends.cudnn.benchmark = True
76
- logger.info(f"Using GPU: {torch.cuda.get_device_name(0)}")
77
- else:
78
- logger.info("Using CPU")
79
-
80
- # Model configuration (EXACT SAME AS BEFORE)
81
- MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
82
- EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
83
- MAX_CORPUS_SIZE = 600
84
-
85
- # ========================
86
- # 1) Dataset & Retrieval (EXACT SAME)
87
- # ========================
88
-
89
- def load_opc_datasets() -> Dict[str, Dataset]:
90
- """Load coding datasets - same function"""
91
- try:
92
- logger.info("Loading OPC datasets...")
93
- ds_instruct = load_dataset("OpenCoder-LLM/opc-sft-stage2", "educational_instruct", split="train")
94
- ds_evol = load_dataset("OpenCoder-LLM/opc-sft-stage2", "evol_instruct", split="train")
95
- return {"educational_instruct": ds_instruct, "evol_instruct": ds_evol}
96
- except Exception as e:
97
- logger.warning(f"OPC failed ({e}), falling back to python_code_instructions...")
98
- ds = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
99
- return {"python_code": ds}
100
-
101
- def convo_to_io(example: Dict) -> Tuple[str, str]:
102
- """Convert conversation to input/output - same function"""
103
- if "messages" in example:
104
- msgs = example["messages"]
105
- elif "conversations" in example:
106
- msgs = example["conversations"]
107
- else:
108
- instr = example.get("instruction") or example.get("prompt") or ""
109
- inp = example.get("input") or ""
110
- out = example.get("output") or example.get("response") or ""
111
- return (instr + "\n" + inp).strip(), out
112
-
113
- user_text, assistant_text = "", ""
114
- for i, m in enumerate(msgs):
115
- role = (m.get("role") or m.get("from") or "").lower()
116
- content = m.get("content") or m.get("value") or ""
117
- if role in ("user", "human") and not user_text:
118
- user_text = content
119
- if role in ("assistant", "gpt") and user_text:
120
- assistant_text = content
121
- break
122
- return user_text.strip(), assistant_text.strip()
123
-
124
- @dataclass
125
- class RetrievalSystem:
126
- """Retrieval system dataclass - same structure"""
127
- embed_model: SentenceTransformer
128
- bm25: BM25Okapi
129
- corpus_texts: List[str]
130
- corpus_answers: List[str]
131
- corpus_embeddings: np.ndarray
132
- answer_embeddings: np.ndarray
133
- corpus_meta: List[Dict]
134
- nlp: spacy.language.Language
135
- faiss_index: Optional[any] = None
136
-
137
- def build_retrieval_system(ds_map: Dict[str, Dataset]) -> RetrievalSystem:
138
- """Build retrieval system - EXACT SAME IMPLEMENTATION"""
139
- # Try to load from artifacts first
140
- required = [EMBED_ARTIFACT_PATH, BM25_ARTIFACT_PATH, CORPUS_DATA_PATH, CORPUS_EMBED_PATH, ANSWER_EMBED_PATH]
141
- if FAISS_AVAILABLE:
142
- required.append(FAISS_INDEX_PATH)
143
-
144
- if all(os.path.exists(p) for p in required):
145
- logger.info("Loading retrieval system from artifacts...")
146
- embed_model = SentenceTransformer(EMBED_ARTIFACT_PATH, device=str(DEVICE))
147
- with open(BM25_ARTIFACT_PATH, "rb") as f:
148
- bm25 = pickle.load(f)
149
- with open(CORPUS_DATA_PATH, "r", encoding="utf-8") as f:
150
- data = json.load(f)
151
- corpus_embeddings = np.load(CORPUS_EMBED_PATH)
152
- answer_embeddings = np.load(ANSWER_EMBED_PATH)
153
- faiss_index = faiss.read_index(FAISS_INDEX_PATH) if FAISS_AVAILABLE and os.path.exists(FAISS_INDEX_PATH) else None
154
- nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
155
- return RetrievalSystem(
156
- embed_model=embed_model, bm25=bm25,
157
- corpus_texts=data["texts"], corpus_answers=data["answers"],
158
- corpus_embeddings=corpus_embeddings, answer_embeddings=answer_embeddings,
159
- corpus_meta=data["meta"], nlp=nlp, faiss_index=faiss_index
160
- )
161
-
162
- # Build from scratch (same implementation)
163
- logger.info("Building retrieval system with answer-space support...")
164
- all_questions, all_answers, all_metas = [], [], []
165
- for name, ds in ds_map.items():
166
- for ex in ds.select(range(min(len(ds), 1500))):
167
- q, a = convo_to_io(ex)
168
- if q and a and 50 < len(a) < 2000:
169
- all_questions.append(q)
170
- all_answers.append(a)
171
- all_metas.append({"intent": name, "answer": a})
172
-
173
- embed_model = SentenceTransformer(EMBED_MODEL, device=str(DEVICE))
174
- question_embeddings = embed_model.encode(all_questions, batch_size=64, show_progress_bar=True, normalize_embeddings=True)
175
- answer_embeddings = embed_model.encode(all_answers, batch_size=64, show_progress_bar=True, normalize_embeddings=True)
176
-
177
- # Clustering to reduce size (same)
178
- if len(all_questions) > MAX_CORPUS_SIZE:
179
- kmeans = MiniBatchKMeans(n_clusters=MAX_CORPUS_SIZE, random_state=42, batch_size=1000)
180
- labels = kmeans.fit_predict(answer_embeddings)
181
- selected = []
182
- for i in range(MAX_CORPUS_SIZE):
183
- mask = labels == i
184
- if mask.any():
185
- idx = np.where(mask)[0]
186
- dists = np.linalg.norm(answer_embeddings[idx] - kmeans.cluster_centers_[i], axis=1)
187
- selected.append(idx[np.argmin(dists)])
188
- idxs = selected
189
- else:
190
- idxs = list(range(len(all_questions)))
191
-
192
- texts = [all_questions[i] for i in idxs]
193
- answers = [all_answers[i] for i in idxs]
194
- metas = [all_metas[i] for i in idxs]
195
- q_embs = question_embeddings[idxs]
196
- a_embs = answer_embeddings[idxs]
197
-
198
- tokenized = [t.lower().split() for t in texts]
199
- bm25 = BM25Okapi(tokenized)
200
-
201
- faiss_index = None
202
- if FAISS_AVAILABLE:
203
- faiss_index = faiss.IndexFlatIP(a_embs.shape[1])
204
- faiss_index.add(a_embs.astype('float32'))
205
-
206
- # Save everything
207
- embed_model.save(EMBED_ARTIFACT_PATH)
208
- with open(BM25_ARTIFACT_PATH, "wb") as f:
209
- pickle.dump(bm25, f)
210
- with open(CORPUS_DATA_PATH, "w", encoding="utf-8") as f:
211
- json.dump({"texts": texts, "answers": answers, "meta": metas}, f)
212
- np.save(CORPUS_EMBED_PATH, q_embs)
213
- np.save(ANSWER_EMBED_PATH, a_embs)
214
- if faiss_index:
215
- faiss.write_index(faiss_index, FAISS_INDEX_PATH)
216
-
217
- nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
218
- return RetrievalSystem(
219
- embed_model=embed_model, bm25=bm25, corpus_texts=texts, corpus_answers=answers,
220
- corpus_embeddings=q_embs, answer_embeddings=a_embs, corpus_meta=metas,
221
- nlp=nlp, faiss_index=faiss_index
222
- )
223
-
224
- # ========================
225
- # 2) Generative Core (EXACT SAME)
226
- # ========================
227
-
228
- @dataclass
229
- class GenerativeCore:
230
- """Generative core dataclass - same structure"""
231
- model: AutoModelForCausalLM
232
- tokenizer: AutoTokenizer
233
- generation_config: GenerationConfig
234
-
235
- def build_generative_core():
236
- """Build generative core - EXACT SAME IMPLEMENTATION"""
237
- # Always download fresh from HuggingFace for reliability
238
- print("Downloading TinyLlama with 4-bit quantization...")
239
-
240
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
241
- if tokenizer.pad_token is None:
242
- tokenizer.pad_token = tokenizer.eos_token
243
-
244
- tokenizer.chat_template = (
245
- "{% for message in messages %}"
246
- "{{'<|'+message['role']+'|>\\n'+message['content']+'</s>\\n'}}"
247
- "{% endfor %}"
248
- "{% if add_generation_prompt %}"
249
- "<|assistant|>\n"
250
- "{% endif %}"
251
- )
252
-
253
- quantization_config = None
254
- if torch.cuda.is_available():
255
- quantization_config = BitsAndBytesConfig(
256
- load_in_4bit=True,
257
- bnb_4bit_compute_dtype=torch.float32,
258
- bnb_4bit_use_double_quant=True,
259
- bnb_4bit_quant_type="nf4"
260
- )
261
-
262
- model = AutoModelForCausalLM.from_pretrained(
263
- MODEL_NAME,
264
- quantization_config=quantization_config,
265
- device_map="auto" if torch.cuda.is_available() else None,
266
- low_cpu_mem_usage=True
267
- )
268
- model.eval()
269
-
270
- gen_cfg = GenerationConfig(
271
- max_new_tokens=300,
272
- temperature=0.7,
273
- top_p=0.9,
274
- do_sample=True,
275
- repetition_penalty=1.15,
276
- pad_token_id=tokenizer.pad_token_id
277
- )
278
-
279
- # Save for future use (optional)
280
- if not os.path.exists(LLM_ARTIFACT_PATH):
281
- os.makedirs(LLM_ARTIFACT_PATH, exist_ok=True)
282
- tokenizer.save_pretrained(LLM_ARTIFACT_PATH)
283
- gen_cfg.save_pretrained(LLM_ARTIFACT_PATH)
284
-
285
- return GenerativeCore(model, tokenizer, gen_cfg)
286
-
287
- # ========================
288
- # 3) SOTA Enhanced Retrieval (EXACT SAME)
289
- # ========================
290
-
291
- class HybridCodeAssistant:
292
- """Main assistant class - EXACT SAME IMPLEMENTATION"""
293
- def __init__(self):
294
- self.retrieval = build_retrieval_system(load_opc_datasets())
295
- self.generator = build_generative_core()
296
- logger.info("Codey Bryant 3.0 ready with HyDE + Query Rewriting + Multi-Query + Answer-Space Retrieval!")
297
-
298
- def generate_hyde(self, query: str) -> str:
299
- """Generate HyDE - same implementation"""
300
- prompt = f"""Write a concise, direct Python code example or explanation that answers this question.
301
- Only output the answer, no extra text.
302
-
303
- Question: {query}
304
-
305
- Answer:"""
306
- inputs = self.generator.tokenizer(prompt, return_tensors="pt").to(DEVICE)
307
- with torch.no_grad():
308
- out = self.generator.model.generate(**inputs, max_new_tokens=128, temperature=0.3, do_sample=True)
309
- return self.generator.tokenizer.decode(out[0], skip_special_tokens=True).split("Answer:")[-1].strip()
310
-
311
- def rewrite_query(self, query: str) -> str:
312
- """Rewrite query - same implementation"""
313
- prompt = f"""Rewrite this vague or casual programming question into a clear, specific one for better code retrieval.
314
-
315
- Original: {query}
316
-
317
- Improved:"""
318
- inputs = self.generator.tokenizer(prompt, return_tensors="pt").to(DEVICE)
319
- with torch.no_grad():
320
- out = self.generator.model.generate(**inputs, max_new_tokens=64, temperature=0.1)
321
- return self.generator.tokenizer.decode(out[0], skip_special_tokens=True).split("Improved:")[-1].strip()
322
-
323
- def retrieve_enhanced(self, query: str, k: int = 3) -> List[Tuple[str, Dict, float]]:
324
- """Enhanced retrieval - EXACT SAME IMPLEMENTATION"""
325
- # Use list of tuples instead of set to avoid hashability issues with dicts
326
- results = []
327
-
328
- def add_results(q_text: str, weight: float = 1.0):
329
- try:
330
- # Determine embedding space (answer for HyDE/long texts, question otherwise)
331
- use_answer_space = "HyDE" in q_text or len(q_text.split()) > 20
332
- target_embs = self.retrieval.answer_embeddings if use_answer_space else self.retrieval.corpus_embeddings
333
-
334
- # Encode query
335
- q_emb = self.retrieval.embed_model.encode(q_text, normalize_embeddings=True)
336
-
337
- if self.retrieval.faiss_index is not None and use_answer_space:
338
- # FAISS on answer space
339
- query_vec = q_emb.astype('float32').reshape(1, -1)
340
- scores_top, indices_top = self.retrieval.faiss_index.search(query_vec, min(k * 3, len(self.retrieval.corpus_texts)))
341
- scores = scores_top[0]
342
- idxs = indices_top[0]
343
- else:
344
- # Numpy fallback or question space
345
- scores = np.dot(target_embs, q_emb)
346
- idxs = np.argsort(-scores)[:k*3]
347
-
348
- # Add BM25 if not answer space
349
- if not use_answer_space:
350
- tokenized_query = q_text.lower().split()
351
- bm25_scores = self.retrieval.bm25.get_scores(tokenized_query)
352
- if bm25_scores.max() > 0:
353
- bm25_scores = (bm25_scores - bm25_scores.min()) / (bm25_scores.max() - bm25_scores.min())
354
- else:
355
- bm25_scores = np.zeros_like(bm25_scores)
356
- scores = 0.3 * bm25_scores + 0.7 * scores # Hybrid
357
-
358
- # Collect candidates (avoid duplicates by checking text)
359
- seen_texts = set()
360
- for score, idx in zip(scores, idxs):
361
- if score > 0.15 and idx < len(self.retrieval.corpus_texts):
362
- text = self.retrieval.corpus_texts[idx]
363
- if text not in seen_texts:
364
- seen_texts.add(text)
365
- results.append((text, self.retrieval.corpus_meta[idx], float(score * weight)))
366
- except Exception as e:
367
- logger.error(f"add_results failed for '{q_text}': {e}")
368
-
369
- # 1. Original query
370
- add_results(query, weight=1.0)
371
-
372
- # 2. Rewritten query
373
- try:
374
- rw = self.rewrite_query(query)
375
- if len(rw) > 8 and rw != query:
376
- add_results(rw, weight=1.2)
377
- except Exception as e:
378
- logger.warning(f"Rewrite failed: {e}")
379
-
380
- # 3. HyDE (strong weight in answer space!)
381
- try:
382
- hyde = self.generate_hyde(query)
383
- if len(hyde) > 20:
384
- add_results(hyde, weight=1.5) # Note: No " HyDE" suffix needed now
385
- except Exception as e:
386
- logger.warning(f"HyDE failed: {e}")
387
-
388
- # 4. Multi-query variants (lighter weight)
389
- variants = [
390
- f"Python code for: {query}",
391
- f"Fix error: {query}",
392
- f"Explain in Python: {query}",
393
- f"Best way to {query} in Python",
394
- ]
395
- for v in variants:
396
- add_results(v, weight=0.8)
397
-
398
- # Rerank by similarity to original (no set needed)
399
- if not results:
400
- return []
401
-
402
- q_emb = self.retrieval.embed_model.encode(query, normalize_embeddings=True)
403
- final = []
404
- for text, meta, score in results:
405
- text_emb = self.retrieval.embed_model.encode(text, normalize_embeddings=True)
406
- sim = float(np.dot(q_emb, text_emb))
407
- final.append((text, meta, score + 0.3 * sim))
408
-
409
- final.sort(key=lambda x: x[2], reverse=True)
410
- return final[:k]
411
-
412
- def answer_stream(self, text: str) -> Iterator[str]:
413
- """Stream answer - same implementation"""
414
- retrieved = self.retrieve_enhanced(text, k=3)
415
-
416
- context = ""
417
- if retrieved and retrieved[0][2] > 0.3:
418
- q, meta, _ = retrieved[0]
419
- ans = meta["answer"][:200]
420
- context = f"Reference example:\nQ: {q}\nA: {ans}\n\n"
421
-
422
- messages = [
423
- {"role": "system", "content": "You are a concise, accurate Python coding assistant. Use the reference if helpful." + context},
424
- {"role": "user", "content": text}
425
- ]
426
-
427
- prompt = self.generator.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
428
- inputs = self.generator.tokenizer(prompt, return_tensors="pt").to(DEVICE)
429
-
430
- streamer = TextIteratorStreamer(self.generator.tokenizer, skip_prompt=True, skip_special_tokens=True)
431
- thread = Thread(target=self.generator.model.generate, kwargs=dict(
432
- **inputs, streamer=streamer, generation_config=self.generator.generation_config
433
- ))
434
- thread.start()
435
-
436
- for token in streamer:
437
- yield token
438
- thread.join()
439
-
440
- # ========================
441
- # 4) Gradio UI (Optimized for Hugging Face)
442
- # ========================
443
-
444
- ASSISTANT: Optional[HybridCodeAssistant] = None
445
-
446
- def initialize_assistant():
447
- """Initialize assistant with progress tracking"""
448
- global ASSISTANT
449
- if ASSISTANT is None:
450
- yield "Initializing Codey Bryant 3.0..."
451
- yield "Loading retrieval system..."
452
- ASSISTANT = HybridCodeAssistant()
453
- yield "Codey Bryant 3.0 Ready!"
454
- yield "SOTA RAG Features: HyDE + Query Rewriting + Multi-Query + Answer-Space Retrieval"
455
- yield "Ask coding questions like: 'it's not working', 'help with error', 'make it faster'"
456
- else:
457
- yield "Assistant already initialized!"
458
-
459
- def chat(message: str, history: list):
460
- """Chat function with error handling"""
461
- if ASSISTANT is None:
462
- yield "Please click 'Initialize Assistant' first!"
463
- return
464
-
465
- # Append user message
466
- history.append([message, ""])
467
- yield history
468
-
469
- # Stream response
470
- try:
471
- response = ""
472
- for token in ASSISTANT.answer_stream(message):
473
- response += token
474
- history[-1][1] = response
475
- yield history
476
- except Exception as e:
477
- logger.error(f"Chat error: {e}")
478
- history[-1][1] = f"Error: {str(e)}"
479
- yield history
480
-
481
- def create_ui():
482
- """Create Gradio UI optimized for Hugging Face"""
483
- with gr.Blocks(
484
- title="Codey Bryant 3.0 - SOTA RAG Coding Assistant",
485
- theme=gr.themes.Soft(),
486
- css="""
487
- .gradio-container { max-width: 1200px; margin: auto; }
488
- .chatbot { min-height: 500px; }
489
- .status-box { padding: 20px; border-radius: 10px; background: #f0f8ff; }
490
- """
491
- ) as demo:
492
- gr.Markdown("""
493
- # 🤖 Codey Bryant 3.0
494
- ## **SOTA RAG Coding Assistant**
495
-
496
- ### **Advanced Features:**
497
- - **HyDE** (Hypothetical Document Embeddings)
498
- - **Query Rewriting** for vague queries
499
- - **Multi-Query** retrieval
500
- - **Answer-Space Retrieval**
501
-
502
- ### **Handles vague questions like:**
503
- - "it's not working"
504
- - "help with error"
505
- - "make it faster"
506
- - "why error"
507
- - "how to implement"
508
-
509
- ### **Powered by:**
510
- - TinyLlama 1.1B (4-bit quantized)
511
- - Hybrid retrieval (FAISS + BM25)
512
- - OPC coding datasets
513
- """)
514
-
515
- with gr.Row():
516
- with gr.Column(scale=1):
517
- init_btn = gr.Button(
518
- "Initialize Assistant",
519
- variant="primary",
520
- size="lg"
521
- )
522
- clear_btn = gr.Button("Clear Chat", size="lg")
523
-
524
- with gr.Column(scale=4):
525
- status = gr.Markdown(
526
- "### Status: Click 'Initialize Assistant' to start",
527
- elem_classes="status-box"
528
- )
529
-
530
- chatbot = gr.Chatbot(
531
- label="Chat with Codey",
532
- height=500,
533
- show_label=True,
534
- avatar_images=(None, "🤖"),
535
- bubble_full_width=False
536
- )
537
-
538
- with gr.Row():
539
- msg = gr.Textbox(
540
- placeholder="Ask anything about Python coding...",
541
- label="Your Question",
542
- lines=3,
543
- scale=5,
544
- container=False
545
- )
546
- submit_btn = gr.Button("Send", variant="secondary", scale=1)
547
-
548
- # Examples
549
- gr.Examples(
550
- examples=[
551
- ["How to read a CSV file in Python?"],
552
- ["Why am I getting 'list index out of range' error?"],
553
- ["Make this function faster..."],
554
- ["Help, my code isn't working!"],
555
- ["Best way to sort a dictionary by value?"]
556
- ],
557
- inputs=msg,
558
- label="Try these examples:"
559
- )
560
-
561
- # Event handlers
562
- init_btn.click(
563
- initialize_assistant,
564
- outputs=status
565
- )
566
-
567
- def submit_message(message, history):
568
- return "", history + [[message, None]]
569
-
570
- msg.submit(
571
- submit_message,
572
- [msg, chatbot],
573
- [msg, chatbot],
574
- queue=False
575
- ).then(
576
- chat,
577
- [msg, chatbot],
578
- chatbot
579
- )
580
-
581
- submit_btn.click(
582
- submit_message,
583
- [msg, chatbot],
584
- [msg, chatbot],
585
- queue=False
586
- ).then(
587
- chat,
588
- [msg, chatbot],
589
- chatbot
590
- )
591
-
592
- clear_btn.click(lambda: None, None, chatbot, queue=False)
593
-
594
- # Footer
595
- gr.Markdown("""
596
- ---
597
- *Codey Bryant 3.0 uses TinyLlama 1.1B with 4-bit quantization. Responses may take a few seconds.*
598
- """)
599
-
600
- return demo
601
-
602
- # ========================
603
- # 5) Main Entry Point
604
- # ========================
605
-
606
- if __name__ == "__main__":
607
- # Check if we're in a Hugging Face Space
608
- import os
609
-
610
- # Get environment variables for Hugging Face
611
- server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0")
612
- server_port = int(os.environ.get("GRADIO_SERVER_PORT", "7860"))
613
- share = os.environ.get("GRADIO_SHARE", "False").lower() == "true"
614
-
615
- # Create and launch the app
616
- with gr.Blocks(title="Codey Bryant 3.0", theme=gr.themes.Soft()) as demo:
617
-
618
- # Create and launch the demo
619
- demo = create_ui()
620
-
621
- logger.info(f"Starting Codey Bryant 3.0 on {server_name}:{server_port}")
622
- logger.info("SOTA RAG Architecture: HyDE + Query Rewriting + Multi-Query + Answer-Space Retrieval")
623
-
624
- demo.launch(
625
- server_name=server_name,
626
- server_port=server_port,
627
- share=False, # Set to True if you want a public link
628
- debug=False,
629
- show_error=True
630
- )
 
1
+ """
2
+ Codey Bryant 3.0 — SOTA RAG for Hugging Face Spaces
3
+ Maintains EXACT same architecture: HyDE + Query Rewriting + Multi-Query + Answer-Space Retrieval
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import logging
9
+ from dataclasses import dataclass
10
+ from typing import List, Dict, Tuple, Optional, Iterator
11
+ from functools import lru_cache
12
+ from threading import Thread
13
+ import warnings
14
+
15
+ # Configure logging for Hugging Face Spaces
16
+ logging.basicConfig(
17
+ level=logging.INFO,
18
+ format='%(asctime)s - %(levelname)s - %(message)s',
19
+ handlers=[
20
+ logging.StreamHandler(sys.stdout),
21
+ logging.FileHandler('/data/app.log')
22
+ ]
23
+ )
24
+ logger = logging.getLogger(__name__)
25
+ warnings.filterwarnings("ignore")
26
+
27
+ # Import core dependencies
28
+ import numpy as np
29
+ import torch
30
+ from datasets import load_dataset, Dataset
31
+ from sentence_transformers import SentenceTransformer
32
+ from rank_bm25 import BM25Okapi
33
+ from sklearn.cluster import MiniBatchKMeans
34
+ import spacy
35
+ from transformers import (
36
+ AutoTokenizer,
37
+ AutoModelForCausalLM,
38
+ GenerationConfig,
39
+ TextIteratorStreamer,
40
+ BitsAndBytesConfig,
41
+ )
42
+ import gradio as gr
43
+ import pickle
44
+ import json
45
+
46
+ # Try to import FAISS
47
+ try:
48
+ import faiss
49
+ FAISS_AVAILABLE = True
50
+ except ImportError:
51
+ FAISS_AVAILABLE = False
52
+ logger.warning("FAISS not available, using numpy fallback")
53
+
54
+ # Environment setup for Hugging Face Spaces
55
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
56
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
57
+
58
+ # Use persistent storage for Hugging Face Spaces
59
+ ARTIFACT_DIR = os.environ.get("ARTIFACT_DIR", "/data/artifacts")
60
+ os.makedirs(ARTIFACT_DIR, exist_ok=True)
61
+
62
+ # Paths for artifacts
63
+ LLM_ARTIFACT_PATH = os.path.join(ARTIFACT_DIR, "llm_model")
64
+ EMBED_ARTIFACT_PATH = os.path.join(ARTIFACT_DIR, "embed_model")
65
+ BM25_ARTIFACT_PATH = os.path.join(ARTIFACT_DIR, "bm25.pkl")
66
+ CORPUS_DATA_PATH = os.path.join(ARTIFACT_DIR, "corpus_data.json")
67
+ CORPUS_EMBED_PATH = os.path.join(ARTIFACT_DIR, "corpus_embeddings.npy")
68
+ ANSWER_EMBED_PATH = os.path.join(ARTIFACT_DIR, "answer_embeddings.npy")
69
+ FAISS_INDEX_PATH = os.path.join(ARTIFACT_DIR, "faiss_index.bin")
70
+
71
+ # Device configuration
72
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
73
+ if torch.cuda.is_available():
74
+ torch.backends.cuda.matmul.allow_tf32 = True
75
+ torch.backends.cudnn.benchmark = True
76
+ logger.info(f"Using GPU: {torch.cuda.get_device_name(0)}")
77
+ else:
78
+ logger.info("Using CPU")
79
+
80
+ # Model configuration (EXACT SAME AS BEFORE)
81
+ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
82
+ EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
83
+ MAX_CORPUS_SIZE = 600
84
+
85
+ # ========================
86
+ # 1) Dataset & Retrieval (EXACT SAME)
87
+ # ========================
88
+
89
+ def load_opc_datasets() -> Dict[str, Dataset]:
90
+ """Load coding datasets - same function"""
91
+ try:
92
+ logger.info("Loading OPC datasets...")
93
+ ds_instruct = load_dataset("OpenCoder-LLM/opc-sft-stage2", "educational_instruct", split="train")
94
+ ds_evol = load_dataset("OpenCoder-LLM/opc-sft-stage2", "evol_instruct", split="train")
95
+ return {"educational_instruct": ds_instruct, "evol_instruct": ds_evol}
96
+ except Exception as e:
97
+ logger.warning(f"OPC failed ({e}), falling back to python_code_instructions...")
98
+ ds = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
99
+ return {"python_code": ds}
100
+
101
+ def convo_to_io(example: Dict) -> Tuple[str, str]:
102
+ """Convert conversation to input/output - same function"""
103
+ if "messages" in example:
104
+ msgs = example["messages"]
105
+ elif "conversations" in example:
106
+ msgs = example["conversations"]
107
+ else:
108
+ instr = example.get("instruction") or example.get("prompt") or ""
109
+ inp = example.get("input") or ""
110
+ out = example.get("output") or example.get("response") or ""
111
+ return (instr + "\n" + inp).strip(), out
112
+
113
+ user_text, assistant_text = "", ""
114
+ for i, m in enumerate(msgs):
115
+ role = (m.get("role") or m.get("from") or "").lower()
116
+ content = m.get("content") or m.get("value") or ""
117
+ if role in ("user", "human") and not user_text:
118
+ user_text = content
119
+ if role in ("assistant", "gpt") and user_text:
120
+ assistant_text = content
121
+ break
122
+ return user_text.strip(), assistant_text.strip()
123
+
124
+ @dataclass
125
+ class RetrievalSystem:
126
+ """Retrieval system dataclass - same structure"""
127
+ embed_model: SentenceTransformer
128
+ bm25: BM25Okapi
129
+ corpus_texts: List[str]
130
+ corpus_answers: List[str]
131
+ corpus_embeddings: np.ndarray
132
+ answer_embeddings: np.ndarray
133
+ corpus_meta: List[Dict]
134
+ nlp: spacy.language.Language
135
+ faiss_index: Optional[any] = None
136
+
137
+ def build_retrieval_system(ds_map: Dict[str, Dataset]) -> RetrievalSystem:
138
+ """Build retrieval system - EXACT SAME IMPLEMENTATION"""
139
+ # Try to load from artifacts first
140
+ required = [EMBED_ARTIFACT_PATH, BM25_ARTIFACT_PATH, CORPUS_DATA_PATH, CORPUS_EMBED_PATH, ANSWER_EMBED_PATH]
141
+ if FAISS_AVAILABLE:
142
+ required.append(FAISS_INDEX_PATH)
143
+
144
+ if all(os.path.exists(p) for p in required):
145
+ logger.info("Loading retrieval system from artifacts...")
146
+ embed_model = SentenceTransformer(EMBED_ARTIFACT_PATH, device=str(DEVICE))
147
+ with open(BM25_ARTIFACT_PATH, "rb") as f:
148
+ bm25 = pickle.load(f)
149
+ with open(CORPUS_DATA_PATH, "r", encoding="utf-8") as f:
150
+ data = json.load(f)
151
+ corpus_embeddings = np.load(CORPUS_EMBED_PATH)
152
+ answer_embeddings = np.load(ANSWER_EMBED_PATH)
153
+ faiss_index = faiss.read_index(FAISS_INDEX_PATH) if FAISS_AVAILABLE and os.path.exists(FAISS_INDEX_PATH) else None
154
+ nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
155
+ return RetrievalSystem(
156
+ embed_model=embed_model, bm25=bm25,
157
+ corpus_texts=data["texts"], corpus_answers=data["answers"],
158
+ corpus_embeddings=corpus_embeddings, answer_embeddings=answer_embeddings,
159
+ corpus_meta=data["meta"], nlp=nlp, faiss_index=faiss_index
160
+ )
161
+
162
+ # Build from scratch (same implementation)
163
+ logger.info("Building retrieval system with answer-space support...")
164
+ all_questions, all_answers, all_metas = [], [], []
165
+ for name, ds in ds_map.items():
166
+ for ex in ds.select(range(min(len(ds), 1500))):
167
+ q, a = convo_to_io(ex)
168
+ if q and a and 50 < len(a) < 2000:
169
+ all_questions.append(q)
170
+ all_answers.append(a)
171
+ all_metas.append({"intent": name, "answer": a})
172
+
173
+ embed_model = SentenceTransformer(EMBED_MODEL, device=str(DEVICE))
174
+ question_embeddings = embed_model.encode(all_questions, batch_size=64, show_progress_bar=True, normalize_embeddings=True)
175
+ answer_embeddings = embed_model.encode(all_answers, batch_size=64, show_progress_bar=True, normalize_embeddings=True)
176
+
177
+ # Clustering to reduce size (same)
178
+ if len(all_questions) > MAX_CORPUS_SIZE:
179
+ kmeans = MiniBatchKMeans(n_clusters=MAX_CORPUS_SIZE, random_state=42, batch_size=1000)
180
+ labels = kmeans.fit_predict(answer_embeddings)
181
+ selected = []
182
+ for i in range(MAX_CORPUS_SIZE):
183
+ mask = labels == i
184
+ if mask.any():
185
+ idx = np.where(mask)[0]
186
+ dists = np.linalg.norm(answer_embeddings[idx] - kmeans.cluster_centers_[i], axis=1)
187
+ selected.append(idx[np.argmin(dists)])
188
+ idxs = selected
189
+ else:
190
+ idxs = list(range(len(all_questions)))
191
+
192
+ texts = [all_questions[i] for i in idxs]
193
+ answers = [all_answers[i] for i in idxs]
194
+ metas = [all_metas[i] for i in idxs]
195
+ q_embs = question_embeddings[idxs]
196
+ a_embs = answer_embeddings[idxs]
197
+
198
+ tokenized = [t.lower().split() for t in texts]
199
+ bm25 = BM25Okapi(tokenized)
200
+
201
+ faiss_index = None
202
+ if FAISS_AVAILABLE:
203
+ faiss_index = faiss.IndexFlatIP(a_embs.shape[1])
204
+ faiss_index.add(a_embs.astype('float32'))
205
+
206
+ # Save everything
207
+ embed_model.save(EMBED_ARTIFACT_PATH)
208
+ with open(BM25_ARTIFACT_PATH, "wb") as f:
209
+ pickle.dump(bm25, f)
210
+ with open(CORPUS_DATA_PATH, "w", encoding="utf-8") as f:
211
+ json.dump({"texts": texts, "answers": answers, "meta": metas}, f)
212
+ np.save(CORPUS_EMBED_PATH, q_embs)
213
+ np.save(ANSWER_EMBED_PATH, a_embs)
214
+ if faiss_index:
215
+ faiss.write_index(faiss_index, FAISS_INDEX_PATH)
216
+
217
+ nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
218
+ return RetrievalSystem(
219
+ embed_model=embed_model, bm25=bm25, corpus_texts=texts, corpus_answers=answers,
220
+ corpus_embeddings=q_embs, answer_embeddings=a_embs, corpus_meta=metas,
221
+ nlp=nlp, faiss_index=faiss_index
222
+ )
223
+
224
+ # ========================
225
+ # 2) Generative Core (EXACT SAME)
226
+ # ========================
227
+
228
+ @dataclass
229
+ class GenerativeCore:
230
+ """Generative core dataclass - same structure"""
231
+ model: AutoModelForCausalLM
232
+ tokenizer: AutoTokenizer
233
+ generation_config: GenerationConfig
234
+
235
+ def build_generative_core():
236
+ """Build generative core - EXACT SAME IMPLEMENTATION"""
237
+ # Always download fresh from HuggingFace for reliability
238
+ print("Downloading TinyLlama with 4-bit quantization...")
239
+
240
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
241
+ if tokenizer.pad_token is None:
242
+ tokenizer.pad_token = tokenizer.eos_token
243
+
244
+ tokenizer.chat_template = (
245
+ "{% for message in messages %}"
246
+ "{{'<|'+message['role']+'|>\\n'+message['content']+'</s>\\n'}}"
247
+ "{% endfor %}"
248
+ "{% if add_generation_prompt %}"
249
+ "<|assistant|>\n"
250
+ "{% endif %}"
251
+ )
252
+
253
+ quantization_config = None
254
+ if torch.cuda.is_available():
255
+ quantization_config = BitsAndBytesConfig(
256
+ load_in_4bit=True,
257
+ bnb_4bit_compute_dtype=torch.float32,
258
+ bnb_4bit_use_double_quant=True,
259
+ bnb_4bit_quant_type="nf4"
260
+ )
261
+
262
+ model = AutoModelForCausalLM.from_pretrained(
263
+ MODEL_NAME,
264
+ quantization_config=quantization_config,
265
+ device_map="auto" if torch.cuda.is_available() else None,
266
+ low_cpu_mem_usage=True
267
+ )
268
+ model.eval()
269
+
270
+ gen_cfg = GenerationConfig(
271
+ max_new_tokens=300,
272
+ temperature=0.7,
273
+ top_p=0.9,
274
+ do_sample=True,
275
+ repetition_penalty=1.15,
276
+ pad_token_id=tokenizer.pad_token_id
277
+ )
278
+
279
+ # Save for future use (optional)
280
+ if not os.path.exists(LLM_ARTIFACT_PATH):
281
+ os.makedirs(LLM_ARTIFACT_PATH, exist_ok=True)
282
+ tokenizer.save_pretrained(LLM_ARTIFACT_PATH)
283
+ gen_cfg.save_pretrained(LLM_ARTIFACT_PATH)
284
+
285
+ return GenerativeCore(model, tokenizer, gen_cfg)
286
+
287
+ # ========================
288
+ # 3) SOTA Enhanced Retrieval (EXACT SAME)
289
+ # ========================
290
+
291
+ class HybridCodeAssistant:
292
+ """Main assistant class - EXACT SAME IMPLEMENTATION"""
293
+ def __init__(self):
294
+ self.retrieval = build_retrieval_system(load_opc_datasets())
295
+ self.generator = build_generative_core()
296
+ logger.info("Codey Bryant 3.0 ready with HyDE + Query Rewriting + Multi-Query + Answer-Space Retrieval!")
297
+
298
+ def generate_hyde(self, query: str) -> str:
299
+ """Generate HyDE - same implementation"""
300
+ prompt = f"""Write a concise, direct Python code example or explanation that answers this question.
301
+ Only output the answer, no extra text.
302
+
303
+ Question: {query}
304
+
305
+ Answer:"""
306
+ inputs = self.generator.tokenizer(prompt, return_tensors="pt").to(DEVICE)
307
+ with torch.no_grad():
308
+ out = self.generator.model.generate(**inputs, max_new_tokens=128, temperature=0.3, do_sample=True)
309
+ return self.generator.tokenizer.decode(out[0], skip_special_tokens=True).split("Answer:")[-1].strip()
310
+
311
+ def rewrite_query(self, query: str) -> str:
312
+ """Rewrite query - same implementation"""
313
+ prompt = f"""Rewrite this vague or casual programming question into a clear, specific one for better code retrieval.
314
+
315
+ Original: {query}
316
+
317
+ Improved:"""
318
+ inputs = self.generator.tokenizer(prompt, return_tensors="pt").to(DEVICE)
319
+ with torch.no_grad():
320
+ out = self.generator.model.generate(**inputs, max_new_tokens=64, temperature=0.1)
321
+ return self.generator.tokenizer.decode(out[0], skip_special_tokens=True).split("Improved:")[-1].strip()
322
+
323
+ def retrieve_enhanced(self, query: str, k: int = 3) -> List[Tuple[str, Dict, float]]:
324
+ """Enhanced retrieval - EXACT SAME IMPLEMENTATION"""
325
+ # Use list of tuples instead of set to avoid hashability issues with dicts
326
+ results = []
327
+
328
+ def add_results(q_text: str, weight: float = 1.0):
329
+ try:
330
+ # Determine embedding space (answer for HyDE/long texts, question otherwise)
331
+ use_answer_space = "HyDE" in q_text or len(q_text.split()) > 20
332
+ target_embs = self.retrieval.answer_embeddings if use_answer_space else self.retrieval.corpus_embeddings
333
+
334
+ # Encode query
335
+ q_emb = self.retrieval.embed_model.encode(q_text, normalize_embeddings=True)
336
+
337
+ if self.retrieval.faiss_index is not None and use_answer_space:
338
+ # FAISS on answer space
339
+ query_vec = q_emb.astype('float32').reshape(1, -1)
340
+ scores_top, indices_top = self.retrieval.faiss_index.search(query_vec, min(k * 3, len(self.retrieval.corpus_texts)))
341
+ scores = scores_top[0]
342
+ idxs = indices_top[0]
343
+ else:
344
+ # Numpy fallback or question space
345
+ scores = np.dot(target_embs, q_emb)
346
+ idxs = np.argsort(-scores)[:k*3]
347
+
348
+ # Add BM25 if not answer space
349
+ if not use_answer_space:
350
+ tokenized_query = q_text.lower().split()
351
+ bm25_scores = self.retrieval.bm25.get_scores(tokenized_query)
352
+ if bm25_scores.max() > 0:
353
+ bm25_scores = (bm25_scores - bm25_scores.min()) / (bm25_scores.max() - bm25_scores.min())
354
+ else:
355
+ bm25_scores = np.zeros_like(bm25_scores)
356
+ scores = 0.3 * bm25_scores + 0.7 * scores # Hybrid
357
+
358
+ # Collect candidates (avoid duplicates by checking text)
359
+ seen_texts = set()
360
+ for score, idx in zip(scores, idxs):
361
+ if score > 0.15 and idx < len(self.retrieval.corpus_texts):
362
+ text = self.retrieval.corpus_texts[idx]
363
+ if text not in seen_texts:
364
+ seen_texts.add(text)
365
+ results.append((text, self.retrieval.corpus_meta[idx], float(score * weight)))
366
+ except Exception as e:
367
+ logger.error(f"add_results failed for '{q_text}': {e}")
368
+
369
+ # 1. Original query
370
+ add_results(query, weight=1.0)
371
+
372
+ # 2. Rewritten query
373
+ try:
374
+ rw = self.rewrite_query(query)
375
+ if len(rw) > 8 and rw != query:
376
+ add_results(rw, weight=1.2)
377
+ except Exception as e:
378
+ logger.warning(f"Rewrite failed: {e}")
379
+
380
+ # 3. HyDE (strong weight in answer space!)
381
+ try:
382
+ hyde = self.generate_hyde(query)
383
+ if len(hyde) > 20:
384
+ add_results(hyde, weight=1.5) # Note: No " HyDE" suffix needed now
385
+ except Exception as e:
386
+ logger.warning(f"HyDE failed: {e}")
387
+
388
+ # 4. Multi-query variants (lighter weight)
389
+ variants = [
390
+ f"Python code for: {query}",
391
+ f"Fix error: {query}",
392
+ f"Explain in Python: {query}",
393
+ f"Best way to {query} in Python",
394
+ ]
395
+ for v in variants:
396
+ add_results(v, weight=0.8)
397
+
398
+ # Rerank by similarity to original (no set needed)
399
+ if not results:
400
+ return []
401
+
402
+ q_emb = self.retrieval.embed_model.encode(query, normalize_embeddings=True)
403
+ final = []
404
+ for text, meta, score in results:
405
+ text_emb = self.retrieval.embed_model.encode(text, normalize_embeddings=True)
406
+ sim = float(np.dot(q_emb, text_emb))
407
+ final.append((text, meta, score + 0.3 * sim))
408
+
409
+ final.sort(key=lambda x: x[2], reverse=True)
410
+ return final[:k]
411
+
412
+ def answer_stream(self, text: str) -> Iterator[str]:
413
+ """Stream answer - same implementation"""
414
+ retrieved = self.retrieve_enhanced(text, k=3)
415
+
416
+ context = ""
417
+ if retrieved and retrieved[0][2] > 0.3:
418
+ q, meta, _ = retrieved[0]
419
+ ans = meta["answer"][:200]
420
+ context = f"Reference example:\nQ: {q}\nA: {ans}\n\n"
421
+
422
+ messages = [
423
+ {"role": "system", "content": "You are a concise, accurate Python coding assistant. Use the reference if helpful." + context},
424
+ {"role": "user", "content": text}
425
+ ]
426
+
427
+ prompt = self.generator.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
428
+ inputs = self.generator.tokenizer(prompt, return_tensors="pt").to(DEVICE)
429
+
430
+ streamer = TextIteratorStreamer(self.generator.tokenizer, skip_prompt=True, skip_special_tokens=True)
431
+ thread = Thread(target=self.generator.model.generate, kwargs=dict(
432
+ **inputs, streamer=streamer, generation_config=self.generator.generation_config
433
+ ))
434
+ thread.start()
435
+
436
+ for token in streamer:
437
+ yield token
438
+ thread.join()
439
+
440
+ # ========================
441
+ # 4) Gradio UI (Optimized for Hugging Face)
442
+ # ========================
443
+
444
+ ASSISTANT: Optional[HybridCodeAssistant] = None
445
+
446
+ def initialize_assistant():
447
+ """Initialize assistant with progress tracking"""
448
+ global ASSISTANT
449
+ if ASSISTANT is None:
450
+ yield "Initializing Codey Bryant 3.0..."
451
+ yield "Loading retrieval system..."
452
+ ASSISTANT = HybridCodeAssistant()
453
+ yield "Codey Bryant 3.0 Ready!"
454
+ yield "SOTA RAG Features: HyDE + Query Rewriting + Multi-Query + Answer-Space Retrieval"
455
+ yield "Ask coding questions like: 'it's not working', 'help with error', 'make it faster'"
456
+ else:
457
+ yield "Assistant already initialized!"
458
+
459
+ def chat(message: str, history: list):
460
+ """Chat function with error handling"""
461
+ if ASSISTANT is None:
462
+ yield "Please click 'Initialize Assistant' first!"
463
+ return
464
+
465
+ # Append user message
466
+ history.append([message, ""])
467
+ yield history
468
+
469
+ # Stream response
470
+ try:
471
+ response = ""
472
+ for token in ASSISTANT.answer_stream(message):
473
+ response += token
474
+ history[-1][1] = response
475
+ yield history
476
+ except Exception as e:
477
+ logger.error(f"Chat error: {e}")
478
+ history[-1][1] = f"Error: {str(e)}"
479
+ yield history
480
+
481
+ def create_ui():
482
+ """Create Gradio UI optimized for Hugging Face"""
483
+ with gr.Blocks(
484
+ title="Codey Bryant 3.0 - SOTA RAG Coding Assistant",
485
+ theme=gr.themes.Soft(),
486
+ css="""
487
+ .gradio-container { max-width: 1200px; margin: auto; }
488
+ .chatbot { min-height: 500px; }
489
+ .status-box { padding: 20px; border-radius: 10px; background: #f0f8ff; }
490
+ """
491
+ ) as demo:
492
+ gr.Markdown("""
493
+ # 🤖 Codey Bryant 3.0
494
+ ## **SOTA RAG Coding Assistant**
495
+
496
+ ### **Advanced Features:**
497
+ - **HyDE** (Hypothetical Document Embeddings)
498
+ - **Query Rewriting** for vague queries
499
+ - **Multi-Query** retrieval
500
+ - **Answer-Space Retrieval**
501
+
502
+ ### **Handles vague questions like:**
503
+ - "it's not working"
504
+ - "help with error"
505
+ - "make it faster"
506
+ - "why error"
507
+ - "how to implement"
508
+
509
+ ### **Powered by:**
510
+ - TinyLlama 1.1B (4-bit quantized)
511
+ - Hybrid retrieval (FAISS + BM25)
512
+ - OPC coding datasets
513
+ """)
514
+
515
+ with gr.Row():
516
+ with gr.Column(scale=1):
517
+ init_btn = gr.Button(
518
+ "Initialize Assistant",
519
+ variant="primary",
520
+ size="lg"
521
+ )
522
+ clear_btn = gr.Button("Clear Chat", size="lg")
523
+
524
+ with gr.Column(scale=4):
525
+ status = gr.Markdown(
526
+ "### Status: Click 'Initialize Assistant' to start",
527
+ elem_classes="status-box"
528
+ )
529
+
530
+ chatbot = gr.Chatbot(
531
+ label="Chat with Codey",
532
+ height=500,
533
+ show_label=True,
534
+ avatar_images=(None, "🤖"),
535
+ bubble_full_width=False
536
+ )
537
+
538
+ with gr.Row():
539
+ msg = gr.Textbox(
540
+ placeholder="Ask anything about Python coding...",
541
+ label="Your Question",
542
+ lines=3,
543
+ scale=5,
544
+ container=False
545
+ )
546
+ submit_btn = gr.Button("Send", variant="secondary", scale=1)
547
+
548
+ # Examples
549
+ gr.Examples(
550
+ examples=[
551
+ ["How to read a CSV file in Python?"],
552
+ ["Why am I getting 'list index out of range' error?"],
553
+ ["Make this function faster..."],
554
+ ["Help, my code isn't working!"],
555
+ ["Best way to sort a dictionary by value?"]
556
+ ],
557
+ inputs=msg,
558
+ label="Try these examples:"
559
+ )
560
+
561
+ # Event handlers
562
+ init_btn.click(
563
+ initialize_assistant,
564
+ outputs=status
565
+ )
566
+
567
+ def submit_message(message, history):
568
+ return "", history + [[message, None]]
569
+
570
+ msg.submit(
571
+ submit_message,
572
+ [msg, chatbot],
573
+ [msg, chatbot],
574
+ queue=False
575
+ ).then(
576
+ chat,
577
+ [msg, chatbot],
578
+ chatbot
579
+ )
580
+
581
+ submit_btn.click(
582
+ submit_message,
583
+ [msg, chatbot],
584
+ [msg, chatbot],
585
+ queue=False
586
+ ).then(
587
+ chat,
588
+ [msg, chatbot],
589
+ chatbot
590
+ )
591
+
592
+ clear_btn.click(lambda: None, None, chatbot, queue=False)
593
+
594
+ # Footer
595
+ gr.Markdown("""
596
+ ---
597
+ *Codey Bryant 3.0 uses TinyLlama 1.1B with 4-bit quantization. Responses may take a few seconds.*
598
+ """)
599
+
600
+ return demo
601
+
602
+ # ========================
603
+ # 5) Main Entry Point
604
+ # ========================
605
+
606
+ if __name__ == "__main__":
607
+ # Configure for Hugging Face Spaces
608
+ server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0")
609
+ server_port = int(os.environ.get("GRADIO_SERVER_PORT", 7860))
610
+
611
+ # Create and launch the demo
612
+ demo = create_ui()
613
+
614
+ logger.info(f"Starting Codey Bryant 3.0 on {server_name}:{server_port}")
615
+ logger.info("SOTA RAG Architecture: HyDE + Query Rewriting + Multi-Query + Answer-Space Retrieval")
616
+
617
+ demo.launch(
618
+ server_name=server_name,
619
+ server_port=server_port,
620
+ share=False, # Set to True if you want a public link
621
+ debug=False,
622
+ show_error=True
623
+ )
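
For readers skimming the diff, a minimal standalone sketch of the hybrid scoring that `retrieve_enhanced` applies in question space: min-max normalized BM25 blended 0.3/0.7 with cosine similarity from normalized embeddings. The three-question corpus is toy data, not the OPC corpus:

import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer

corpus = [
    "How to read a CSV file in Python?",
    "Why am I getting 'list index out of range'?",
    "Best way to sort a dictionary by value?",
]
query = "read a csv file with pandas"

embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
corpus_embs = embedder.encode(corpus, normalize_embeddings=True)    # unit-norm rows
q_emb = embedder.encode(query, normalize_embeddings=True)
dense_scores = corpus_embs @ q_emb                                  # cosine similarities

bm25 = BM25Okapi([t.lower().split() for t in corpus])
bm25_scores = bm25.get_scores(query.lower().split())
if bm25_scores.max() > 0:                                           # min-max normalize, as in add_results
    bm25_scores = (bm25_scores - bm25_scores.min()) / (bm25_scores.max() - bm25_scores.min())

hybrid = 0.3 * bm25_scores + 0.7 * dense_scores                     # same 0.3 / 0.7 blend as app.py
print(corpus[int(np.argmax(hybrid))])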
 
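A companion sketch of the answer-space index built in `build_retrieval_system`: normalized answer embeddings go into an exact inner-product FAISS index, so `search()` scores are cosine similarities. It reuses `embedder` from the sketch above; the answers are toy data:

import faiss

answers = [
    "Use pandas.read_csv('data.csv') to load a CSV into a DataFrame.",
    "An IndexError means the list position you asked for does not exist.",
    "sorted(d.items(), key=lambda kv: kv[1]) sorts a dict by its values.",
]
answer_embs = embedder.encode(answers, normalize_embeddings=True).astype("float32")

index = faiss.IndexFlatIP(answer_embs.shape[1])   # exact inner-product index, as in build_retrieval_system
index.add(answer_embs)

q = embedder.encode("how do I open a csv file", normalize_embeddings=True)
scores, idxs = index.search(q.astype("float32").reshape(1, -1), 2)
print(answers[idxs[0][0]], float(scores[0][0]))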
 
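The corpus-capping step in `build_retrieval_system` keeps one representative per cluster when the corpus exceeds MAX_CORPUS_SIZE. A self-contained sketch, with random vectors standing in for answer embeddings:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

rng = np.random.default_rng(42)
embs = rng.normal(size=(2000, 384))     # stand-in for answer embeddings
n_keep = 600                            # MAX_CORPUS_SIZE in app.py

kmeans = MiniBatchKMeans(n_clusters=n_keep, random_state=42, batch_size=1000)
labels = kmeans.fit_predict(embs)

selected = []
for i in range(n_keep):
    members = np.where(labels == i)[0]
    if members.size:
        dists = np.linalg.norm(embs[members] - kmeans.cluster_centers_[i], axis=1)
        selected.append(int(members[np.argmin(dists)]))   # keep the example nearest each centroid
print(len(selected), "representatives kept")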
 
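`answer_stream` streams tokens by running `generate()` on a background thread and iterating a `TextIteratorStreamer`. A minimal sketch of that pattern with the same TinyLlama checkpoint (CPU works, just slowly):

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tok = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name)

inputs = tok("How do I reverse a list in Python?", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

thread = Thread(target=model.generate,
                kwargs=dict(**inputs, streamer=streamer, max_new_tokens=64))
thread.start()
for piece in streamer:                  # tokens arrive incrementally, ready to yield to Gradio
    print(piece, end="", flush=True)
thread.join()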
 
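Finally, the HyDE step from `generate_hyde` / `retrieve_enhanced`: draft a short hypothetical answer with the chat model, embed the draft rather than the raw query, and search the answer-space index with it. This sketch deliberately reuses `tok`/`model` from the streaming sketch and `embedder`/`index`/`answers` from the FAISS sketch above:

import torch

query = "help, reading my file gives an error"
prompt = f"Write a concise Python answer.\n\nQuestion: {query}\n\nAnswer:"
inputs = tok(prompt, return_tensors="pt")
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=64, do_sample=True, temperature=0.3)
hyde_text = tok.decode(out[0], skip_special_tokens=True).split("Answer:")[-1].strip()

hyde_emb = embedder.encode(hyde_text, normalize_embeddings=True).astype("float32").reshape(1, -1)
scores, idxs = index.search(hyde_emb, 2)           # search the answer space, not the question space
print(answers[idxs[0][0]])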
 
 
 
requirements.txt CHANGED
@@ -1,3 +1,5 @@
 
 
1
  # Core ML/Deep Learning
2
  torch>=2.0.0
3
  transformers>=4.35.0
@@ -13,7 +15,6 @@ numpy>=1.24.0
13
 
14
  # NLP Processing
15
  spacy>=3.6.0
16
- https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl
17
 
18
  # Datasets
19
  datasets>=2.14.0
 
1
+ # requirements.txt
2
+
3
  # Core ML/Deep Learning
4
  torch>=2.0.0
5
  transformers>=4.35.0
 
15
 
16
  # NLP Processing
17
  spacy>=3.6.0
 
18
 
19
  # Datasets
20
  datasets>=2.14.0
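
One consequence of this requirements.txt change: the en_core_web_sm wheel is no longer pinned here, and the model is instead installed in the Dockerfile via `python -m spacy download en_core_web_sm`. If app.py is run outside the image, a runtime fallback like the sketch below keeps `spacy.load` from failing; this is a hedged suggestion, not part of the committed code:

import spacy

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    from spacy.cli import download
    download("en_core_web_sm")          # same effect as `python -m spacy download en_core_web_sm`
    nlp = spacy.load("en_core_web_sm")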