Text Generation
Transformers
Safetensors
Chinese
English
zhinao
qihoo360
奇虎360
360Zhinao
pretrain
conversational
custom_code
Instructions to use qihoo360/360Zhinao-7B-Chat-4K with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use qihoo360/360Zhinao-7B-Chat-4K with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("text-generation", model="qihoo360/360Zhinao-7B-Chat-4K", trust_remote_code=True) messages = [ {"role": "user", "content": "Who are you?"}, ] pipe(messages)# Load model directly from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained("qihoo360/360Zhinao-7B-Chat-4K", trust_remote_code=True, dtype="auto") - Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use qihoo360/360Zhinao-7B-Chat-4K with vLLM:
Install from pip and serve model
# Install vLLM from pip: pip install vllm # Start the vLLM server: vllm serve "qihoo360/360Zhinao-7B-Chat-4K" # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:8000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "qihoo360/360Zhinao-7B-Chat-4K", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }'Use Docker
docker model run hf.co/qihoo360/360Zhinao-7B-Chat-4K
- SGLang
How to use qihoo360/360Zhinao-7B-Chat-4K with SGLang:
Install from pip and serve model
# Install SGLang from pip: pip install sglang # Start the SGLang server: python3 -m sglang.launch_server \ --model-path "qihoo360/360Zhinao-7B-Chat-4K" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "qihoo360/360Zhinao-7B-Chat-4K", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }'Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "qihoo360/360Zhinao-7B-Chat-4K" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "qihoo360/360Zhinao-7B-Chat-4K", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }' - Docker Model Runner
How to use qihoo360/360Zhinao-7B-Chat-4K with Docker Model Runner:
docker model run hf.co/qihoo360/360Zhinao-7B-Chat-4K
| import re | |
| import torch | |
| import numpy as np | |
| from queue import Queue | |
| from typing import Tuple, List, Union, Iterable | |
| from transformers.utils import logging, add_start_docstrings | |
| from transformers.generation.logits_process import LogitsProcessor, LOGITS_PROCESSOR_INPUTS_DOCSTRING, LogitsProcessorList | |
| def make_context(model, tokenizer, | |
| messages: List[dict], | |
| system: str = "You are a helpful assistant.", | |
| max_new_tokens: int=0, | |
| ): | |
| max_new_tokens = max_new_tokens or model.generation_config.max_new_tokens | |
| max_input_length = model.config.model_max_length - max_new_tokens | |
| im_start_id = [tokenizer.im_start_id] | |
| im_end_id = [tokenizer.im_end_id] | |
| nl_tokens = tokenizer.encode("\n") | |
| def _tokenize_str(role, content): | |
| return tokenizer.encode(role, allowed_special=set()) + nl_tokens + tokenizer.encode(content, allowed_special=set()) | |
| def _parse_messages(messages): | |
| system, query, history = "", "", [] | |
| ## system | |
| if messages[0]["role"] == "system": | |
| system = messages[0]["content"] | |
| messages = messages[1:] | |
| ## query | |
| assert messages[-1]["role"] == "user" | |
| query = messages[-1]["content"] | |
| messages = messages[:-1] | |
| ## history | |
| assert len(messages) % 2 == 0 | |
| for i in range(0, len(messages), 2): | |
| assert messages[i]["role"] == "user" and messages[i+1]["role"] == "assistant" | |
| history.append([messages[i]["content"], messages[i+1]["content"]]) | |
| return system, query, history | |
| _system, query, history = _parse_messages(messages) | |
| ## system | |
| system_text = _system if _system != "" else system | |
| system_tokens = [] | |
| if system_text: | |
| system_tokens = im_start_id + _tokenize_str("system", system_text) + im_end_id + nl_tokens | |
| ## query | |
| query_tokens = im_start_id + _tokenize_str("user", query) + im_end_id + nl_tokens | |
| ## final assistant | |
| final_tokens = im_start_id + tokenizer.encode("assistant", allowed_special=set()) + nl_tokens | |
| ## max_history_tokens | |
| max_history_length = max_input_length - len(system_tokens) - len(query_tokens) - len(final_tokens) | |
| ## history | |
| context_tokens = [] | |
| for turn_query, turn_response in reversed(history): | |
| ## query tokens | |
| history_query_tokens = im_start_id + _tokenize_str("user", turn_query) + im_end_id + nl_tokens | |
| ## answer tokens | |
| histroy_response_tokens = im_start_id + _tokenize_str("assistant", turn_response) + im_end_id + nl_tokens | |
| ## this round tokens | |
| next_context_tokens = history_query_tokens + histroy_response_tokens | |
| ## concat | |
| current_context_size = len(next_context_tokens) + len(context_tokens) | |
| if current_context_size < max_history_length: | |
| context_tokens = next_context_tokens + context_tokens | |
| else: | |
| break | |
| input_tokens = system_tokens + context_tokens + query_tokens + final_tokens | |
| return torch.LongTensor([input_tokens]).to(model.device) | |
| def parse_pot_no_stream(inputs): | |
| try: | |
| s = re.findall(r'<<(.*?)>>', inputs, re.DOTALL) | |
| if not s: | |
| #print("err inputs: ", origin_inputs, flush=True) | |
| return inputs | |
| index = 0 | |
| for k in s: | |
| try: | |
| if "func" in k: | |
| var = k.split("=", 1) | |
| try: | |
| var[1] = var[1].strip(" ") | |
| exec(var[1], globals()) | |
| ans = func() | |
| except: | |
| if 'sympy' in var[1]: | |
| var[1] = var[1].replace('res[x]', 'res[0][0]').replace('res[y]', 'res[0][1]') | |
| exec(var[1], globals()) | |
| ans = func() | |
| pass | |
| var_list = [c.strip(" ") for c in var[0].split(",")] | |
| if len(var_list) == 1: | |
| ans = [ans] | |
| for i in range(len(ans)): | |
| try: | |
| ans[i] = float(ans[i]) | |
| if abs(ans[i] - int(ans[i])) < 1e-10: | |
| ans[i] = str(int(ans[i])) | |
| except: | |
| pass | |
| inputs = inputs.replace("<<"+k+">>", "") | |
| for i in range(len(var_list)): | |
| inputs = inputs.replace(var_list[i], str(ans[i])) | |
| index += 1 | |
| for c in range(index, len(s)): | |
| for i in range(len(var_list)): | |
| s[c] = s[c].replace(var_list[i], str(ans[i])) | |
| else: | |
| var = k.replace(" ", "").split("=") | |
| var[1] = var[1].replace("eval", "") | |
| ans = round(eval(var[1]), 10) | |
| ans = float(ans) | |
| if abs(ans - int(ans)) < 1e-10: | |
| ans = str(int(ans)) | |
| inputs = inputs.replace("<<"+k+">>", "").replace(var[0], str(ans)) | |
| index += 1 | |
| for c in range(index, len(s)): | |
| s[c] = s[c].replace(var[0], str(ans)) | |
| except: | |
| return inputs | |
| except Exception as e: | |
| return inputs | |
| return inputs | |
| class TextIterStreamer: | |
| def __init__(self, tokenizer, skip_prompt=False, skip_special_tokens=False, use_pot=True): | |
| self.tokenizer = tokenizer | |
| self.skip_prompt = skip_prompt | |
| self.skip_special_tokens = skip_special_tokens | |
| self.tokens = [] | |
| self.text_queue = Queue() | |
| self.next_tokens_are_prompt = True | |
| self.use_pot = use_pot | |
| def put(self, value): | |
| if self.skip_prompt and self.next_tokens_are_prompt: | |
| self.next_tokens_are_prompt = False | |
| else: | |
| if len(value.shape) > 1: | |
| value = value[0] | |
| self.tokens.extend(value.tolist()) | |
| tokens_str = self.tokenizer.decode(self.tokens, skip_special_tokens=self.skip_special_tokens, errors='ignore') | |
| if self.use_pot: | |
| tokens_str = parse_pot_no_stream(tokens_str) | |
| self.text_queue.put(tokens_str) | |
| def end(self): | |
| self.text_queue.put(None) | |
| def __iter__(self): | |
| return self | |
| def __next__(self): | |
| value = self.text_queue.get() | |
| if value is None: | |
| raise StopIteration() | |
| else: | |
| return value | |
| class OutputRepetitionPenaltyLogitsProcessor(LogitsProcessor): | |
| r""" | |
| [`OutputLogitsProcessor`] that prevents the repetition of previous tokens through a penalty. This penalty is applied at | |
| most once per token. Note that, for decoder-only models like most LLMs, the considered tokens include the prompt. | |
| In the original [paper](https://arxiv.org/pdf/1909.05858.pdf), the authors suggest the use of a penalty of around | |
| 1.2 to achieve a good balance between truthful generation and lack of repetition. To penalize and reduce | |
| repetition, use `penalty` values above 1.0, where a higher value penalizes more strongly. To reward and encourage | |
| repetition, use `penalty` values between 0.0 and 1.0, where a lower value rewards more strongly. | |
| Args: | |
| penalty (`float`): | |
| The parameter for repetition penalty. 1.0 means no penalty. Above 1.0 penalizes previously generated | |
| tokens. Between 0.0 and 1.0 rewards previously generated tokens. | |
| """ | |
| def __init__(self, input_length: int, | |
| presence_penalties: float = 1.0, | |
| frequency_penalties: float = 0, | |
| repetition_penalties: float = 0): | |
| if not (repetition_penalties > 0): | |
| raise ValueError(f"`repetition_penalties` has to be a strictly positive float, but is {repetition_penalties}") | |
| if not ( (frequency_penalties >= -2) and (frequency_penalties <= 2) ): | |
| raise ValueError(f"`frequency_penalties` has to be [-2, 2], but is {frequency_penalties}") | |
| if not ( (presence_penalties >= -2) and (presence_penalties <= 2) ): | |
| raise ValueError(f"`presence_penalties` has to be [-2, 2], but is {presence_penalties}") | |
| self.repetition_penalties = repetition_penalties | |
| self.frequency_penalties = frequency_penalties | |
| self.presence_penalties = presence_penalties | |
| self.input_length = input_length | |
| def _get_bin_counts_and_mask( | |
| self, | |
| tokens: torch.Tensor, | |
| vocab_size: int, | |
| num_seqs: int, | |
| ) -> Tuple[torch.Tensor, torch.Tensor]: | |
| # Compute the bin counts for the tokens. | |
| # vocab_size + 1 for padding. | |
| bin_counts = torch.zeros((num_seqs, vocab_size + 1), | |
| dtype=torch.long, | |
| device=tokens.device) | |
| bin_counts.scatter_add_(1, tokens, torch.ones_like(tokens)) | |
| bin_counts = bin_counts[:, :vocab_size] | |
| mask = bin_counts > 0 | |
| return bin_counts, mask | |
| def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor: | |
| prompt_tokens_tensor = input_ids[:, :self.input_length+1] | |
| output_tokens_tensor = input_ids[:, self.input_length+1:] | |
| num_seqs, vocab_size = logits.shape | |
| _, prompt_mask = self._get_bin_counts_and_mask( | |
| prompt_tokens_tensor, vocab_size, num_seqs) | |
| output_bin_counts, output_mask = self._get_bin_counts_and_mask( | |
| output_tokens_tensor, vocab_size, num_seqs) | |
| repetition_penalties = torch.Tensor([self.repetition_penalties]).to(logits.device) | |
| frequency_penalties = torch.Tensor([self.frequency_penalties]).to(logits.device) | |
| presence_penalties = torch.Tensor([self.presence_penalties]).to(logits.device) | |
| repetition_penalties = repetition_penalties[:, None].repeat(1, vocab_size) | |
| repetition_penalties[~(prompt_mask | output_mask)] = 1.0 | |
| logits = torch.where(logits > 0, logits / repetition_penalties, | |
| logits * repetition_penalties) | |
| # We follow the definition in OpenAI API. | |
| # Refer to https://platform.openai.com/docs/api-reference/parameter-details | |
| logits -= frequency_penalties.unsqueeze_(dim=1) * output_bin_counts | |
| logits -= presence_penalties.unsqueeze_(dim=1) * output_mask | |
| return logits |