Instructions to use 1bitLLM/bitnet_b1_58-large with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use 1bitLLM/bitnet_b1_58-large with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="1bitLLM/bitnet_b1_58-large")

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("1bitLLM/bitnet_b1_58-large")
model = AutoModelForCausalLM.from_pretrained("1bitLLM/bitnet_b1_58-large")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use 1bitLLM/bitnet_b1_58-large with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "1bitLLM/bitnet_b1_58-large"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "1bitLLM/bitnet_b1_58-large",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker
docker model run hf.co/1bitLLM/bitnet_b1_58-large
- SGLang
How to use 1bitLLM/bitnet_b1_58-large with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "1bitLLM/bitnet_b1_58-large" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "1bitLLM/bitnet_b1_58-large",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "1bitLLM/bitnet_b1_58-large" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "1bitLLM/bitnet_b1_58-large",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
- Docker Model Runner
How to use 1bitLLM/bitnet_b1_58-large with Docker Model Runner:
docker model run hf.co/1bitLLM/bitnet_b1_58-large
| import math | |
| import argparse | |
| import torch | |
| import random | |
| from eval_utils import get_test_dataset | |
| from .modeling_bitnet import BitnetForCausalLM | |
| from .tokenization_bitnet import BitnetTokenizer | |
| from tqdm import tqdm | |
# This script only runs forward passes, so autograd is switched off globally.
torch.set_grad_enabled(False)

# Command-line interface: RNG seed, checkpoint to load, and the sequence
# length passed to the test-set builder.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--seed",
    type=int,
    default=0,
)
parser.add_argument(
    "--hf_path",
    type=str,
    default="1bitLLM/bitnet_b1_58-3B",
)
parser.add_argument(
    "--seqlen",
    type=int,
    default=2048,
)
def calulate_loss(model, input, loss_fct):
    """Return the next-token prediction loss of ``model`` on ``input``.

    The model is run once on the whole batch. The logits at position ``t``
    are scored against the token at position ``t + 1``, so the last logit
    and the first token of every row are dropped before the loss is taken.

    Args:
        model: Callable invoked as ``model(input, use_cache=False,
            output_hidden_states=False, output_attentions=False)``; element
            0 of its result is used as the logits and is indexed as
            ``(batch, seq_len, vocab)``.
        input: 2-D tensor of token ids, indexed as ``(batch, seq_len)``.
        loss_fct: Callable taking ``(logits, labels)`` with logits flattened
            to ``(batch * (seq_len - 1), vocab)`` and labels flattened to
            ``(batch * (seq_len - 1),)``.

    Returns:
        The value returned by ``loss_fct``.
    """
    logits = model(
        input,
        use_cache=False,
        output_hidden_states=False,
        output_attentions=False,
    )[0]

    # Score in float32: a loss summed over a whole sequence loses precision
    # (and can overflow) when accumulated in half precision.
    shift_logits = logits[:, :-1, :].float()

    # The labels are a slice of the input, so they live wherever the caller
    # put the input; the logits may come back on another device when the
    # model is sharded, so align them explicitly.
    shift_labels = input[:, 1:].to(shift_logits.device)

    # ``reshape`` rather than ``view``: the column slice above is not
    # contiguous for batch sizes > 1 and ``view`` would raise on it.
    return loss_fct(
        shift_logits.reshape(-1, shift_logits.size(-1)),
        shift_labels.reshape(-1),
    )
def main(args):
    """Compute and print perplexity on the 'c4' and 'wikitext2' test sets.

    Loads the model and tokenizer from ``args.hf_path``, scores every sample
    returned by ``get_test_dataset`` one at a time, and prints the
    perplexity per dataset, the list of perplexities, and their mean.

    Args:
        args: Parsed command-line namespace; ``hf_path`` and ``seqlen`` are
            read from it.
    """
    datasets = ['c4', 'wikitext2']
    model = BitnetForCausalLM.from_pretrained(
        args.hf_path,
        device_map='auto',
        low_cpu_mem_usage=True,
        use_flash_attention_2=True,
        torch_dtype=torch.float16,
    ).half()
    tokenizer = BitnetTokenizer.from_pretrained(args.hf_path, use_fast=False)
    # Summed (not averaged) loss so it can be normalised by the exact number
    # of predicted tokens after all samples have been accumulated.
    loss_fct = torch.nn.CrossEntropyLoss(reduction="sum").cuda()

    ppl = []
    for dataset in datasets:
        testdata = get_test_dataset(dataset, tokenizer, seqlen=args.seqlen)
        acc_loss, count = 0.0, 0
        progress = tqdm(range(len(testdata)))
        for ii in progress:
            # Build the id tensor directly as int64; going through the
            # float32 ``torch.Tensor`` constructor would round large ids.
            input_ids = (
                torch.as_tensor(testdata[ii], dtype=torch.long)
                .cuda()
                .reshape(1, -1)
            )
            loss = calulate_loss(model, input_ids, loss_fct)
            # Each sample of length n yields n - 1 next-token predictions.
            count += input_ids.size(-1) - 1
            acc_loss += loss.item()
            # NOTE: raises ZeroDivisionError while ``count`` is still 0,
            # i.e. if the samples seen so far contain a single token.
            progress.set_description(f"avg_loss = {acc_loss / count / math.log(2)}")

        # Convert the mean loss from nats to bits, then exponentiate in
        # base 2 to get perplexity. An empty test set raises
        # ZeroDivisionError here.
        avg_loss = acc_loss / count / math.log(2)
        ppl.append(2 ** avg_loss)
        print("{} PPL: {}".format(dataset, ppl[-1]))

    print(ppl)
    print("Avg PPL:", sum(ppl) / len(ppl))
if __name__ == '__main__':
    args = parser.parse_args()

    # Evaluation only: keep autograd disabled for everything that follows.
    torch.set_grad_enabled(False)

    # Seed Python's and torch's RNGs from the same CLI value.
    random.seed(args.seed)
    torch.random.manual_seed(args.seed)

    main(args)