Generative AI

PrismML Bonsai 1-Bit LLM Coding Tutorial in CUDA with GGUF, Benchmarking, Chat, JSON, and RAG

# Section header (``section`` is a helper defined earlier in this script).
section("7 · Q1_0_g128 Quantization — What's Happening Under the Hood")


# Banner explaining the Q1_0_g128 format: one sign bit per weight plus one
# FP16 scale shared by each group of 128 weights → 1 + 16/128 = 1.125 bits
# per weight.  ``textwrap.dedent`` strips the literal's leading whitespace.
print(textwrap.dedent("""
╔══════════════════════════════════════════════════════════════╗
║           Bonsai Q1_0_g128 Weight Representation            ║
╠══════════════════════════════════════════════════════════════╣
║  Each weight = 1 bit:  0  →  −scale                         ║
║                        1  →  +scale                         ║
║  Every 128 weights share one FP16 scale factor.             ║
║                                                              ║
║  Effective bits per weight:                                  ║
║    1 bit (sign) + 16/128 bits (shared scale) = 1.125 bpw    ║
║                                                              ║
║  Memory comparison for Bonsai-1.7B:                         ║
║    FP16:            3.44 GB  (1.0×  baseline)               ║
║    Q1_0_g128:       0.24 GB  (14.2× smaller!)               ║
║    MLX 1-bit g128:  0.27 GB  (12.8× smaller)                ║
╚══════════════════════════════════════════════════════════════╝
"""))


# --- Pure-Python demonstration of Q1_0_g128 quantization --------------------
# Each group of 128 weights collapses to one sign bit per weight plus a single
# shared scale (max |w| of the group; FP16 in the on-disk format).
# Dequantization maps bit 1 → +scale and bit 0 → −scale.
# NOTE: the scraped original had its backslashes stripped ("…logic:n") — the
# "\n" escapes are restored here.
print("📐 Python demo of Q1_0_g128 quantization logic:\n")
import random

random.seed(42)  # deterministic demo output
GROUP_SIZE   = 128
weights_fp16 = [random.gauss(0, 0.1) for _ in range(GROUP_SIZE)]
scale        = max(abs(w) for w in weights_fp16)           # per-group scale
quantized    = [1 if w >= 0 else 0 for w in weights_fp16]  # sign bit only
dequantized  = [scale if b == 1 else -scale for b in quantized]
# Mean-squared reconstruction error of the 1-bit round trip.
mse          = sum((a - b) ** 2 for a, b in zip(weights_fp16, dequantized)) / GROUP_SIZE


print(f"  FP16 weights (first 8): {[f'{w:.4f}' for w in weights_fp16[:8]]}")
print(f"  1-bit repr  (first 8): {quantized[:8]}")
print(f"  Shared scale:          {scale:.4f}")
print(f"  Dequantized (first 8): {[f'{w:.4f}' for w in dequantized[:8]]}")
print(f"  MSE of reconstruction: {mse:.6f}")
# Bytes per 128-weight group: FP16 = 128 × 2 B of data; Q1_0_g128 = 128/8 B
# of sign bits + 2 B for the shared FP16 scale.
memory_fp16 = GROUP_SIZE * 2
memory_1bit = GROUP_SIZE / 8 + 2
print(f"\n  Memory: FP16={memory_fp16}B  vs  Q1_0_g128={memory_1bit:.1f}B  "
      f"({memory_fp16/memory_1bit:.1f}× reduction)")


section("8 · Performance Benchmark — Tokens per Second")


def benchmark(prompt, n_tokens=128, n_runs=3, **kw):
    """Measure generation throughput for *prompt* and return the average.

    Calls ``infer`` (defined earlier in this script) ``n_runs`` times,
    generating ``n_tokens`` tokens per run, and prints each run's speed.
    Extra keyword arguments are forwarded to ``infer``.

    Returns:
        float: mean tokens/second across all runs.
    """
    timings = []
    for i in range(n_runs):
        print(f"   Run {i+1}/{n_runs} …", end=" ", flush=True)
        # infer() returns (output_text, elapsed_seconds); text is unused here.
        _, elapsed = infer(prompt, verbose=False, n_predict=n_tokens, **kw)
        tps = n_tokens / elapsed
        timings.append(tps)
        print(f"{tps:.1f} tok/s")
    avg = sum(timings) / len(timings)
    # "\n" restored here — the scraped original had lost the backslash.
    print(f"\n  ✅ Average: {avg:.1f} tok/s  (over {n_runs} runs, {n_tokens} tokens each)")
    return avg


# Run the benchmark on the local GPU, then compare the measured TG128
# (128-token text generation) throughput against published numbers.
print("📊 Benchmarking Bonsai-1.7B on your GPU …")
tps = benchmark(
    "Explain the concept of neural network backpropagation step by step.",
    n_tokens=128, n_runs=3,
)


# "\n" restored on the first print — the scraped original had lost it.
print("\n  Published reference throughputs (from whitepaper):")
print("  ┌──────────────────────┬─────────┬──────────────┐")
print("  │ Platform             │ Backend │ TG128 tok/s  │")
print("  ├──────────────────────┼─────────┼──────────────┤")
print("  │ RTX 4090             │ CUDA    │     674      │")
print("  │ M4 Pro 48 GB         │ Metal   │     250      │")
print(f"  │ Your GPU (measured)  │ CUDA    │  {tps:>7.1f}    │")
print("  └──────────────────────┴─────────┴──────────────┘")


section("9 · Multi-Turn Chat with Context Accumulation")


def chat(user_msg, system="You are a helpful assistant.", history=None, **kw):
    """Send one chat turn to the model; return ``(reply, updated_history)``.

    Builds a ChatML-style prompt from *system* plus the accumulated
    ``(role, message)`` pairs in *history*, shells out to ``llama-cli``,
    and appends both the user turn and the assistant reply to *history*.

    ``history=None`` (rather than a mutable default) starts a fresh
    conversation.  Extra keyword arguments are accepted for interface
    compatibility but are not forwarded — the sampling flags are fixed below.
    """
    if history is None:
        history = []
    history.append(("user", user_msg))
    # ChatML template: each turn is "<|im_start|>role\n…<|im_end|>\n".
    # The "\n" escapes were lost in the scraped original and are restored here.
    full = f"<|im_start|>system\n{system}<|im_end|>\n"
    for role, msg in history:
        full += f"<|im_start|>{role}\n{msg}<|im_end|>\n"
    full += "<|im_start|>assistant\n"
    # Escape for embedding inside a double-quoted shell argument: quotes get a
    # backslash, real newlines become literal "\n" which llama-cli re-expands
    # because of the -e flag.  (The original's replace('"', '\"') was a no-op.)
    # NOTE(review): building a shell command string is injection-prone if
    # user_msg is untrusted — prefer run([...], shell=False) with an arg list.
    safe = full.replace('"', '\\"').replace('\n', '\\n')
    cmd = (
        f'{LLAMA_CLI} -m "{MODEL_PATH}"'
        f' -p "{safe}" -e'
        f' -n 200 --temp 0.5 --top-p 0.85 --top-k 20'
        f' -ngl 99 -c 4096 --no-display-prompt'
    )
    result = run(cmd, capture=True, check=False)
    reply = result.stdout.strip()
    history.append(("assistant", reply))
    return reply, history


# Drive a 3-turn conversation; ``history`` accumulates context across turns so
# later questions can refer back to earlier answers.
# "\n" escapes restored on both prints — the scraped original had lost them.
print("🗣  Starting a 3-turn conversation about 1-bit models …\n")
history = []
turns = [
    "What is a 1-bit language model?",
    "What are the main trade-offs compared to 4-bit or 8-bit quantization?",
    "How does Bonsai specifically address those trade-offs?",
]
for i, msg in enumerate(turns, 1):
    print(f"👤 Turn {i}: {msg}")
    reply, history = chat(msg, history=history)
    print(f"🤖 Bonsai: {reply}\n")
    time.sleep(0.5)  # brief pause so the transcript is readable live


section("10 · Sampling Parameter Exploration")


# Same prompt under four sampling regimes, from near-greedy to high entropy,
# to show how temperature / top-k / top-p shape the output.
creative_prompt = "Write a one-sentence description of a futuristic city powered entirely by 1-bit AI."
configs = [
    ("Precise / Focused",  dict(temp=0.1, top_k=10,  top_p=0.70)),
    ("Balanced (default)", dict(temp=0.5, top_k=20,  top_p=0.85)),
    ("Creative / Varied",  dict(temp=0.9, top_k=50,  top_p=0.95)),
    ("High entropy",       dict(temp=1.2, top_k=100, top_p=0.98)),
]


# "\n" escapes restored — the scraped original had lost the backslashes.
print(f'Prompt: "{creative_prompt}"\n')
for label, params in configs:
    out, _ = infer(creative_prompt, verbose=False, n_predict=80, **params)
    print(f"  [{label}]")
    print(f"    temp={params['temp']}, top_k={params['top_k']}, top_p={params['top_p']}")
    print(f"    → {out[:200]}\n")

Source link

Related Articles

Leave a Reply

Your email address will not be published. Required fields are marked *

Back to top button