---
license: apache-2.0
datasets:
- BEE-spoke-data/pypi_clean-deduped
base_model: BEE-spoke-data/smol_llama-101M-GQA
language:
- en
tags:
- python
- codegen
- markdown
- smol_llama
metrics:
- accuracy
inference:
parameters:
max_new_tokens: 48
min_new_tokens: 8
num_beams: 3
early_stopping: true
repetition_penalty: 1.1
no_repeat_ngram_size: 6
renormalize_logits: true
widget:
- text: |
def add_numbers(a, b):
return
example_title: Add Numbers Function
- text: |
class Car:
def __init__(self, make, model):
self.make = make
self.model = model
def display_car(self):
example_title: Car Class
- text: |
import pandas as pd
data = {'Name': ['Tom', 'Nick', 'John'], 'Age': [20, 21, 19]}
df = pd.DataFrame(data).convert_dtypes()
# eda
example_title: Pandas DataFrame
- text: |
def factorial(n):
if n == 0:
return 1
else:
example_title: Factorial Function
- text: |
def fibonacci(n):
if n <= 0:
raise ValueError("Incorrect input")
elif n == 1:
return 0
elif n == 2:
return 1
else:
example_title: Fibonacci Function
- text: |
import matplotlib.pyplot as plt
import numpy as np
x = np.linspace(0, 10, 100)
# simple plot
example_title: Matplotlib Plot
- text: |
def reverse_string(s:str) -> str:
return
example_title: Reverse String Function
- text: |
def is_palindrome(word:str) -> bool:
return
example_title: Palindrome Function
- text: |
def bubble_sort(lst: list):
n = len(lst)
for i in range(n):
for j in range(0, n-i-1):
example_title: Bubble Sort Function
- text: |
def binary_search(arr, low, high, x):
if high >= low:
mid = (high + low) // 2
if arr[mid] == x:
return mid
elif arr[mid] > x:
example_title: Binary Search Function
---
# smol_llama-101M-GQA: python
<a href="https://colab.research.google.com/gist/pszemraj/91b5a267df95461b46922e6c0212e8f7/beecoder-basic-test-notebook.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
> 400MB of buzz: pure Python programming nectar! 🍯
This model is the general pre-trained checkpoint `BEE-spoke-data/smol_llama-101M-GQA`, trained for one additional epoch on a deduped version of `pypi`. Play with the model in [this demo space](https://huggingface.co/spaces/BEE-spoke-data/beecoder-playground).

- Its architecture is the same as the base model, with some new Python-related tokens added to the vocab prior to training.
- It can generate basic Python code and README-style markdown, but it will struggle with harder planning/reasoning tasks.
- This is an experiment to test what smol-sized models can do for code generation, probing **both** their capabilities and limitations.

Use with care & understand that there may be some bugs 🐛 still to be worked out.
## Usage
Be sure to note:
1. The model uses the "slow" llama2 tokenizer. Set `use_fast=False` when loading the tokenizer.
2. Use `transformers` library version 4.33.3, due to a known issue in version 4.34.1 (_at time of writing_).
> Which llama2 tokenizer the API widget uses is an age-old mystery, and may cause minor whitespace issues (widget only).
To install the necessary packages and load the model:
```python
# Install necessary packages
# pip install transformers==4.33.3 accelerate sentencepiece
from transformers import AutoTokenizer, AutoModelForCausalLM
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(
"BEE-spoke-data/smol_llama-101M-GQA-python",
use_fast=False,
)
model = AutoModelForCausalLM.from_pretrained(
"BEE-spoke-data/smol_llama-101M-GQA-python",
device_map="auto",
)
# The model can now be used as any other decoder
```
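Continuing from the snippet above, here is a minimal completion sketch; the decoding parameters mirror the widget config at the top of this card, and the prompt is just an arbitrary example:

```python
# minimal sketch: complete a simple function stub, using the
# same decoding parameters as the widget config above
prompt = "def add_numbers(a, b):\n    return"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=48,
    min_new_tokens=8,
    num_beams=3,
    early_stopping=True,
    repetition_penalty=1.1,
    no_repeat_ngram_size=6,
    renormalize_logits=True,
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```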
### longer code-gen example
Below is a quick script that can be used as a reference/starting point for writing your own, better one :)
<details>
<summary>🔥 Unleash the Power of Code Generation! Click to Reveal the Magic! 🔮</summary>
Are you ready to witness the incredible possibilities of code generation? Brace yourself for an exceptional journey into the world of artificial intelligence and programming. Observe a script that will change the way you create and finalize code.
This script provides entry to a planet where machines can write code with remarkable precision and imagination.
```python
"""
simple script for testing model(s) designed to generate/complete code
See details/args with the below.
python textgen_inference_code.py --help
"""
import logging
import random
import time
from pathlib import Path
import fire
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
logging.basicConfig(format="%(levelname)s - %(message)s", level=logging.INFO)
class Timer:
"""
Basic timer utility.
"""
def __enter__(self):
self.start_time = time.perf_counter()
return self
def __exit__(self, exc_type, exc_value, traceback):
self.end_time = time.perf_counter()
self.elapsed_time = self.end_time - self.start_time
logging.info(f"Elapsed time: {self.elapsed_time:.4f} seconds")
def load_model(model_name, use_fast=False):
""" util for loading model and tokenizer"""
logging.info(f"Loading model: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=use_fast)
model = AutoModelForCausalLM.from_pretrained(
model_name, torch_dtype="auto", device_map="auto"
)
model = torch.compile(model)
return tokenizer, model
def run_inference(prompt, model, tokenizer, max_new_tokens: int = 256):
"""
run_inference
Args:
prompt (TYPE): Description
model (TYPE): Description
tokenizer (TYPE): Description
max_new_tokens (int, optional): Description
Returns:
TYPE: Description
"""
logging.info(f"Running inference with max_new_tokens={max_new_tokens} ...")
with Timer() as timer:
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
**inputs,
max_new_tokens=max_new_tokens,
min_new_tokens=8,
renormalize_logits=True,
no_repeat_ngram_size=8,
repetition_penalty=1.04,
num_beams=4,
early_stopping=True,
)
text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
logging.info(f"Output text:\n\n{text}")
return text
def main(
model_name="BEE-spoke-data/smol_llama-101M-GQA-python",
    prompt: str = None,
use_fast=False,
n_tokens: int = 256,
):
"""Summary
Args:
model_name (str, optional): Description
prompt (None, optional): specify the prompt directly (default: random choice from list)
n_tokens (int, optional): max new tokens to generate
"""
logging.info(f"Inference with:\t{model_name}, max_new_tokens:{n_tokens}")
if prompt is None:
prompt_list = [
'''
def print_primes(n: int):
"""
Print all primes between 1 and n
"""''',
"def quantum_analysis(",
"def sanitize_filenames(target_dir:str, recursive:False, extension",
]
prompt = random.SystemRandom().choice(prompt_list)
logging.info(f"Using prompt:\t{prompt}")
tokenizer, model = load_model(model_name, use_fast=use_fast)
run_inference(prompt, model, tokenizer, n_tokens)
if __name__ == "__main__":
fire.Fire(main)
```
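To try it, save the script as `textgen_inference_code.py` (you will also need `pip install fire`) and run something like `python textgen_inference_code.py --n_tokens 128`; `fire` maps the CLI flags to `main()`'s arguments, and omitting `--prompt` picks a random prompt from the list above.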
Wowoweewa!! It can create some file cleaning utilities.
</details>
---