FIM training over deepseek-coder-1.3B using a Mojo-code dataset. This is an alpha version, trained only for FIM (fill-in-the-middle) copilot-style usage. Later versions should add Q&A support as well as better performance. Please leave comments to help improve the model. The recipe was based on the template from https://huggingface.co/blog/personal-copilot.
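For FIM usage, the model expects the code before and after the cursor to be wrapped in deepseek-coder's FIM tokens, exactly as in the full example further down. A minimal sketch of how such a prompt can be assembled (the helper name build_fim_prompt is only for illustration, not part of this repository):

def build_fim_prompt(prefix: str, suffix: str) -> str:
    # the model generates the code that belongs at <|fim▁hole|>
    return f"<|fim▁begin|>{prefix}<|fim▁hole|>{suffix}<|fim▁end|>"

prompt = build_fim_prompt(
    "fn add(a: Int, b: Int) -> Int:\n    ",
    "\n\nfn main():\n    print(add(1, 2))",
)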
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# merged_model_path: local path (or repo id) of the merged fine-tuned model
tokenizer = AutoTokenizer.from_pretrained(merged_model_path, trust_remote_code=True, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    merged_model_path,
    device_map={"": 0},
    use_cache=True,
    trust_remote_code=True,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
)
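Note that attn_implementation="flash_attention_2" requires the flash-attn package and a supported GPU. If that is not available, a hedged alternative (not part of the original recipe) is to load with the default attention implementation, optionally with 4-bit quantization via bitsandbytes:

from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
model = AutoModelForCausalLM.from_pretrained(
    merged_model_path,
    device_map={"": 0},
    trust_remote_code=True,
    quantization_config=bnb_config,  # optional: drop this to load in full bf16
    torch_dtype=torch.bfloat16,
)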
input_text = """<|fim▁begin|>
from algorithm import parallelize, vectorize
from benchmark import Benchmark
from complex import ComplexSIMD, ComplexFloat64
from math import iota
from os import env
from python import Python
from python.object import PythonObject
from runtime.llcl import num_cores, Runtime
from tensor import Tensor
from utils.index import Index

alias float_type = DType.float64
alias simd_width = simdwidthof[float_type]()
alias width = 960
alias height = 960
alias MAX_ITERS = 200
alias min_x = -2.0
alias max_x = 0.6
alias min_y = -1.5
alias max_y = 1.5

fn mandelbrot_kernel_SIMD[
    simd_width: Int
](c: ComplexSIMD[float_type, simd_width]) -> SIMD[float_type, simd_width]:
    let cx = c.re
    let cy = c.im
    var x = SIMD[float_type, simd_width](0)
    var y = SIMD[float_type, simd_width](0)
    var y2 = SIMD[float_type, simd_width](0)
    var iters = SIMD[float_type, simd_width](0)
    var t: SIMD[DType.bool, simd_width] = True
    for i in range(MAX_ITERS):
        if not t.reduce_or():
            break
        y2 = y*y
        y = x.fma(y + y, cy)
        t = x.fma(x, y2) <= 4
        x = x.fma(x, cx - y2)
        iters = t.select(iters + 1, iters)
    return iters

fn compare():
    let t = Tensor[float_type](height, width)

    @parameter
    fn worker(row: Int):
        let scale_x = (max_x - min_x) / width
        let scale_y = (max_y - min_y) / height
<|fim▁hole|>
fn main():
    compare()
<|fim▁end|>"""
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_length=547+200)  # max_length counts the prompt (~547 tokens here) plus up to 200 new tokens
# decode only the newly generated tokens, i.e. the code filled in at <|fim▁hole|>
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
from transformers import TextStreamer

def stream(user_prompt):
    runtimeFlag = "cuda:0"
    inputs = tokenizer([user_prompt], return_tensors="pt").to(runtimeFlag)
    # stream tokens to stdout as they are generated, hiding the prompt and special tokens
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    _ = model.generate(**inputs, streamer=streamer, max_new_tokens=200)

stream(input_text)
You can also deploy the model to an inference endpoint and use it from a VS Code extension, as sketched below.
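A minimal sketch of querying such a deployment with huggingface_hub, assuming you have created a dedicated Inference Endpoint (the endpoint URL and token below are placeholders, not part of this repository):

from huggingface_hub import InferenceClient

ENDPOINT_URL = "https://YOUR-ENDPOINT.endpoints.huggingface.cloud"  # placeholder
client = InferenceClient(model=ENDPOINT_URL, token="hf_...")         # placeholder token

# send the same FIM-formatted prompt as above and print the generated infill
completion = client.text_generation(input_text, max_new_tokens=200)
print(completion)

An editor extension that supports custom endpoints (for example, Hugging Face's llm-vscode) can then be pointed at the same URL for in-editor FIM completions.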