---
datasets:
- nuprl/EditPackFT-Multi
tags:
- code
---

# What is this

This is a DeepSeek Coder 7B model fine-tuned to predict a commit message for a given diff.

# Languages trained on:

```py
LANGS = [
    "Python",
    "Rust",
    "JavaScript",
    "Java",
    "Go",
    "C++",
    "C#",
    "Ruby",
    "PHP",
    "TypeScript",
    "C",
    "Scala",
    "Swift",
    "Kotlin",
    "Objective-C",
    "Perl",
    "Haskell",
    "Bash",
    "Sh",
    "Lua",
    "R",
    "Julia",
]
```

# How to prompt:

```python
import difflib


class NDiff:
    def __init__(self, s1, s2):
        self.s1 = s1
        self.s2 = s2
        # materialize the generator so the diff can be iterated more than once
        self.diff = list(difflib.ndiff(s1.split("\n"), s2.split("\n")))

    def __str__(self):
        return "\n".join([l for l in self.diff if l[0] != "?"])

    def str_colored(self):
        import colored

        buf = ""
        for l in self.diff:
            if l[0] == "?":
                continue
            if l[0] == "-":
                buf += colored.stylize(l, colored.fg("red"))
            elif l[0] == "+":
                buf += colored.stylize(l, colored.fg("green"))
            else:
                buf += l
            buf += "\n"
        return buf

    def num_removed(self):
        return len([l for l in self.diff if l[0] == "-"])

    def num_added(self):
        return len([l for l in self.diff if l[0] == "+"])

    def __repr__(self):
        return self.__str__()


def format_prompt(old, new):
    diff_header = ""
    instr_header = ""
    diff = str(NDiff(old, new))
    return f"{diff_header}\n{diff}\n{instr_header}\n"


def gen(old, new, max_new_tokens=200, temperature=0.45, top_p=0.90):
    # assumes `model` and `tokenizer` are already in scope (see the loading sketch below)
    prompt = format_prompt(old, new)
    toks = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    outs = model.generate(toks, max_new_tokens=max_new_tokens, do_sample=True, temperature=temperature, top_p=top_p)
    return [tokenizer.decode(out[len(toks[0]):], skip_special_tokens=True) for out in outs]
```

Use the `gen` function with the old and new code; it returns a list of sampled commit messages.
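The snippet above assumes `model` and `tokenizer` are already loaded. A minimal loading sketch with 🤗 Transformers follows; `"<this-model-repo>"` is a placeholder for this repository's Hugging Face model id, not a real identifier, and the dtype/device settings are assumptions about your hardware:

```python
# Minimal loading sketch; "<this-model-repo>" is a placeholder for this
# repository's Hugging Face model id.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "<this-model-repo>"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,  # assumption: a bf16-capable GPU is available
    device_map="auto",
)
```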
# Example:

```py
- import datasets
- from pathlib import Path
from code_editing.models import CodeLlamaEditModel, LlamaChatModel, EditModel, EditCommand, ChatAdaptorEditModel, OctoCoderChatModel, codellama_edit_prompt_diff, apply_rel_diff_trim, OpenAIChatModel, StarCoderCommitEditModel
from code_editing.humanevalpack import batch_prompts_from_example
from code_editing.utils import gunzip_json_write
from typing import List, Callable
from tqdm import tqdm

# NOTE: this is the factory for each model type. to add a new model type, add a new case here
# and implement it in models.py. Also, add a new case in the argument parser below.
- def model_factory(model_type: str, quantize=False, num_gpus=1) -> Callable[[str], EditModel]:
+ def model_factory(
+     model_type: str,
+     quantize=False,
+     num_gpus=1,
+     system_supported=True,
+ ) -> Callable[[str], EditModel]:
    if model_type == "codellama" or model_type == "deepseek":
        return CodeLlamaEditModel
    elif model_type == "starcoder":
        return StarCoderCommitEditModel
    elif model_type == "codellama-diff":
        return (lambda path: CodeLlamaEditModel(path, prompt_format=codellama_edit_prompt_diff, post_process=apply_rel_diff_trim))
    elif model_type == "openai":
        return (lambda path: ChatAdaptorEditModel(OpenAIChatModel(path)))
    elif model_type == "codellama-chat":
-        return (lambda path: ChatAdaptorEditModel(LlamaChatModel(path, quantization=quantize, num_gpus=num_gpus)))
+        return (lambda path: ChatAdaptorEditModel(LlamaChatModel(path, quantization=quantize, num_gpus=num_gpus, system_supported=system_supported)))
    elif model_type == "octocoder":
        return (lambda path: ChatAdaptorEditModel(OctoCoderChatModel(path, quantization=quantize, num_gpus=num_gpus)))
    else:
        raise ValueError(f"Unknown model type: {model_type}")


def complete_problem(example: EditCommand, model: EditModel, batch_size: int, completion_limit: int, **kwargs) -> List[str]:
    batches = batch_prompts_from_example(example, batch_size, completion_limit)
    completions = []
    for batch in batches:
        resps = model.generate(batch, **kwargs)
        for resp in resps:
            completions.append(resp["content"])
    return completions
```

Produced:

```
Add system_supported argument to model_factory
```
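To reproduce a prediction like this one, pass the full pre-edit and post-edit file contents to `gen`; the variable names below are hypothetical:

```python
# Hypothetical: old_src / new_src hold the pre- and post-edit file contents.
commit_messages = gen(old_src, new_src, max_new_tokens=64)
print(commit_messages[0])  # e.g. "Add system_supported argument to model_factory"
```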