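The snippet below maps each dataset row into the Genstruct tag format: the document title and content plus a fixed preamble form the prompt, and the user question plus assistant answer form the completion the model is trained to generate. First, a minimal setup sketch, assuming the Genstruct-7B tokenizer and a dataset with string `title`, `context`, `question`, and `answer` columns; the repo id and dataset name here are placeholders to adapt to your own data:

```python
from datasets import load_dataset
from transformers import AutoTokenizer

# Assumed repo id for the tokenizer; adjust to wherever your copy lives.
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Genstruct-7B")

# Hypothetical dataset: anything with 'title', 'context', 'question', 'answer' string columns.
ds = load_dataset("your/qa-dataset", split="train")
```

With `tokenizer` and `ds` in place, the mapping function builds `input_ids` and the loss `labels`: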
```python
pre_text = "The following is an interaction between a user and an AI assistant that is related to the above text."

def ds_map_fn(row):
    # Prompt: title + content + preamble, ending at the opening [[[User]]] tag.
    input = f"[[[Title]]] {row['title'].strip()}\n[[[Content]]] {row['context'].strip()}\n\n" + pre_text + "\n\n[[[User]]] "
    # Completion: the user question followed by the assistant answer.
    output = f"{row['question'].strip()}\n[[[Assistant]]] {row['answer'].strip()}"

    input = tokenizer.encode(input, add_special_tokens=False)
    output = tokenizer.encode(output, add_special_tokens=False)

    # Concatenate prompt and completion; mask the prompt tokens with -100 so
    # only the completion (and the EOS token) contributes to the loss.
    input_ids = input + output + [tokenizer.eos_token_id]
    labels = [-100]*len(input) + output + [tokenizer.eos_token_id]

    return {'input_ids': input_ids, 'labels': labels}

ds = ds.map(ds_map_fn, remove_columns=ds.column_names)
```
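Prompt positions are set to `-100` in `labels`, the index that PyTorch's cross-entropy loss (and therefore the Hugging Face `Trainer`) ignores, so the loss is computed only over the generated question, answer, and EOS token. Note that the mapped rows are unpadded and variable-length, so a padding collator (or manual padding of both `input_ids` and `labels`) is still needed when batching for training.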