---
license: apache-2.0
library_name: peft
tags:
- generated_from_trainer
metrics:
- bleu
base_model: google/madlad400-3b-mt
model-index:
- name: novel_zh2vi
  results: []
pipeline_tag: translation
---

Please remember: one sentence per line, no more than 256 characters per line. (I have very limited hardware.)

DO NOT USE FP16. The original model was trained using BF16; FP16 will give unexpected results!

### Sample Code

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig
from tqdm import tqdm
from peft import PeftModelForSeq2SeqLM
import torch

# 4-bit NF4 quantization with bfloat16 compute.
# NOTE: the base model was trained in BF16 — do NOT change this to float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model_to_load = 'jetaudio/novel_zh2vi'
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    'google/madlad400-3b-mt',
    quantization_config=bnb_config,
    device_map='auto',
)
model = PeftModelForSeq2SeqLM.from_pretrained(base_model, model_to_load)
tokenizer = AutoTokenizer.from_pretrained(model_to_load)

# luanhoilacvien_cn.txt is the source file written in Chinese
with open('luanhoilacvien_cn.txt', 'r', encoding='utf8') as fin:
    text = fin.read()


def trans(texts, temp=1, top_p=0.8):
    """Translate a batch of sentences; return one decoded line per input."""
    # truncation=True guards against lines longer than the 256-token budget,
    # and the attention mask must be forwarded so padding tokens are ignored
    # during generation.
    encodings = tokenizer(
        texts,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=256,
    ).to('cuda')
    gens = model.generate(
        input_ids=encodings.input_ids,
        attention_mask=encodings.attention_mask,
        do_sample=True,
        max_length=256,
        temperature=temp,
        top_p=top_p,
    )
    return '\n'.join(tokenizer.decode(gen, skip_special_tokens=True) for gen in gens)


# Strip spaces, collapse blank lines, and split into one sentence per entry.
text = text.replace(' ', '').replace('\n\n', '\n').split('\n')
batch_size = 32
texts = [text[i * batch_size:(i + 1) * batch_size]
         for i in range((len(text) + batch_size - 1) // batch_size)]

for sens in tqdm(texts):
    # '<2vi>' is the MADLAD-400 target-language prefix for Vietnamese.
    t = trans(['<2vi>' + sen for sen in sens], temp=0.1, top_p=0.3)
    # luanhoilacvien_vi.txt is the result in Vietnamese
    with open('./luanhoilacvien_vi.txt', 'a', encoding='utf8') as fout:
        fout.write(t + '\n')
```

### Framework versions

- PEFT 0.7.1
- Transformers 4.36.1
- Pytorch 2.1.2+cu121
- Datasets 2.15.0
- Tokenizers 0.15.0