IDEA-CCNL
/

Randeng-DELLA-226M-Chinese

@@ -26,70 +26,50 @@ A deep VAE model pretrained on Wudao dataset. Both encoder and decoder are based
 ## 模型信息 Model Information
-参考论文：[Fuse It More Deeply! A Variational Transformer with Layer-Wise Latent Variable Inference for Text Generation](https://arxiv.org/abs/2207.06130)
 ## 使用 Usage
 ```python
 # Check out the latest Fengshenbang-LM directory and run the following script under the Fengshenbang-LM root directory
-import sys
 import torch
-import argparse
 from torch.nn.utils.rnn import pad_sequence
-from fengshen.models.deepVAE.vae_pl_module import DeepVAEModule
-if __name__ == "__main__":
-    # TODO: Update this path to the downloaded directory
-    checkpoint_path = '..../Randeng-DELLA-226M-Chinese'
-    gpt2_model_path = '..../Randeng-DELLA-226M-Chinese'
-    args_parser = argparse.ArgumentParser()
-    args_parser.add_argument("--checkpoint_path", type=str, default=checkpoint_path)
-    args_parser.add_argument("--gpt2_model_path", type=str, default=gpt2_model_path)
-    args_parser.add_argument("--latent_dim", type=int, default=256)
-    args_parser.add_argument("--beta_kl_constraints_start", type=float, default=1e-5)
-    args_parser.add_argument("--beta_kl_constraints_stop", type=float, default=1.)
-    args_parser.add_argument("--beta_n_cycles", type=int, default=10)
-    args_parser.add_argument("--latent_lmf_rank", type=int, default=4)
-    args_parser.add_argument("--CVAE", action='store_true')
-    args_parser.add_argument("--share_param", action='store_false',
-        help="specify this argument if we want to share dec's and enc's params")
-    args, unknown_args = args_parser.parse_known_args()
-    # load model
-    model, tokenizer =  DeepVAEModule.load_model(args, labels_dict=None)
-    # VAE generation
-    sentence =  "本模型是在通用数据集下预训练的VAE模型，如要获得最佳效果请在特定领域微调后使用。"
-    tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
-    decoder_target = [tokenizer.bos_token_id] + tokenized_text + [tokenizer.eos_token_id]
-    inputs = []
-    inputs.append(torch.tensor(decoder_target, dtype=torch.long))
-    inputs = pad_sequence(inputs, batch_first=True, padding_value=0)
-    max_length = 256
-    top_p = 0.5
-    top_k = 0
-    temperature = .7
-    repetition_penalty = 1.0
-    sample = False
-    device = 0
-    model = model.eval()
-    model = model.to(device)
-    outputs = model.inference(inputs.to(device), top_p=top_p, top_k=top_k, max_length=max_length, sample=sample,
-        temperature=temperature, repetition_penalty=repetition_penalty)
-    for gen_sent, orig_sent in zip(outputs, inputs):
-        print('orig_sent:', tokenizer.decode(orig_sent).replace(' ', ''))
-        print('gen_sent:', tokenizer.decode(gen_sent).replace(' ', ''))
-        print("-"*20)
 ```

 ## 模型信息 Model Information
+参考论文 Reference Paper：[Fuse It More Deeply! A Variational Transformer with Layer-Wise Latent Variable Inference for Text Generation](https://arxiv.org/abs/2207.06130)
+本模型使用了Della论文里的循环潜在向量架构，但对于解码器生成并未采用原论文的low-rank-tensor-product来进行信息融合，而是使用了简单的线性变换后逐位逐词添加的方式。该方式对于开放域数据集的预训练稳定性有较大正向作用。
+Note that although we adopted the layer-wise recurrent latent variable structure from the paper, we did not use the low-rank tensor product to fuse the latent vectors into the decoder hidden states. Instead, we applied a simple linear transformation to the latent vectors and then added them to the hidden states independently. This approach noticeably improves pretraining stability on open-domain datasets.
 ## 使用 Usage
 ```python
 # Check out the latest Fengshenbang-LM directory and run the following script under the Fengshenbang-LM root directory
 import torch
 from torch.nn.utils.rnn import pad_sequence
+from fengshen.models.deepVAE.deep_vae import Della
+from transformers.models.bert.tokenization_bert import BertTokenizer
+tokenizer = BertTokenizer.from_pretrained("IDEA-CCNL/Randeng-DELLA-226M-Chinese")
+vae_model = Della.from_pretrained("IDEA-CCNL/Randeng-DELLA-226M-Chinese")
+special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>'}
+tokenizer.add_special_tokens(special_tokens_dict)
+sentence =  "本模型是在通用数据集下预训练的VAE模型，如要获得最佳效果请在特定领域微调后使用。"
+tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
+decoder_target = [tokenizer.bos_token_id] + tokenized_text + [tokenizer.eos_token_id]
+inputs = []
+inputs.append(torch.tensor(decoder_target, dtype=torch.long))
+inputs = pad_sequence(inputs, batch_first=True, padding_value=0)
+max_length = 256
+top_p = 0.5
+top_k = 0
+temperature = .7
+repetition_penalty = 1.0
+sample = False
+device = 0
+model = vae_model.eval()
+model = model.to(device)
+outputs = model.model.inference(inputs.to(device), top_p=top_p, top_k=top_k, max_length=max_length, sample=sample,
+    temperature=temperature, repetition_penalty=repetition_penalty)
+for gen_sent, orig_sent in zip(outputs, inputs):
+    print('orig_sent:', tokenizer.decode(orig_sent).replace(' ', ''))
+    print('gen_sent:', tokenizer.decode(gen_sent).replace(' ', ''))
+    print("-"*20)
 ```