hendrydong committed on
Commit
0713a9f
1 Parent(s): f4ac4eb

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +14 -0
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The reward model may be used for iterative SFT/DPO. If you use it, please cite:
2
+
3
+ @article{dong2023raft,
4
+ title={{RAFT}: Reward rAnked FineTuning for Generative Foundation Model Alignment},
5
+ author={Dong, Hanze and Xiong, Wei and Goyal, Deepanshu and Pan, Rui and Diao, Shizhe and Zhang, Jipeng and Shum, Kashun and Zhang, Tong},
6
+ journal={arXiv preprint arXiv:2304.06767},
7
+ year={2023}
8
+ }
9
+ @article{xiong2023gibbs,
10
+ title={Gibbs Sampling from Human Feedback: A Provable {KL}-constrained Framework for {RLHF}},
11
+ author={Xiong, Wei and Dong, Hanze and Ye, Chenlu and Zhong, Han and Jiang, Nan and Zhang, Tong},
12
+ journal={arXiv preprint arXiv:2312.11456},
13
+ year={2023}
14
+ }