from collections.abc import Mapping
from typing import Iterable, TextIO

import pandas as pd
# First, download the dataset from haoranxu/ALMA-R-Preference
# (https://huggingface.co/datasets/haoranxu/ALMA-R-Preference).
# This script converts the Parquet file to a plain-text file.
# Input Parquet file and output text file, both relative to the working directory.
PARQUET_PATH = 'haoranxu-ALMA-R-Preference.parquet'
OUTPUT_PATH = 'haoranxu-ALMA-R-Preference-en-zh--zh-en.txt'

# NOTE(review): earlier commented-out experiments referenced the keys
# 'alma_en', 'alma_zh', 'gpt4_en' and 'gpt4_zh' in addition to 'en'/'zh';
# presumably these are other fields of the dataset -- verify against
# the printed columns before relying on them.


def format_pair(en_text: str, zh_text: str) -> str:
    """Return one sentence pair rendered in both translation directions.

    The pair is emitted twice (English->Chinese, then Chinese->English),
    each rendering followed by a blank line.
    """
    return (
        f"English: {en_text}\nChinese: {zh_text}\n\n"
        f"Chinese: {zh_text}\nEnglish: {en_text}\n\n"
    )


def write_pairs(translations: Iterable[object], out: TextIO) -> int:
    """Write every complete en/zh pair from *translations* to *out*.

    Args:
        translations: Iterable of mapping-like records that are looked up
            by the keys ``'en'`` and ``'zh'``. Entries that are not
            mappings (e.g. ``None`` or NaN for a missing value) are skipped
            instead of raising ``AttributeError``.
        out: Writable text stream that receives the formatted pairs.

    Returns:
        The number of pairs written.
    """
    written = 0
    for item in translations:
        # A missing value has no ``.get``; skip it rather than crash.
        if not isinstance(item, Mapping):
            continue
        en_text = item.get('en')
        zh_text = item.get('zh')
        # Only keep records where both sides are present and non-empty.
        if en_text and zh_text:
            out.write(format_pair(en_text, zh_text))
            written += 1
    return written


def main() -> None:
    """Read the Parquet dataset and dump its en/zh pairs to the text file."""
    df = pd.read_parquet(PARQUET_PATH)
    print(df.columns)
    # save txt
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        write_pairs(df['translation'], f)


if __name__ == "__main__":
    main()
# The resulting text file can then be used to build an imatrix.dat for your language pair.