# File size: 3,835 Bytes
# Source revision (as shown by the file viewer): e555415
import os
# Point the Hugging Face caches at directories relative to the working
# directory. Both variables are assigned before `transformers` is imported
# below. NOTE(review): presumably the library reads them at import time, so
# this ordering must be kept -- confirm.
os.environ["TRANSFORMERS_CACHE"] = "./cache/transformersCache/"
os.environ["HF_HOME"] = "./cache/hgCache/"
import torch
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertForMaskedLM
import pandas as pd
import time
import random
import torch
# Seed Python's `random` module so the shuffled choice of masked positions is
# the same on every run.
random.seed(42)
# --- Evaluation data -------------------------------------------------------
# wget https://huggingface.co/datasets/blackerx/turkish_v2/resolve/main/data/train-00000-of-00001.parquet
# NOTE(review): the script reads "qa.parquet"; presumably the file fetched by
# the command above has to be saved under that name -- confirm.
df = pd.read_parquet("qa.parquet")
print(df)

# --- Tokenizer -------------------------------------------------------------
# Lower-casing is disabled in the tokenizer; over-long inputs are truncated
# on the right.
_MODERNBERT_ID = "99eren99/ModernBERT-base-Turkish-uncased-mlm"
tokenizer = AutoTokenizer.from_pretrained(_MODERNBERT_ID, do_lower_case=False)
tokenizer.truncation_side = "right"

# --- Models under comparison -----------------------------------------------
modernBert = AutoModelForMaskedLM.from_pretrained(_MODERNBERT_ID)
cosmos = AutoModelForMaskedLM.from_pretrained(
    "ytu-ce-cosmos/turkish-base-bert-uncased"
)
dbmdz = AutoModelForMaskedLM.from_pretrained("dbmdz/bert-base-turkish-uncased")

# Switch every model to evaluation mode before moving it to the GPU.
for _model in (modernBert, cosmos, dbmdz):
    _model.eval()

# ModernBERT is cast to float16 on the GPU; the other two models are moved
# without a dtype change.
modernBert.to("cuda", dtype=torch.float16)
print(modernBert.dtype)
for _model in (cosmos, dbmdz):
    _model.to("cuda")

# --- Running totals, accumulated over the whole dataset ---------------------
modernBertTrueTokenCount = cosmosTrueTokenCount = dbmdzTrueTokenCount = 0
modernBertElapsedTime = cosmosElapsedTime = dbmdzElapsedTime = 0
def mask_tokens(inputs, mask_token_id=4, mask_ratio=0.05):
    """Return a copy of ``inputs`` with a random subset of positions masked.

    Only row 0 of ``inputs`` is masked. The first and last positions of that
    row are never selected (NOTE(review): presumably the special start/end
    tokens added by the tokenizer -- confirm).

    Args:
        inputs: 2-D integer tensor of token ids; row 0 is the sequence to mask.
        mask_token_id: Id written at each masked position. Defaults to 4
            (NOTE(review): presumably the [MASK] id of the tokenizer in use --
            confirm).
        mask_ratio: Fraction of the eligible positions to mask, rounded down.
            Defaults to 0.05.

    Returns:
        A tuple ``(masked_inputs, masked_indices)``: a clone of ``inputs``
        with the chosen positions overwritten, and the list of chosen
        positions. ``inputs`` itself is not modified.

    Raises:
        ValueError: If ``mask_ratio`` is not within ``[0, 1]``.
    """
    if not 0 <= mask_ratio <= 1:
        raise ValueError(f"mask_ratio must be within [0, 1], got {mask_ratio!r}")

    masked_inputs = inputs.clone()

    # Shuffle all eligible positions and keep a prefix; this consumes the
    # global `random` state, so results depend on the seed set by the caller.
    candidates = list(range(1, len(inputs[0]) - 1))
    random.shuffle(candidates)
    masked_indices = candidates[: int(len(candidates) * mask_ratio)]

    # Short sequences can yield an empty selection; nothing to overwrite then.
    if masked_indices:
        masked_inputs[0][masked_indices] = mask_token_id
    return masked_inputs, masked_indices
def getTrueTokenCountAndElapsedTime(model, inputs, masked_input_ids, masked_indices):
    """Run ``model`` on the masked ids and count correctly predicted tokens.

    Args:
        model: Callable invoked as ``model(masked_input_ids)``; its result
            must expose a ``logits`` tensor.
        inputs: Object whose ``input_ids`` attribute holds the unmasked token
            ids; row 0 is compared against the predictions.
        masked_input_ids: Token ids with some positions masked, passed to the
            model as-is.
        masked_indices: Positions (within row 0) that were masked and are
            scored.

    Returns:
        A tuple ``(trueTokenCount, elapsedTime, predicted_index)``:
        the number of masked positions whose arg-max prediction equals the
        original id (a 0-dim tensor), the elapsed seconds, and the arg-max
        prediction for every position of row 0.

    Note:
        The timed region covers the forward pass, the copy of the logits to
        the CPU, the arg-max and the comparison.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock changes.
    start = time.perf_counter()
    with torch.no_grad():
        outputs = model(masked_input_ids)
    predictions = outputs.logits.cpu()

    # Most likely token id at every position of the first sequence.
    predicted_index = torch.argmax(predictions[0], dim=-1)

    # Count the masked positions where the prediction matches the original.
    trueTokenCount = (
        (inputs.input_ids[0, masked_indices] == predicted_index[masked_indices]) * 1
    ).sum()
    elapsedTime = time.perf_counter() - start
    return trueTokenCount, elapsedTime, predicted_index
# Number of masked positions scored across the whole dataset.
totalMaskedTokens = 0
from tqdm import tqdm

# Evaluate every model on each row of the `output` column: the same masked
# input is fed to all three models so their counts are directly comparable.
for row in tqdm(df.output.values):
    # Map capital "I" to dotless "ı" before lower-casing.
    # NOTE(review): "İ".lower() yields "i" followed by a combining dot in
    # Python; confirm whether that matches the models' expected preprocessing.
    text = row.replace("I", "ı").lower()
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    masked_input_ids, masked_indices = mask_tokens(inputs.input_ids)
    masked_input_ids = masked_input_ids.to("cuda")
    # Debug aid (disabled):
    # print("Original Text:", text)
    # print(
    #     "Masked Text:",
    #     " ".join(tokenizer.convert_ids_to_tokens(masked_input_ids[0].tolist())),
    # )

    # modernBert
    trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
        modernBert, inputs, masked_input_ids, masked_indices
    )
    modernBertTrueTokenCount += trueTokenCount
    modernBertElapsedTime += elapsedTime
    # print("Predicted Text ModernBERT:", tokenizer.decode(predicted_index))

    # cosmos
    trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
        cosmos, inputs, masked_input_ids, masked_indices
    )
    cosmosTrueTokenCount += trueTokenCount
    cosmosElapsedTime += elapsedTime
    # print("Predicted Text Cosmos BERT:", tokenizer.decode(predicted_index))

    # dbmdz
    trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
        dbmdz, inputs, masked_input_ids, masked_indices
    )
    dbmdzTrueTokenCount += trueTokenCount
    dbmdzElapsedTime += elapsedTime
    # print("Predicted Text BERTurk:", tokenizer.decode(predicted_index))

    totalMaskedTokens += len(masked_indices)

# Summary: total masked positions, then (correct count, seconds) per model.
print(totalMaskedTokens)
print(modernBertTrueTokenCount, modernBertElapsedTime)
print(cosmosTrueTokenCount, cosmosElapsedTime)
print(dbmdzTrueTokenCount, dbmdzElapsedTime)

# Accuracy per model. Guarded because an empty dataset, or one whose texts are
# all too short to mask anything, leaves the denominator at zero.
if totalMaskedTokens:
    print(modernBertTrueTokenCount / totalMaskedTokens)
    print(cosmosTrueTokenCount / totalMaskedTokens)
    print(dbmdzTrueTokenCount / totalMaskedTokens)
else:
    print("No tokens were masked; accuracy is undefined.")
# End of script.