import os

# Redirect the Hugging Face caches; set these before importing transformers
# so the cache paths take effect.
os.environ["TRANSFORMERS_CACHE"] = "./cache/transformersCache/"
os.environ["HF_HOME"] = "./cache/hgCache/"

import random
import time

import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModelForMaskedLM, AutoTokenizer

random.seed(42)


# qa.parquet is expected to provide the evaluation texts in an "output" column.
df = pd.read_parquet("qa.parquet")
print(df)

# All three models are scored on ids produced by this single tokenizer, so
# they are assumed to share a compatible uncased Turkish vocabulary.
tokenizer = AutoTokenizer.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm", do_lower_case=False
)
tokenizer.truncation_side = "right"  # keep the start of over-long texts

modernBert = AutoModelForMaskedLM.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm",
)

cosmos = AutoModelForMaskedLM.from_pretrained("ytu-ce-cosmos/turkish-base-bert-uncased")

dbmdz = AutoModelForMaskedLM.from_pretrained("dbmdz/bert-base-turkish-uncased")

modernBert.eval()
cosmos.eval()
dbmdz.eval()

modernBert.to("cuda", dtype=torch.float16)
print(modernBert.dtype)
cosmos.to("cuda")
dbmdz.to("cuda")
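
# Note: ModernBERT runs in fp16 while cosmos and dbmdz stay in fp32, so the
# elapsed-time totals below are not a like-for-like speed comparison.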

modernBertTrueTokenCount = 0
cosmosTrueTokenCount = 0
dbmdzTrueTokenCount = 0

modernBertElapsedTime = 0
cosmosElapsedTime = 0
dbmdzElapsedTime = 0


def mask_tokens(inputs):
    """Randomly replace ~5% of the non-special tokens with the [MASK] id."""
    inputsCopy = inputs.clone()

    # Candidate positions: every token except [CLS] and the final [SEP].
    s = list(range(1, len(inputs[0]) - 1))
    random.shuffle(s)

    masked_indices = s[: int(len(s) * 0.05)]

    # 4 is assumed to be the [MASK] id in this tokenizer's vocabulary;
    # tokenizer.mask_token_id would make the assumption explicit.
    inputsCopy[0][masked_indices] = 4

    return inputsCopy, masked_indices
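
# Example: for a 100-token input, positions 1..98 are shuffled and the first
# 5% of them (4 positions) are masked, leaving [CLS] and [SEP] untouched.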


def getTrueTokenCountAndElapsedTime(model, inputs, masked_input_ids, masked_indices):
    """Run one forward pass and count correctly recovered masked tokens."""
    start = time.time()
    with torch.no_grad():
        outputs = model(masked_input_ids)
        predictions = outputs.logits.cpu()

    # Greedy (argmax) prediction for every position.
    predicted_index = torch.argmax(predictions[0], dim=-1)

    # Count masked positions where the prediction matches the original token.
    trueTokenCount = (
        (inputs.input_ids[0, masked_indices] == predicted_index[masked_indices])
        .sum()
        .item()
    )

    end = time.time()
    elapsedTime = end - start

    return trueTokenCount, elapsedTime, predicted_index
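
# The measured window covers the forward pass, the logits copy to CPU, and the
# CPU-side argmax/comparison; the .cpu() call also synchronizes the GPU, so
# this is end-to-end latency rather than pure GPU compute time.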


totalMaskedTokens = 0

for row in tqdm(df.output.values):
    # Turkish-aware lowercasing: capital I must become dotless ı, which a
    # plain .lower() would miss.
    text = row.replace("I", "ı").lower()
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    masked_input_ids, masked_indices = mask_tokens(inputs.input_ids)

    masked_input_ids = masked_input_ids.to("cuda")

    # Uncomment to inspect the masking:
    # print("Original Text:", text)
    # print(
    #     "Masked Text:",
    #     " ".join(tokenizer.convert_ids_to_tokens(masked_input_ids[0].tolist())),
    # )

    trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
        modernBert, inputs, masked_input_ids, masked_indices
    )
    modernBertTrueTokenCount += trueTokenCount
    modernBertElapsedTime += elapsedTime

    trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
        cosmos, inputs, masked_input_ids, masked_indices
    )
    cosmosTrueTokenCount += trueTokenCount
    cosmosElapsedTime += elapsedTime

    trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
        dbmdz, inputs, masked_input_ids, masked_indices
    )
    dbmdzTrueTokenCount += trueTokenCount
    dbmdzElapsedTime += elapsedTime

    totalMaskedTokens += len(masked_indices)

print(totalMaskedTokens)
print(modernBertTrueTokenCount, modernBertElapsedTime)
print(cosmosTrueTokenCount, cosmosElapsedTime)
print(dbmdzTrueTokenCount, dbmdzElapsedTime)

# Masked-token accuracy per model.
print(modernBertTrueTokenCount / totalMaskedTokens)
print(cosmosTrueTokenCount / totalMaskedTokens)
print(dbmdzTrueTokenCount / totalMaskedTokens)
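
# A minimal reporting sketch (an addition, not part of the original output):
# accuracy as a percentage plus mean per-document latency, assuming df is the
# same DataFrame iterated above.
for name, correct, elapsed in [
    ("ModernBERT", modernBertTrueTokenCount, modernBertElapsedTime),
    ("cosmos", cosmosTrueTokenCount, cosmosElapsedTime),
    ("dbmdz", dbmdzTrueTokenCount, dbmdzElapsedTime),
]:
    print(
        f"{name}: {100 * correct / totalMaskedTokens:.2f}% accuracy, "
        f"{elapsed / len(df):.4f}s per document"
    )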