import os

# Point the Hugging Face caches at local directories (must be set before importing transformers).
os.environ["TRANSFORMERS_CACHE"] = "./cache/transformersCache/"
os.environ["HF_HOME"] = "./cache/hgCache/"

import random
import time

import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForMaskedLM

random.seed(42)
# Benchmark corpus:
# wget https://huggingface.co/datasets/blackerx/turkish_v2/resolve/main/data/train-00000-of-00001.parquet
# (the downloaded parquet is expected locally as qa.parquet)
df = pd.read_parquet("qa.parquet")
print(df)
# The same uncased Turkish tokenizer is used for all three models below.
tokenizer = AutoTokenizer.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm", do_lower_case=False
)
tokenizer.truncation_side = "right"

modernBert = AutoModelForMaskedLM.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm",
)
cosmos = AutoModelForMaskedLM.from_pretrained("ytu-ce-cosmos/turkish-base-bert-uncased")
dbmdz = AutoModelForMaskedLM.from_pretrained("dbmdz/bert-base-turkish-uncased")

modernBert.eval()
cosmos.eval()
dbmdz.eval()

# ModernBERT runs in float16; the two BERT baselines stay in float32.
modernBert.to("cuda", dtype=torch.float16)
print(modernBert.dtype)
cosmos.to("cuda")
dbmdz.to("cuda")
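# Optional (illustrative, not part of the original benchmark): print each model's
# parameter count so the accuracy/latency numbers below can be read with model size in mind.
# for name, model in [("ModernBERT", modernBert), ("Cosmos BERT", cosmos), ("BERTurk", dbmdz)]:
#     print(name, sum(p.numel() for p in model.parameters()) / 1e6, "M parameters")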
# Running tallies: correctly recovered masked tokens and cumulative wall-clock time per model.
modernBertTrueTokenCount = 0
cosmosTrueTokenCount = 0
dbmdzTrueTokenCount = 0

modernBertElapsedTime = 0
cosmosElapsedTime = 0
dbmdzElapsedTime = 0
def mask_tokens(inputs):
    """Randomly mask ~5% of the non-special tokens; return the masked copy and the masked positions."""
    inputsCopy = inputs.clone()
    # Candidate positions: everything except the leading and trailing special tokens.
    s = list(range(1, len(inputs[0]) - 1))
    random.shuffle(s)
    masked_indices = s[: int(len(s) * 0.05)]  # mask ratio
    inputsCopy[0][masked_indices] = 4  # replace the selected positions with the mask token id (4 here)
    return inputsCopy, masked_indices
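# Illustrative sanity check (not part of the benchmark loop): mask a short sentence
# and inspect which positions were replaced. Assumes the tokenizer loaded above.
# sample = tokenizer("bu bir deneme cümlesidir", return_tensors="pt")
# masked, idx = mask_tokens(sample.input_ids)
# print(tokenizer.convert_ids_to_tokens(masked[0].tolist()), idx)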
def getTrueTokenCountAndElapsedTime(model, inputs, masked_input_ids, masked_indices):
    """Run one masked sequence through the model; return the number of correctly predicted masked tokens and the elapsed time."""
    start = time.time()
    with torch.no_grad():
        outputs = model(masked_input_ids)
        predictions = outputs.logits.cpu()
    # Greedy prediction for every position; only the masked positions are scored.
    predicted_index = torch.argmax(predictions[0], dim=-1)
    trueTokenCount = (
        (inputs.input_ids[0, masked_indices] == predicted_index[masked_indices]) * 1
    ).sum()
    end = time.time()
    elapsedTime = end - start
    return trueTokenCount, elapsedTime, predicted_index
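# Timing note (optional refinement, not in the original script): the .cpu() copy forces a
# device sync before `end` is read, but a torch.cuda.synchronize() just before `start`
# would make sure no earlier queued GPU work bleeds into this model's measurement:
#
#     torch.cuda.synchronize()
#     start = time.time()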
totalMaskedTokens = 0
for row in tqdm(df.output.values):
    # Turkish-aware lowercasing: map dotted/dotless I correctly before .lower().
    text = row.replace("I", "ı").lower()
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    masked_input_ids, masked_indices = mask_tokens(inputs.input_ids)
    masked_input_ids = masked_input_ids.to("cuda")
    # print("Original Text:", text)
    # print(
    #     "Masked Text:",
    #     " ".join(tokenizer.convert_ids_to_tokens(masked_input_ids[0].tolist())),
    # )

    # modernBert
    trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
        modernBert, inputs, masked_input_ids, masked_indices
    )
    modernBertTrueTokenCount += trueTokenCount
    modernBertElapsedTime += elapsedTime
    # print("Predicted Text ModernBERT:", tokenizer.decode(predicted_index))

    # cosmos
    trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
        cosmos, inputs, masked_input_ids, masked_indices
    )
    cosmosTrueTokenCount += trueTokenCount
    cosmosElapsedTime += elapsedTime
    # print("Predicted Text Cosmos BERT:", tokenizer.decode(predicted_index))

    # dbmdz
    trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
        dbmdz, inputs, masked_input_ids, masked_indices
    )
    dbmdzTrueTokenCount += trueTokenCount
    dbmdzElapsedTime += elapsedTime
    # print("Predicted Text BERTurk:", tokenizer.decode(predicted_index))

    totalMaskedTokens += len(masked_indices)
print("Total masked tokens:", totalMaskedTokens)
print("ModernBERT  correct / seconds:", modernBertTrueTokenCount, modernBertElapsedTime)
print("Cosmos BERT correct / seconds:", cosmosTrueTokenCount, cosmosElapsedTime)
print("BERTurk     correct / seconds:", dbmdzTrueTokenCount, dbmdzElapsedTime)
print("ModernBERT  accuracy:", modernBertTrueTokenCount / totalMaskedTokens)
print("Cosmos BERT accuracy:", cosmosTrueTokenCount / totalMaskedTokens)
print("BERTurk     accuracy:", dbmdzTrueTokenCount / totalMaskedTokens)
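# Optional summary (illustrative, not in the original script): collect the tallies above
# into a small DataFrame so accuracy and throughput (masked tokens scored per second)
# can be compared side by side. The column names and layout here are assumptions.
results = pd.DataFrame(
    {
        "model": ["ModernBERT", "Cosmos BERT", "BERTurk"],
        "correct": [int(modernBertTrueTokenCount), int(cosmosTrueTokenCount), int(dbmdzTrueTokenCount)],
        "seconds": [modernBertElapsedTime, cosmosElapsedTime, dbmdzElapsedTime],
    }
)
results["accuracy"] = results["correct"] / totalMaskedTokens
results["masked tokens / s"] = totalMaskedTokens / results["seconds"]
print(results)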