import os

# Set cache locations before transformers is imported.
os.environ["TRANSFORMERS_CACHE"] = "./cache/transformersCache/"
os.environ["HF_HOME"] = "./cache/hgCache/"

import random
import time

import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModelForMaskedLM, AutoTokenizer

random.seed(42)

# HF dataset: SedatAl/Turkish_Recipe
df = pd.read_parquet("recipe.parquet")
print(df)

# All three models are scored with the ModernBERT tokenizer; this assumes
# their vocabularies are compatible (in particular, that they share token ids).
tokenizer = AutoTokenizer.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm", do_lower_case=False
)
tokenizer.truncation_side = "right"

modernBert = AutoModelForMaskedLM.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm"
)
cosmos = AutoModelForMaskedLM.from_pretrained("ytu-ce-cosmos/turkish-base-bert-uncased")
dbmdz = AutoModelForMaskedLM.from_pretrained("dbmdz/bert-base-turkish-uncased")

modernBert.eval()
cosmos.eval()
dbmdz.eval()

# ModernBERT runs in fp16; the two BERT baselines stay in fp32.
modernBert.to("cuda", dtype=torch.float16)
print(modernBert.dtype)
cosmos.to("cuda")
dbmdz.to("cuda")

modernBertTrueTokenCount = 0
cosmosTrueTokenCount = 0
dbmdzTrueTokenCount = 0
modernBertElapsedTime = 0.0
cosmosElapsedTime = 0.0
dbmdzElapsedTime = 0.0


def mask_tokens(input_ids):
    """Randomly mask 10% of the non-special tokens; return the masked copy
    and the masked positions."""
    masked = input_ids.clone()
    # Skip position 0 ([CLS]) and the final position ([SEP]).
    candidates = list(range(1, len(input_ids[0]) - 1))
    random.shuffle(candidates)
    masked_indices = candidates[: int(len(candidates) * 0.1)]  # mask ratio
    masked[0][masked_indices] = tokenizer.mask_token_id  # id 4 for this tokenizer
    return masked, masked_indices


def getTrueTokenCountAndElapsedTime(model, inputs, masked_input_ids, masked_indices):
    """Run one forward pass and count correctly recovered masked tokens.
    The timing includes the logits transfer to CPU and the argmax."""
    start = time.time()
    with torch.no_grad():
        outputs = model(masked_input_ids)
    predictions = outputs.logits.cpu()
    # Greedy prediction for every position; only masked positions are scored.
    predicted_index = torch.argmax(predictions[0], dim=-1)
    trueTokenCount = (
        (inputs.input_ids[0, masked_indices] == predicted_index[masked_indices])
        .sum()
        .item()
    )
    end = time.time()
    return trueTokenCount, end - start, predicted_index


totalMaskedTokens = 0

for row in tqdm(df.tarif.values):
    # Turkish-aware lowercasing: map dotted capital I to dotless ı first.
    text = row.replace("I", "ı").lower()
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    masked_input_ids, masked_indices = mask_tokens(inputs.input_ids)
    masked_input_ids = masked_input_ids.to("cuda")
    """
    print("Original Text:", text)
    print(
        "Masked Text:",
        " ".join(tokenizer.convert_ids_to_tokens(masked_input_ids[0].tolist())),
    )
    """

    # ModernBERT
    trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
        modernBert, inputs, masked_input_ids, masked_indices
    )
    modernBertTrueTokenCount += trueTokenCount
    modernBertElapsedTime += elapsedTime
    # print("Predicted Text ModernBERT:", tokenizer.decode(predicted_index))

    # Cosmos BERT
    trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
        cosmos, inputs, masked_input_ids, masked_indices
    )
    cosmosTrueTokenCount += trueTokenCount
    cosmosElapsedTime += elapsedTime
    # print("Predicted Text Cosmos BERT:", tokenizer.decode(predicted_index))

    # BERTurk (dbmdz)
    trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
        dbmdz, inputs, masked_input_ids, masked_indices
    )
    dbmdzTrueTokenCount += trueTokenCount
    dbmdzElapsedTime += elapsedTime
    # print("Predicted Text BERTurk:", tokenizer.decode(predicted_index))

    totalMaskedTokens += len(masked_indices)

# Totals: masked-token count, per-model correct counts and wall-clock time,
# then masked-token accuracy per model.
print(totalMaskedTokens)
print(modernBertTrueTokenCount, modernBertElapsedTime)
print(cosmosTrueTokenCount, cosmosElapsedTime)
print(dbmdzTrueTokenCount, dbmdzElapsedTime)
print(modernBertTrueTokenCount / totalMaskedTokens)
print(cosmosTrueTokenCount / totalMaskedTokens)
print(dbmdzTrueTokenCount / totalMaskedTokens)