import os

# Cache directories must be set before transformers is imported.
os.environ["TRANSFORMERS_CACHE"] = "./cache/transformersCache/"
os.environ["HF_HOME"] = "./cache/hgCache/"

import random
import time

import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForMaskedLM

random.seed(42)

# wget https://huggingface.co/datasets/Gliscor/turkishReviews-ds-mini/resolve/main/data/train-00000-of-00001.parquet
df = pd.read_parquet("reviews.parquet")
print(df)

# Lowercasing is done manually below (Turkish dotless ı), so keep the
# tokenizer's own lowercasing off.
tokenizer = AutoTokenizer.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm", do_lower_case=False
)
tokenizer.truncation_side = "right"

modernBert = AutoModelForMaskedLM.from_pretrained(
    "99eren99/ModernBERT-base-Turkish-uncased-mlm",
)
modernBert.eval()
modernBert.to("cuda", dtype=torch.float16)
print(modernBert.dtype)

modernBertTrueTokenCount = 0
modernBertElapsedTime = 0


def mask_tokens(inputs):
    """Randomly replace 15% of the non-special tokens with [MASK]."""
    inputsCopy = inputs.clone()
    # Candidate positions exclude the first and last token ([CLS]/[SEP]).
    s = list(range(1, len(inputs[0]) - 1))
    random.shuffle(s)
    masked_indices = s[: int(len(s) * 0.15)]  # mask ratio
    inputsCopy[0][masked_indices] = tokenizer.mask_token_id  # id 4 for this tokenizer
    return inputsCopy, masked_indices


def getTrueTokenCountAndElapsedTime(model, inputs, masked_input_ids, masked_indices):
    """Run one forward pass; return the correct-prediction count and wall time.

    Note the timed span includes the GPU-to-CPU logits copy and the argmax,
    not just the forward pass itself.
    """
    start = time.time()
    with torch.no_grad():
        outputs = model(masked_input_ids)
    predictions = outputs.logits.cpu()
    # Greedy prediction per position.
    predicted_index = torch.argmax(predictions[0], dim=-1)
    # Count masked positions where the prediction matches the original token.
    trueTokenCount = (
        (inputs.input_ids[0, masked_indices] == predicted_index[masked_indices])
        .sum()
        .item()
    )
    end = time.time()
    elapsedTime = end - start
    return trueTokenCount, elapsedTime, predicted_index


totalMaskedTokens = 0

# Accumulate reviews into ~6000-word chunks so each forward pass uses most of
# the model's 8192-token context window.
concatenatedText = ""
for row in tqdm(df.review.values):
    # Turkish-aware lowercasing: map uppercase I to dotless ı before .lower().
    text = row.replace("I", "ı").lower()
    concatenatedText += text + " "  # separator so adjacent reviews don't fuse
    if len(concatenatedText.split()) > 6000:
        inputs = tokenizer(
            concatenatedText, return_tensors="pt", max_length=8192, truncation=True
        )
        masked_input_ids, masked_indices = mask_tokens(inputs.input_ids)
        masked_input_ids = masked_input_ids.to("cuda")
        """
        # Debug: inspect the masked input.
        print("Original Text:", text)
        print(len(masked_input_ids[0]))
        print(
            "Masked Text:",
            " ".join(tokenizer.convert_ids_to_tokens(masked_input_ids[0].tolist())),
        )
        """
        # modernBert
        trueTokenCount, elapsedTime, predicted_index = getTrueTokenCountAndElapsedTime(
            modernBert, inputs, masked_input_ids, masked_indices
        )
        modernBertTrueTokenCount += trueTokenCount
        modernBertElapsedTime += elapsedTime
        # print("Predicted Text ModernBERT:", tokenizer.decode(predicted_index))
        totalMaskedTokens += len(masked_indices)
        concatenatedText = ""

print(totalMaskedTokens)
print(modernBertTrueTokenCount, modernBertElapsedTime)
# Masked-token prediction accuracy.
print(modernBertTrueTokenCount / totalMaskedTokens)
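
# Optional summary (an addition, not in the original script): masked-token
# throughput, using the accumulated forward-pass wall time from above as a
# rough proxy for model time.
if modernBertElapsedTime > 0:
    print(totalMaskedTokens / modernBertElapsedTime, "masked tokens scored per second")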