Spaces:
Sleeping
Sleeping
import string | |
from collections import Counter | |
from nltk import word_tokenize | |
from nltk.corpus import stopwords | |
from nltk.stem import WordNetLemmatizer | |
from nltk.probability import FreqDist | |
import torch | |
def preprocess_text(text, remove_stopwords=True, use_lemmatization=True):
    """Turn *text* into a list of lowercase, purely alphabetic tokens.

    The text is lowercased and split with NLTK's ``word_tokenize``; every
    token for which ``str.isalpha`` is false is discarded.

    Args:
        text: Raw input string.
        remove_stopwords: When true, drop tokens that appear in NLTK's
            English stopword list.
        use_lemmatization: When true, pass each remaining token through
            ``WordNetLemmatizer.lemmatize``.

    Returns:
        The processed tokens, in their original order.
    """
    words = list(filter(str.isalpha, word_tokenize(text.lower())))

    if remove_stopwords:
        ignored = frozenset(stopwords.words("english"))
        words = [word for word in words if word not in ignored]

    if use_lemmatization:
        lemmatize = WordNetLemmatizer().lemmatize
        words = list(map(lemmatize, words))

    return words
def get_special_chars():
    """Build the set of strings treated as "special" (non-word) characters.

    The set combines ASCII punctuation, digits and whitespace, a hand-picked
    string of additional Unicode symbols, and every emoji listed by the
    ``emoji`` package.

    Returns:
        A ``set`` of strings.
    """
    # Imported lazily so the dependency is only required when the set is built.
    import emoji

    main_special_characters = string.punctuation + string.digits + string.whitespace
    other_special_characters = (
        " ’“”–ー一▬…✦�£•€«»°·═"
        "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰ ‑≤≥‖"
        "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
        "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
        "」﴾》"
    )

    # emoji<2.0 (e.g. 1.6.1) exposes UNICODE_EMOJI keyed by language; emoji>=2.0
    # dropped it in favour of EMOJI_DATA, whose keys are the emoji strings.
    # Prefer the legacy table when present so existing installs are unaffected.
    legacy_table = getattr(emoji, "UNICODE_EMOJI", None)
    if legacy_table is not None:
        emoji_strings = list(legacy_table["en"].keys())
    else:
        emoji_strings = list(emoji.EMOJI_DATA.keys())

    special_characters = set(main_special_characters + other_special_characters)
    special_characters.update(emoji_strings)
    return special_characters


# Built once at import time; punctuation_diversity reads this set.
special_characters_default = get_special_chars()
# -------------------- Features -------------------- | |
def syllable_count(word, d):
    """Count syllables for every pronunciation of *word* found in *d*.

    Args:
        word: Key to look up in *d*.
        d: Mapping from a word to a list of pronunciations, each one a
            sequence of phoneme strings (presumably the CMU Pronouncing
            Dictionary layout -- confirm against the caller).

    Returns:
        One integer per pronunciation: the number of phonemes whose last
        character is a digit. An empty list when *word* is not in *d*.
    """
    counts = []
    for pronunciation in d.get(word, []):
        # Only phonemes ending in a digit are counted.
        marked = [phoneme for phoneme in pronunciation if phoneme[-1].isdigit()]
        counts.append(len(marked))
    return counts
def estimated_slightly_difficult_words_ratio(text, d, min_syllables=2):
    """Return the fraction of tokens in *text* with at least *min_syllables*.

    A token counts as "slightly difficult" when any of its pronunciations in
    *d* has ``min_syllables`` or more syllables, as reported by
    ``syllable_count``. Tokens that are missing from *d* are never counted as
    difficult but still contribute to the total.

    Args:
        text: Raw input string; it is lowercased and split with NLTK's
            ``word_tokenize``.
        d: Pronunciation mapping passed through to ``syllable_count``
            (presumably the CMU Pronouncing Dictionary -- confirm against the
            caller).
        min_syllables: Syllable threshold for a token to count as difficult.

    Returns:
        ``difficult tokens / all tokens``, or 0 when *text* has no tokens.
    """
    words = word_tokenize(text.lower())
    if not words:
        return 0

    # Compare the syllable counts themselves against the threshold. The
    # previous code counted how many pronunciations a word had, which is
    # unrelated to its length in syllables.
    difficult_count = sum(
        1
        for word in words
        if any(count >= min_syllables for count in syllable_count(word, d))
    )
    return difficult_count / len(words)
# -------------------- Features -------------------- | |
def entity_density(text, nlp):
    """Return the number of entities in *text* per token.

    Args:
        text: Raw input string.
        nlp: Callable that turns *text* into a document object supporting
            ``len()`` and exposing an ``ents`` collection (presumably a spaCy
            pipeline -- confirm against the caller).

    Returns:
        ``len(doc.ents) / len(doc)``, or 0 when the document has no tokens.
    """
    doc = nlp(text)
    total_tokens = len(doc)
    # Guard the empty document: dividing by len(doc) would raise
    # ZeroDivisionError; the other ratio features in this file return 0 here.
    return len(doc.ents) / total_tokens if total_tokens else 0
# -------------------- Features -------------------- | |
def determiners_frequency(text, nlp):
    """Return the share of tokens in *text* whose ``pos_`` tag is ``"DET"``.

    Args:
        text: Raw input string.
        nlp: Callable that turns *text* into an iterable, sized document of
            tokens carrying a ``pos_`` attribute (presumably a spaCy pipeline
            -- confirm against the caller).

    Returns:
        ``determiner tokens / all tokens``, or 0 for an empty document.
    """
    doc = nlp(text)
    tags = [token.pos_ for token in doc]
    token_total = len(doc)
    if not token_total:
        return 0
    return tags.count("DET") / token_total
# -------------------- Features -------------------- | |
def punctuation_diversity(text):
    """Return how much of the special-character set is used in *text*.

    Args:
        text: Raw input string, scanned one character at a time.

    Returns:
        The number of distinct characters of *text* that belong to
        ``special_characters_default``, divided by the size of that set;
        0 when the set is empty.
    """
    if not special_characters_default:
        return 0
    distinct_specials = {
        char for char in text if char in special_characters_default
    }
    return len(distinct_specials) / len(special_characters_default)
# -------------------- Features -------------------- | |
def type_token_ratio(text, remove_stopwords=True, use_lemmatization=True):
    """Return the ratio of distinct tokens to all tokens in *text*.

    Args:
        text: Raw input string.
        remove_stopwords: Forwarded to ``preprocess_text``.
        use_lemmatization: Forwarded to ``preprocess_text``.

    Returns:
        ``unique tokens / total tokens`` over the preprocessed tokens, or 0
        when preprocessing leaves no tokens.
    """
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)
    if not tokens:
        return 0
    return len(set(tokens)) / len(tokens)
# -------------------- Features -------------------- | |
def hapax_legomena_ratio(text, remove_stopwords=True, use_lemmatization=True):
    """Return the share of tokens in *text* that occur exactly once.

    Args:
        text: Raw input string.
        remove_stopwords: Forwarded to ``preprocess_text``.
        use_lemmatization: Forwarded to ``preprocess_text``.

    Returns:
        ``number of hapaxes / total tokens`` over the preprocessed tokens, or
        0 when preprocessing leaves no tokens.
    """
    # Reuse the shared preprocessing helper, as type_token_ratio and mtld do,
    # instead of repeating the same tokenize/filter/lemmatize steps inline.
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)
    if not tokens:
        return 0
    hapaxes = FreqDist(tokens).hapaxes()
    return len(hapaxes) / len(tokens)
# -------------------- Features -------------------- | |
def mtld(text, threshold=0.72, remove_stopwords=True, use_lemmatization=True):
    """Return an MTLD-style lexical diversity score for *text*.

    The preprocessed tokens are scanned once forward and once backward. During
    a scan the running type/token ratio is tracked; each time it drops below
    *threshold* a "factor" is closed and the running counts are reset. The
    score of a scan is ``len(tokens) / number_of_factors`` and the result is
    the mean of the two scans.

    Args:
        text: Raw input string.
        threshold: Type/token ratio below which a factor is closed.
        remove_stopwords: Forwarded to ``preprocess_text``.
        use_lemmatization: Forwarded to ``preprocess_text``.

    Returns:
        The averaged forward/backward score, or 0 when there are no tokens.
    """
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)

    def mtld_calc(direction):
        """Score a single scan; *direction* is "forward" or anything else."""
        ordered = tokens if direction == "forward" else reversed(tokens)
        token_length, factor_count = 0, 0
        types = set()
        for token in ordered:
            types.add(token)
            token_length += 1
            if len(types) / token_length < threshold:
                factor_count += 1
                types = set()
                token_length = 0
        # A trailing segment that never fell below the threshold still counts
        # as one whole factor, but only if it actually contains tokens. When
        # the last token closed a factor there is no remainder to count.
        if token_length:
            factor_count += 1
        return len(tokens) / factor_count if factor_count else 0

    return (mtld_calc("forward") + mtld_calc("backward")) / 2
# -------------------- Features -------------------- | |
def calculate_max_depth(sent):
    """Return the largest ancestor count among the tokens of *sent*.

    Args:
        sent: Iterable of tokens, each exposing an iterable ``ancestors``
            attribute (presumably a spaCy sentence span -- confirm against
            the caller).

    Returns:
        The maximum number of ancestors of any token, or 0 when *sent* has no
        tokens.
    """
    # default=0 keeps an empty sentence from raising ValueError in max().
    return max((sum(1 for _ in token.ancestors) for token in sent), default=0)
def calculate_syntactic_tree_depth(text, nlp):
    """Return the mean per-sentence maximum depth for *text*.

    Args:
        text: Raw input string.
        nlp: Callable that turns *text* into a document exposing a ``sents``
            iterable (presumably a spaCy pipeline -- confirm against the
            caller).

    Returns:
        The average of ``calculate_max_depth`` over all sentences, or 0 when
        the document yields no sentences.
    """
    depths = list(map(calculate_max_depth, nlp(text).sents))
    if not depths:
        return 0
    return sum(depths) / len(depths)
# -------------------- Features -------------------- | |
def calculate_perplexity(text, model, tokenizer, device, stride=512):
    """Return the sliding-window perplexity of *text* under *model*.

    The tokenized text is scored in windows of at most
    ``model.config.n_positions`` tokens that start every *stride* tokens.
    In each window only the tokens not already scored by the previous window
    keep their labels; the rest are set to -100 so the loss skips them. The
    per-window losses are averaged with equal weight and exponentiated.

    Args:
        text: Raw input string.
        model: Callable invoked as ``model(input_ids, labels=...)`` that
            returns an object with a ``loss`` tensor and exposes
            ``config.n_positions`` (presumably a Hugging Face causal language
            model -- confirm against the caller).
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``
            that returns an object with an ``input_ids`` tensor indexed as
            ``[batch, position]``.
        device: Target passed to ``Tensor.to`` for each window of input ids.
        stride: Number of tokens between the starts of consecutive windows.

    Returns:
        The perplexity as a Python float, or 0.0 when *text* tokenizes to
        zero tokens.
    """
    encodings = tokenizer(text, return_tensors="pt")
    max_length = model.config.n_positions
    seq_len = encodings.input_ids.size(1)
    if seq_len == 0:
        # No window would be scored and torch.stack([]) would raise
        # RuntimeError; return 0 like the other features do for empty input.
        return 0.0

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        # Mask the overlap with the previous window so it is not scored twice.
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it
            # internally shifts the labels to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)
        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean())
    return ppl.item()