import yaml
import subprocess

import nltk
from nltk import word_tokenize
from nltk.corpus import cmudict, stopwords
import spacy
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Circle, RegularPolygon
from matplotlib.path import Path
from matplotlib.projections import register_projection
from matplotlib.projections.polar import PolarAxes
from matplotlib.spines import Spine
from matplotlib.transforms import Affine2D
from writing_analysis import (
    estimated_slightly_difficult_words_ratio,
    entity_density,
    determiners_frequency,
    punctuation_diversity,
    type_token_ratio,
    calculate_perplexity,
    calculate_syntactic_tree_depth,
    hapax_legomena_ratio,
    mtld,
)
# One-time setup: NLTK corpora, the CMU pronouncing dictionary, the spaCy
# pipeline, run-time config, and the GPT-2 model used for perplexity scoring.
nltk.download("cmudict")
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
d = cmudict.dict()

subprocess.run(["python3", "-m", "spacy", "download", "en_core_web_sm"])
nlp = spacy.load("en_core_web_sm")

with open("config.yaml", "r") as file:
    params = yaml.safe_load(file)

device = "cuda" if torch.cuda.is_available() else "cpu"
readability_model_id = params["READABILITY_MODEL_ID"]
gpt2_model = GPT2LMHeadModel.from_pretrained(readability_model_id).to(device)
gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(readability_model_id)
def normalize(value, min_value, max_value):
    """Rescale value onto a 0-100 scale relative to (min_value, max_value),
    clamping anything outside that range to the endpoints."""
    normalized_value = ((value - min_value) * 100) / (max_value - min_value)
    return max(0, min(100, normalized_value))
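
# Illustrative check: normalize(0.5, 0.0, 2.0) == 25.0, while out-of-range
# inputs clamp to the endpoints, e.g. normalize(3.0, 0.0, 2.0) == 100.
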
def depth_analysis(input_text, bias_buster_selected):
    if bias_buster_selected:
        # NOTE: the original applied update() to an undefined name `text`;
        # it is assumed here that the intent was to transform `input_text`
        # with the project's (externally defined) update() helper.
        input_text = update(input_text)
    # Empirical (min, max) bounds for each metric, used to map raw scores
    # onto a common 0-100 scale.
    usual_ranges = {
        "estimated_slightly_difficult_words_ratio": (
            0.2273693623058005,
            0.557383692351033,
        ),
        "entity_density": (-0.07940776754145815, 0.23491038179986615),
        "determiners_frequency": (0.012461059190031154, 0.15700934579439252),
        "punctuation_diversity": (-0.21875, 0.53125),
        "type_token_ratio": (0.33002482852189063, 1.0894414982357028),
        "calculate_perplexity": (-25.110544681549072, 82.4620680809021),
        "calculate_syntactic_tree_depth": (
            1.8380681818181812,
            10.997159090909092,
        ),
        "hapax_legomena_ratio": (0.0830971690138207, 1.0302715687215778),
        "mtld": (-84.03125000000001, 248.81875000000002),
    }
    # Raw stylometric measurements.
    vocabulary_level = estimated_slightly_difficult_words_ratio(input_text, d)
    entity_ratio = entity_density(input_text, nlp)
    determiner_use = determiners_frequency(input_text, nlp)
    punctuation_variety = punctuation_diversity(input_text)
    sentence_depth = calculate_syntactic_tree_depth(input_text, nlp)
    perplexity = calculate_perplexity(input_text, gpt2_model, gpt2_tokenizer, device)
    lexical_diversity = type_token_ratio(input_text)
    unique_words = hapax_legomena_ratio(input_text)
    vocabulary_stability = mtld(input_text)
    # Normalize every measurement onto 0-100.
    vocabulary_level_norm = normalize(
        vocabulary_level, *usual_ranges["estimated_slightly_difficult_words_ratio"]
    )
    entity_ratio_norm = normalize(entity_ratio, *usual_ranges["entity_density"])
    determiner_use_norm = normalize(
        determiner_use, *usual_ranges["determiners_frequency"]
    )
    punctuation_variety_norm = normalize(
        punctuation_variety, *usual_ranges["punctuation_diversity"]
    )
    lexical_diversity_norm = normalize(
        lexical_diversity, *usual_ranges["type_token_ratio"]
    )
    unique_words_norm = normalize(unique_words, *usual_ranges["hapax_legomena_ratio"])
    vocabulary_stability_norm = normalize(vocabulary_stability, *usual_ranges["mtld"])
    sentence_depth_norm = normalize(
        sentence_depth, *usual_ranges["calculate_syntactic_tree_depth"]
    )
    perplexity_norm = normalize(perplexity, *usual_ranges["calculate_perplexity"])
    features = {
        "Lexical Diversity": lexical_diversity_norm,
        "Vocabulary Level": vocabulary_level_norm,
        "Unique Words": unique_words_norm,
        "Determiner Use": determiner_use_norm,
        "Punctuation Variety": punctuation_variety_norm,
        "Sentence Depth": sentence_depth_norm,
        "Vocabulary Stability": vocabulary_stability_norm,
        "Entity Ratio": entity_ratio_norm,
        "Perplexity": perplexity_norm,
    }
    def radar_factory(num_vars, frame="circle"):
        """Create a radar-chart projection with num_vars axes and return the
        axis angles (based on the matplotlib radar chart gallery example)."""
        theta = np.linspace(0, 2 * np.pi, num_vars, endpoint=False)

        class RadarTransform(PolarAxes.PolarTransform):
            def transform_path_non_affine(self, path):
                # Draw gridlines as straight segments between axes instead
                # of circular arcs.
                if path._interpolation_steps > 1:
                    path = path.interpolated(num_vars)
                return Path(self.transform(path.vertices), path.codes)

        class RadarAxes(PolarAxes):
            name = "radar"
            PolarTransform = RadarTransform

            def __init__(self, *args, **kwargs):
                super().__init__(*args, **kwargs)
                self.set_theta_zero_location("N")

            def fill(self, *args, closed=True, **kwargs):
                return super().fill(closed=closed, *args, **kwargs)

            def plot(self, *args, **kwargs):
                lines = super().plot(*args, **kwargs)
                for line in lines:
                    self._close_line(line)

            def _close_line(self, line):
                x, y = line.get_data()
                if x[0] != x[-1]:
                    # Repeat the first point so the outline closes on itself.
                    x = np.append(x, x[0])
                    y = np.append(y, y[0])
                    line.set_data(x, y)

            def set_varlabels(self, labels):
                self.set_thetagrids(np.degrees(theta), labels)

            def _gen_axes_patch(self):
                if frame == "circle":
                    return Circle((0.5, 0.5), 0.5)
                elif frame == "polygon":
                    return RegularPolygon(
                        (0.5, 0.5), num_vars, radius=0.5, edgecolor="k"
                    )

            def _gen_axes_spines(self):
                if frame == "polygon":
                    spine = Spine(
                        axes=self,
                        spine_type="circle",
                        path=Path.unit_regular_polygon(num_vars),
                    )
                    spine.set_transform(
                        Affine2D().scale(0.5).translate(0.5, 0.5)
                        + self.transAxes
                    )
                    return {"polar": spine}
                # Fall back to the default polar spines; the original
                # implicitly returned None here, which would break
                # frame="circle".
                return super()._gen_axes_spines()

        register_projection(RadarAxes)
        return theta
    # Draw the radar chart.
    N = 9
    theta = radar_factory(N, frame="polygon")
    # Materialize the dict views: matplotlib plotting and the positional
    # indexing below expect sequences, not dict_keys/dict_values.
    data = list(features.values())
    labels = list(features.keys())

    fig, ax = plt.subplots(subplot_kw=dict(projection="radar"), figsize=(7.5, 5))
    ax.plot(theta, data)
    ax.fill(theta, data, alpha=0.4)
    ax.set_varlabels(labels)

    rgrids = np.linspace(0, 100, num=6)
    ax.set_rgrids(
        rgrids,
        labels=[f"{round(r)}%" for r in rgrids],
        fontsize=8,
        color="black",
    )
    ax.grid(True, color="black", linestyle="-", linewidth=0.5, alpha=0.5)

    # Annotate each vertex with its rounded score.
    for dd, value in enumerate(data):
        ax.text(
            theta[dd] + 0.1,
            value + 5,
            f"{value:.0f}",
            horizontalalignment="left",
            verticalalignment="bottom",
            fontsize=8,
        )
    return fig
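

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: the sample text
    # and output filename are placeholders, and running it assumes config.yaml,
    # the writing_analysis helpers, and the downloads above are all available.
    sample_text = (
        "The quick brown fox jumps over the lazy dog. "
        "It then naps in the afternoon sun, dreaming of distant meadows."
    )
    fig = depth_analysis(sample_text, bias_buster_selected=False)
    fig.savefig("writing_radar.png", bbox_inches="tight")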