copyright_checker / analysis.py
aliasgerovs's picture
Updated - Fixed plagiarism.
af21e05
import yaml
import subprocess
import nltk
from nltk import word_tokenize
from nltk.corpus import cmudict, stopwords
import spacy
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import matplotlib.pyplot as plt
import numpy as np
from predictors import update,update_main, correct_text, split_text
from matplotlib.patches import Circle, RegularPolygon
from matplotlib.path import Path
from matplotlib.projections import register_projection
from matplotlib.projections.polar import PolarAxes
from matplotlib.spines import Spine
from matplotlib.transforms import Affine2D
from writing_analysis import (
estimated_slightly_difficult_words_ratio,
entity_density,
determiners_frequency,
punctuation_diversity,
type_token_ratio,
calculate_perplexity,
calculate_syntactic_tree_depth,
hapax_legomena_ratio,
mtld,
)
# --- Import-time setup: every statement below runs once, when this module is imported. ---

# Fetch the NLTK resources, in the same order as always.
for _nltk_resource in ("cmudict", "punkt", "stopwords", "wordnet"):
    nltk.download(_nltk_resource)

# CMU pronouncing dictionary; handed to the difficult-words metric further down.
d = cmudict.dict()

# Install the small English spaCy pipeline through a child process, then load it.
# The child's exit status is not inspected; if the download failed, the
# spacy.load() call below is where the problem would surface.
command = ["python3", "-m", "spacy", "download", "en_core_web_sm"]
subprocess.run(command)
nlp = spacy.load("en_core_web_sm")

# Project settings are read from config.yaml, resolved against the current
# working directory.
with open("config.yaml", "r") as file:
    params = yaml.safe_load(file)

# Use the GPU when torch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# GPT-2 model and tokenizer used by the perplexity metric; the checkpoint id
# comes from the READABILITY_MODEL_ID entry of the config.
readability_model_id = params["READABILITY_MODEL_ID"]
gpt2_model = GPT2LMHeadModel.from_pretrained(readability_model_id).to(device)
gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(readability_model_id)
def normalize(value, min_value, max_value):
    """Rescale ``value`` from the range ``min_value..max_value`` onto 0-100.

    The position of ``value`` inside the range is expressed as a percentage
    and then clamped, so (for ``max_value > min_value``) inputs at or below
    ``min_value`` give 0 and inputs at or above ``max_value`` give 100.

    Args:
        value: The raw number to rescale.
        min_value: The value that maps to 0.
        max_value: The value that maps to 100.

    Returns:
        The percentage, limited to the interval 0-100.

    Raises:
        ZeroDivisionError: If ``min_value`` and ``max_value`` are equal
            (for ordinary Python numbers).
    """
    offset = (value - min_value) * 100
    percent = offset / (max_value - min_value)
    # Apply the upper limit first and the lower limit second.
    capped = percent if percent < 100 else 100
    return capped if capped > 0 else 0
def depth_analysis(input_text, bias_buster_selected):
    """Score a text on nine writing metrics and draw them as a radar chart.

    Each metric is computed by a helper imported from ``writing_analysis``,
    rescaled to 0-100 with ``normalize`` using the fixed bounds in
    ``usual_ranges``, and plotted on one spoke of a nine-sided radar chart.

    Args:
        input_text: The text to analyse.
        bias_buster_selected: When truthy, ``input_text`` is first passed
            through ``predictors.update`` and the returned text is analysed
            instead.

    Returns:
        The matplotlib ``Figure`` containing the radar chart.

    NOTE(review): the figure is created through ``plt.subplots``, so pyplot
    presumably keeps a reference to it until it is closed; confirm the caller
    closes figures in long-running processes.
    """
    if bias_buster_selected:
        input_text = update(input_text)

    # (min, max) bounds used to rescale each raw metric onto 0-100.
    # NOTE(review): presumably empirical "typical" ranges; provenance is not
    # visible in this file -- confirm before changing them.
    usual_ranges = {
        "estimated_slightly_difficult_words_ratio": (
            0.2273693623058005,
            0.557383692351033,
        ),
        "entity_density": (-0.07940776754145815, 0.23491038179986615),
        "determiners_frequency": (0.012461059190031154, 0.15700934579439252),
        "punctuation_diversity": (-0.21875, 0.53125),
        "type_token_ratio": (0.33002482852189063, 1.0894414982357028),
        "calculate_perplexity": (-25.110544681549072, 82.4620680809021),
        "calculate_syntactic_tree_depth": (
            1.8380681818181812,
            10.997159090909092,
        ),
        "hapax_legomena_ratio": (0.0830971690138207, 1.0302715687215778),
        "mtld": (-84.03125000000001, 248.81875000000002),
    }

    # Raw metric values.
    vocabulary_level = estimated_slightly_difficult_words_ratio(input_text, d)
    entity_ratio = entity_density(input_text, nlp)
    determiner_use = determiners_frequency(input_text, nlp)
    punctuation_variety = punctuation_diversity(input_text)
    sentence_depth = calculate_syntactic_tree_depth(input_text, nlp)
    perplexity = calculate_perplexity(
        input_text, gpt2_model, gpt2_tokenizer, device
    )
    lexical_diversity = type_token_ratio(input_text)
    unique_words = hapax_legomena_ratio(input_text)
    vocabulary_stability = mtld(input_text)

    # Chart label -> metric normalised to 0-100. Insertion order fixes the
    # order of the spokes around the chart.
    features = {
        "Lexical Diversity": normalize(
            lexical_diversity, *usual_ranges["type_token_ratio"]
        ),
        "Vocabulary Level": normalize(
            vocabulary_level,
            *usual_ranges["estimated_slightly_difficult_words_ratio"],
        ),
        "Unique Words": normalize(
            unique_words, *usual_ranges["hapax_legomena_ratio"]
        ),
        "Determiner Use": normalize(
            determiner_use, *usual_ranges["determiners_frequency"]
        ),
        "Punctuation Variety": normalize(
            punctuation_variety, *usual_ranges["punctuation_diversity"]
        ),
        "Sentence Depth": normalize(
            sentence_depth, *usual_ranges["calculate_syntactic_tree_depth"]
        ),
        "Vocabulary Stability": normalize(
            vocabulary_stability, *usual_ranges["mtld"]
        ),
        "Entity Ratio": normalize(
            entity_ratio, *usual_ranges["entity_density"]
        ),
        "Perplexity": normalize(
            perplexity, *usual_ranges["calculate_perplexity"]
        ),
    }

    def radar_factory(num_vars, frame="circle"):
        """Register a ``"radar"`` projection and return its spoke angles.

        Args:
            num_vars: Number of spokes (one per plotted variable).
            frame: Outline of the axes, either ``"circle"`` or ``"polygon"``.

        Returns:
            Array of ``num_vars`` evenly spaced angles in radians.

        Raises:
            ValueError: Raised when the axes are created, if ``frame`` is
                neither ``"circle"`` nor ``"polygon"``.
        """
        theta = np.linspace(0, 2 * np.pi, num_vars, endpoint=False)

        class RadarTransform(PolarAxes.PolarTransform):
            def transform_path_non_affine(self, path):
                # Re-interpolate paths that ask for more than one
                # interpolation step to num_vars steps before transforming.
                # Per the upstream Matplotlib radar example these are the
                # gridlines, which would otherwise be drawn as circular arcs.
                if path._interpolation_steps > 1:
                    path = path.interpolated(num_vars)
                return Path(self.transform(path.vertices), path.codes)

        class RadarAxes(PolarAxes):
            name = "radar"
            PolarTransform = RadarTransform

            def __init__(self, *args, **kwargs):
                super().__init__(*args, **kwargs)
                # Put the first spoke at the top of the chart.
                self.set_theta_zero_location("N")

            def fill(self, *args, closed=True, **kwargs):
                """Fill like ``PolarAxes.fill`` but closed by default."""
                return super().fill(*args, closed=closed, **kwargs)

            def plot(self, *args, **kwargs):
                """Plot like ``PolarAxes.plot`` and close every line."""
                lines = super().plot(*args, **kwargs)
                for line in lines:
                    self._close_line(line)
                # Hand the artists back, as the base-class plot() does.
                return lines

            def _close_line(self, line):
                x, y = line.get_data()
                # Repeat the first point at the end so the outline closes.
                if x[0] != x[-1]:
                    x = np.append(x, x[0])
                    y = np.append(y, y[0])
                    line.set_data(x, y)

            def set_varlabels(self, labels):
                self.set_thetagrids(np.degrees(theta), labels)

            def _gen_axes_patch(self):
                # The patch is centred at (0.5, 0.5) with radius 0.5.
                if frame == "circle":
                    return Circle((0.5, 0.5), 0.5)
                if frame == "polygon":
                    return RegularPolygon(
                        (0.5, 0.5), num_vars, radius=0.5, edgecolor="k"
                    )
                raise ValueError(f"Unknown value for 'frame': {frame}")

            def _gen_axes_spines(self):
                if frame == "circle":
                    # The stock polar spines already match a circular frame.
                    return super()._gen_axes_spines()
                if frame == "polygon":
                    spine = Spine(
                        axes=self,
                        spine_type="circle",
                        path=Path.unit_regular_polygon(num_vars),
                    )
                    # Scale the unit polygon by 0.5 and shift it to
                    # (0.5, 0.5) so it lines up with the axes patch.
                    spine.set_transform(
                        Affine2D().scale(0.5).translate(0.5, 0.5)
                        + self.transAxes
                    )
                    return {"polar": spine}
                raise ValueError(f"Unknown value for 'frame': {frame}")

        register_projection(RadarAxes)
        return theta

    # Materialise labels and values once, in matching order.
    labels = list(features)
    data = list(features.values())
    theta = radar_factory(len(features), frame="polygon")

    fig, ax = plt.subplots(
        subplot_kw={"projection": "radar"}, figsize=(7.5, 5)
    )
    ax.plot(theta, data)
    ax.fill(theta, data, alpha=0.4)
    ax.set_varlabels(labels)

    # Radial gridlines every 20 points, labelled as percentages.
    rgrids = np.linspace(0, 100, num=6)
    ax.set_rgrids(
        rgrids,
        labels=[f"{round(r)}%" for r in rgrids],
        fontsize=8,
        color="black",
    )
    ax.grid(True, color="black", linestyle="-", linewidth=0.5, alpha=0.5)

    # Print each score next to its vertex, nudged off the point itself.
    for angle, value in zip(theta, data):
        ax.text(
            angle + 0.1,
            value + 5,
            f"{value:.0f}",
            horizontalalignment="left",
            verticalalignment="bottom",
            fontsize=8,
        )
    return fig