# NOTE: page-scrape residue kept as a comment -- "Spaces: Build error"
import os | |
import re | |
import math | |
import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import matplotlib.ticker as mtick | |
import seaborn as sns | |
import nltk | |
import evaluate | |
# Module-level setup. Both statements below run at import time: the "meteor"
# module is loaded through `evaluate.load`, and the module path is printed.
meteor = evaluate.load("meteor")

print(f"loading: {__file__}")

# final version
# A run of five or more consecutive whitespace characters.
pattern_excessive_whitespaces = re.compile(r"\s{5,}")

# A chunk of at least five characters (group 1) followed -- with optional
# whitespace in between -- by one or more further copies of itself.
# DOTALL lets "." match newlines, so a repeated chunk may span several lines.
pattern_text_repetitions = re.compile(
    r"(.{5}.*)\s*((\1)\s*)+",
    re.MULTILINE | re.DOTALL,
)
def del_excessive_whitespaces(text, debug=False):
    """Delete runs of excessive whitespace from *text*.

    Every match of the module-level ``pattern_excessive_whitespaces`` is
    replaced with the empty string, i.e. removed outright.

    Args:
        text: Value to clean. Anything that is not a ``str`` is returned
            unchanged.
        debug: When true, print a banner and, if anything was removed, the
            number of characters removed.

    Returns:
        A ``(text, count)`` tuple: the (possibly cleaned) text and the
        number of characters deleted. ``count`` is 0 for non-string input.
    """
    if not isinstance(text, str):
        return text, 0

    if debug:
        print("----detect excessive whitespaces----")

    cleaned = pattern_excessive_whitespaces.sub("", text)
    # The length difference is exactly the number of characters dropped.
    removed = len(text) - len(cleaned)

    if debug and removed:
        print(f"removed excessive whitespaces: {removed}")

    return cleaned, removed
# final version for repetition detection | |
def detect_text_repetitions(text, debug=False):
    """Count the characters covered by repeated-text matches in *text*.

    The text is scanned with the module-level ``pattern_text_repetitions``
    and the full span length of every match is summed.

    Args:
        text: Value to scan. Anything that is not a ``str`` yields 0.
        debug: When true, print a banner, each match object, and the
            position and content of each of its capture groups.

    Returns:
        Total number of characters spanned by all matches (0 when there
        are none or *text* is not a string).
    """
    if not isinstance(text, str):
        return 0

    if debug:
        print("----detect text repetitions----")

    total = 0
    for found in pattern_text_repetitions.finditer(text):
        if debug:
            print(found)
            # Capture groups are numbered from 1; group 0 is the whole match.
            for idx in range(1, len(found.groups()) + 1):
                print(
                    "Group {groupNum} found at {start}-{end}: `{group}`".format(
                        groupNum=idx,
                        start=found.start(idx),
                        end=found.end(idx),
                        group=found.group(idx),
                    )
                )
        total += found.end() - found.start()

    return total
def detect_repetitions(text, debug=False):
    """Measure excessive whitespace and repeated text in *text*.

    Excessive whitespace is removed first via ``del_excessive_whitespaces``;
    the repetition scan (``detect_text_repetitions``) then runs on the
    cleaned text.

    Args:
        text: Value to analyse; forwarded to the two helpers.
        debug: Forwarded to both helpers; when true the result tuple is
            also printed.

    Returns:
        A 3-tuple ``(whitespace_count, repetition_count, total)`` where
        ``total`` is the sum of the first two.
    """
    cleaned, whitespace_count = del_excessive_whitespaces(text, debug=debug)
    repetition_count = detect_text_repetitions(cleaned, debug=debug)

    result = (
        whitespace_count,
        repetition_count,
        whitespace_count + repetition_count,
    )
    if debug:
        print(result)
    return result
def detect_scores(text, debug=False):
    """Return the counts from ``detect_repetitions`` as a pandas Series.

    Args:
        text: Value to analyse; forwarded to ``detect_repetitions``.
        debug: Forwarded to ``detect_repetitions``.

    Returns:
        A ``pd.Series`` of three values, in order: the excessive-whitespace
        count, the text-repetition count, and their total.
    """
    whitespace_count, repetition_count, total = detect_repetitions(
        text, debug=debug
    )
    scores = [whitespace_count, repetition_count, total]
    return pd.Series(scores)