machine-translation / eval_modules /calc_repetitions.py
dh-mc's picture
initial code for Chinese/English translation
3860729
raw
history blame
2.36 kB
import os
import re
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
import nltk
import evaluate
meteor = evaluate.load("meteor")
print(f"loading: {__file__}")
# final version
pattern_excessive_whitespaces = re.compile(r"\s{5,}")
pattern_text_repetitions = re.compile(r"(.{5}.*)\s*((\1)\s*)+", re.M | re.DOTALL)
def del_excessive_whitespaces(text, debug=False):
count = 0
if isinstance(text, str):
if debug:
print("----detect excessive whitespaces----")
count = len(text)
text = pattern_excessive_whitespaces.sub("", text)
count -= len(text)
if debug and count:
print(f"removed excessive whitespaces: {count}")
return text, count
# final version for repetition detection
def detect_text_repetitions(text, debug=False):
count = 0
if isinstance(text, str):
if debug:
print("----detect text repetitions----")
matches = pattern_text_repetitions.finditer(text)
for match in matches:
if debug:
print(match)
for groupNum in range(0, len(match.groups())):
groupNum = groupNum + 1
print(
"Group {groupNum} found at {start}-{end}: `{group}`".format(
groupNum=groupNum,
start=match.start(groupNum),
end=match.end(groupNum),
group=match.group(groupNum),
)
)
start, end = match.span()
count += end - start
return count
def detect_repetitions(text, debug=False):
text, count_excessive_whitespaces = del_excessive_whitespaces(text, debug=debug)
count_text_repetitions = detect_text_repetitions(text, debug=debug)
total_repetitions = count_excessive_whitespaces + count_text_repetitions
result = (count_excessive_whitespaces, count_text_repetitions, total_repetitions)
if debug:
print(result)
return result
def detect_scores(text, debug=False):
newline_score, repetition_score, total_repetitions = detect_repetitions(
text, debug=debug
)
return pd.Series([newline_score, repetition_score, total_repetitions])