|
import math |
|
|
|
from rapidfuzz import fuzz |
|
import re |
|
import regex |
|
from statistics import mean |
|
|
|
# Length threshold used by chunk_text: a chunk is kept only when it is
# strictly longer than this many characters.
CHUNK_MIN_CHARS = 25
|
|
|
def chunk_text(text: str, chunk_len: int = 500) -> list:
    """Split ``text`` into consecutive, non-overlapping fixed-size chunks.

    Args:
        text: The string to split.
        chunk_len: Number of characters per chunk; the last chunk may be
            shorter.

    Returns:
        The chunks in their original order, excluding any chunk that is
        whitespace-only or not strictly longer than ``CHUNK_MIN_CHARS``.
    """
    chunks = [text[i:i + chunk_len] for i in range(0, len(text), chunk_len)]
    # Drop whitespace-only chunks and chunks too short to compare meaningfully
    # (typically the short tail left over at the end of the text).
    return [c for c in chunks if c.strip() and len(c) > CHUNK_MIN_CHARS]
|
|
|
|
|
def overlap_score(hypothesis_chunks: list, reference_chunks: list) -> list:
    """Score each hypothesis chunk against nearby reference chunks.

    For every hypothesis chunk, a window of reference chunks centred on the
    proportionally equivalent position is searched, and the best
    ``fuzz.ratio`` similarity (rescaled from 0-100 to 0-1) is recorded.

    Args:
        hypothesis_chunks: Chunks of the text being evaluated.
        reference_chunks: Chunks of the text to compare against.

    Returns:
        One best-match score per hypothesis chunk, in order. The list is
        empty when there are no hypothesis chunks, and all zeros when there
        are no reference chunks to match against.
    """
    if not hypothesis_chunks:
        return []
    if not reference_chunks:
        # Nothing to compare with: every chunk keeps the "no match" score
        # instead of triggering a division by zero below.
        return [0] * len(hypothesis_chunks)

    n_ref = len(reference_chunks)
    # Scale factor mapping a hypothesis index onto the reference index at the
    # same relative position. It must be ref/hyp: with hyp/ref the window
    # drifts away from the matching region whenever the lengths differ.
    length_modifier = n_ref / len(hypothesis_chunks)
    search_distance = max(n_ref // 5, 10)

    chunk_scores = []
    for i, hyp_chunk in enumerate(hypothesis_chunks):
        i_offset = int(i * length_modifier)
        start = max(0, i_offset - search_distance)
        stop = min(n_ref, i_offset + search_distance)
        # score_cutoff=30: rapidfuzz reports 0 for similarities below 30.
        best = max(
            (
                fuzz.ratio(hyp_chunk, reference_chunks[j], score_cutoff=30) / 100
                for j in range(start, stop)
            ),
            default=0,
        )
        chunk_scores.append(best)
    return chunk_scores
|
|
|
|
|
def score_text(hypothesis: str, reference: str) -> float:
    """Return the mean best-match chunk score of ``hypothesis`` vs ``reference``.

    Both texts are split with ``chunk_text``; each hypothesis chunk is scored
    against the reference chunks with ``overlap_score`` and the per-chunk
    scores are averaged.

    Args:
        hypothesis: The text being evaluated.
        reference: The text to compare against.

    Returns:
        The mean of the per-chunk scores, or 0.0 when either text produces
        no usable chunks.
    """
    hypothesis_chunks = chunk_text(hypothesis)
    reference_chunks = chunk_text(reference)

    # With no chunks on either side there is nothing to align; return the
    # lowest score rather than failing on an empty mean or a zero divisor.
    if not hypothesis_chunks or not reference_chunks:
        return 0.0

    chunk_scores = overlap_score(hypothesis_chunks, reference_chunks)
    return mean(chunk_scores)