Spaces:

Ritvik19
/

marker-io

Runtime error

marker-io / marker /benchmark /scoring.py

Add all files and directories

c8a32e7 9 months ago

1.38 kB

	import math

	from rapidfuzz import fuzz
	import re
	import regex
	from statistics import mean

	CHUNK_MIN_CHARS = 25  # chunks with this many characters or fewer are dropped by chunk_text

	def chunk_text(text, chunk_len=500):
	    """Split ``text`` into fixed-size chunks, discarding ones that are too small.

	    The text is cut into consecutive slices of ``chunk_len`` characters (the
	    final slice may be shorter). A slice is kept only when it contains at
	    least one non-whitespace character and is longer than
	    ``CHUNK_MIN_CHARS`` characters. Returns the kept slices in order.
	    """
	    kept = []
	    for start in range(0, len(text), chunk_len):
	        piece = text[start:start + chunk_len]
	        # Skip whitespace-only slices and short leftovers (typically the tail).
	        if not piece.strip() or len(piece) <= CHUNK_MIN_CHARS:
	            continue
	        kept.append(piece)
	    return kept


	def overlap_score(hypothesis_chunks, reference_chunks):
	    """Score each hypothesis chunk against nearby reference chunks.

	    For every hypothesis chunk, a window of reference chunks around a
	    scaled index is searched and the best ``fuzz.ratio`` similarity
	    (divided by 100, so on a 0-1 scale) is recorded.

	    Args:
	        hypothesis_chunks: Sequence of text chunks to evaluate.
	        reference_chunks: Sequence of ground-truth text chunks.

	    Returns:
	        A list with one score per hypothesis chunk, in order. A chunk with
	        no match inside its window scores 0. When ``reference_chunks`` is
	        empty every hypothesis chunk scores 0.
	    """
	    ref_count = len(reference_chunks)
	    if ref_count == 0:
	        # Nothing to compare against; avoid dividing by zero below.
	        return [0] * len(hypothesis_chunks)
	    # NOTE(review): this scales the hypothesis index by len(hyp) / len(ref).
	    # Mapping a hypothesis position onto the reference would normally use
	    # len(ref) / len(hyp). Left unchanged to keep existing benchmark
	    # numbers stable -- confirm the intended direction.
	    length_modifier = len(hypothesis_chunks) / ref_count
	    # Search at least 10 chunks either side, or 20% of the reference.
	    search_distance = max(ref_count // 5, 10)
	    chunk_scores = []
	    for i, hyp_chunk in enumerate(hypothesis_chunks):
	        max_score = 0
	        i_offset = int(i * length_modifier)
	        window_start = max(0, i_offset - search_distance)
	        window_stop = min(ref_count, i_offset + search_distance)
	        for j in range(window_start, window_stop):
	            # score_cutoff=30: rapidfuzz reports similarities below 30 as 0.
	            score = fuzz.ratio(hyp_chunk, reference_chunks[j], score_cutoff=30) / 100
	            if score > max_score:
	                max_score = score
	        chunk_scores.append(max_score)
	    return chunk_scores


	def score_text(hypothesis, reference):
	    """Return a 0-1 alignment score between ``hypothesis`` and ``reference``.

	    Both texts are split with ``chunk_text``, each hypothesis chunk is
	    scored against the reference with ``overlap_score``, and the mean of
	    those per-chunk scores is returned.

	    Returns 0.0 when either text produces no usable chunks (for example an
	    empty or very short string), since there is nothing to align.
	    """
	    hypothesis_chunks = chunk_text(hypothesis)
	    reference_chunks = chunk_text(reference)
	    # mean() raises StatisticsError on an empty list, and with no reference
	    # chunks there is nothing to align against, so score those cases as 0.
	    if not hypothesis_chunks or not reference_chunks:
	        return 0.0
	    chunk_scores = overlap_score(hypothesis_chunks, reference_chunks)
	    return mean(chunk_scores)