Spaces:
Sleeping
Sleeping
import json | |
import datasketch | |
from datasketch import MinHash | |
class SongGuesser: | |
def guess_song(query): | |
with open("./swedish_christmas_songs.json", "r", encoding='utf-8') as f: | |
songs = json.load(f) | |
shingle_size = 5 | |
query_shingles = get_shingles(query, shingle_size) | |
query_minhash = create_minhash(query_shingles) | |
max_sim = 0 | |
max_name = "" | |
for song in songs: | |
song_lyrics = song['lyrics'].lower() | |
song_shingles = get_shingles(song_lyrics, shingle_size) | |
song_minhash = create_minhash(song_shingles) | |
estimated_jaccard = query_minhash.jaccard(song_minhash) | |
if estimated_jaccard > max_sim: | |
max_sim = estimated_jaccard | |
max_name = song['name'] | |
return max_name | |
def get_shingles(text, shingle_size): | |
return set(text[i:i+shingle_size] for i in range(len(text) - shingle_size + 1)) | |
def create_minhash(shingles, num_perm=128): | |
m = MinHash(num_perm=num_perm) | |
for shingle in shingles: | |
m.update(shingle.encode('utf8')) | |
return m | |