"""Hip Hop gRadio: a lyric-based song recommender served as a Gradio app.

At import time this script loads three CSV files (lyric embeddings,
engineered features, song metadata), projects the embeddings to 2-D with
UMAP, builds the Gradio UI and launches it.

NOTE(review): the copy of this file that was reviewed had lost its line
breaks and everything between '<' and '>' characters (HTML tags, and part of
``feature_similarity``).  Places that were reconstructed or that still look
damaged are marked with NOTE(review) comments below.
"""
import numpy as np
import gradio as gr
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px
import plotly.graph_objects as go
import umap

DEFAULT_ARTIST = 'Kanye West'

# Songs whose lower-cased title matches any of these are never recommended.
# The phrases are joined into one regular expression, so they match as
# substrings of the title.
EXCLUDE_PHRASES = ['clean', 'interlude', 'acoustic', 'mix', 'intro',
                   'original', 'version', 'edited', 'extended']

embedding_df = pd.read_csv('all-MiniLM-L12-v2_embeddings.csv')
embeddings = np.array(embedding_df.drop('id', axis=1))
feature_df = pd.read_csv('feature_df.csv', index_col=0)
feature_df = (feature_df - feature_df.mean()) / feature_df.std()  # standardize
info_df = pd.read_csv('song_info_df.csv')
info_df.sort_values(['artist_name', 'song_title'], inplace=True)


def feature_similarity(song_id):
    """Score every other song by closeness of its engineered features.

    Args:
        song_id: index label of the query song in ``feature_df``.

    Returns:
        DataFrame with columns ``song_id`` and ``feature_similarity``
        (larger means more similar).  The query song itself and songs with
        extreme standardized feature values are not included.
    """
    std_drop = 4  # drop songs with strange values
    song_vec = feature_df[feature_df.index.isin([song_id])].to_numpy()
    songs_matrix = feature_df[~feature_df.index.isin([song_id])].copy()
    # NOTE(review): everything from here to the return statement was missing
    # from the reviewed copy and is reconstructed from the surviving
    # fragments (`songs_matrix[(songs_matrix<...`, the "-> high similarity"
    # comment, the returned names) and the app's description of "euclidean
    # distances".  In particular confirm `.all(axis=1)` against the original.
    songs_matrix = songs_matrix[(songs_matrix < std_drop).all(axis=1)]
    song_ids = songs_matrix.index.to_list()
    distances = np.linalg.norm(songs_matrix.to_numpy() - song_vec, axis=1)
    max_distance = distances.max() if distances.size else 0.0
    if max_distance > 0:
        similarities = (max_distance - distances) / max_distance  # low distance -> high similarity
    else:
        # No candidates, or every candidate coincides with the query.
        similarities = np.ones_like(distances)
    return pd.DataFrame({'song_id': song_ids, 'feature_similarity': similarities})


def embedding_similarity(song_id):
    """Score every other song by cosine similarity of its lyric embedding.

    Args:
        song_id: value of the ``id`` column of ``embedding_df`` for the
            query song.

    Returns:
        DataFrame with columns ``song_id`` and ``cosine_similarity``, one
        row per song other than the query.

    Raises:
        IndexError: if ``song_id`` has no embedding.
    """
    is_other = (embedding_df.id != song_id).to_numpy()
    # Selecting rows with the same mask keeps ids and vectors aligned even
    # if the frame's index is not a plain 0..n-1 range.
    query = embeddings[~is_other][0].reshape(1, -1)
    similarities = cosine_similarity(embeddings[is_other], query)
    return pd.DataFrame({'song_id': embedding_df.id[is_other].to_list(),
                         'cosine_similarity': similarities[:, 0]})


def decode(song_id):
    """Return the display text "<song> by <artist>" for a song id.

    Raises:
        IndexError: if ``song_id`` is not present in ``info_df``.
    """
    row = info_df[info_df.song_id == song_id]
    artist = row.artist_name.values[0]
    song = row.song_title.values[0]
    # NOTE(review): the reviewed copy also built an unused `youtube_url`
    # string here; link markup around the title appears to have been
    # stripped from this literal.  Restore it from the original if available.
    return f' {song} by {artist}'


def _style_projection(figure, **layout):
    """Hide both axes and apply the shared size and title; return the figure."""
    figure.update_xaxes(visible=False)
    figure.update_yaxes(visible=False)
    figure.update_layout(
        height=800,
        width=1500,
        title={
            'text': "UMAP Projection of Lyric Embeddings",
            'y': 0.9,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
        },
        **layout,
    )
    return figure


def plot(artist, song):
    """Build the UMAP scatter with the chosen song highlighted in red.

    Args:
        artist: artist name as it appears in ``plot_df.artist_name``.
        song: song title as it appears in ``plot_df.song_title``.

    Returns:
        A plotly figure.  If the song is not in ``plot_df`` (outliers are
        filtered out of the projection) the figure has no highlight.
    """
    is_selected = (plot_df.artist_name == artist) & (plot_df.song_title == song)
    fig2 = px.scatter(
        plot_df[~is_selected],
        x='x',
        y='y',
        template='simple_white',
        hover_data=['artist_name', 'song_title'],
    ).update_traces(marker_size=1.5, marker_opacity=0.7)

    selected = plot_df[is_selected]
    if not selected.empty:
        fig2.add_trace(go.Scatter(
            x=[selected['x'].values[0]],
            y=[selected['y'].values[0]],
            mode='markers',
            marker_color='red',
            hovertemplate="Your selected song",
            marker_size=4,
        ))
        # Same trace order as before: highlight first, background second.
        fig2.data = [fig2.data[1], fig2.data[0]]

    return _style_projection(fig2, showlegend=False)


def _min_max(values):
    """Rescale a Series linearly onto [0, 1]; a constant Series maps to zeros."""
    low = values.min()
    span = values.max() - low
    if not span > 0:  # also true for an empty Series (span is NaN)
        return values * 0.0
    return (values - low) / span


def recommend(artist, song_title, embedding_importance, topk=5):
    """Recommend songs similar to the selected one.

    The score is ``embedding_importance * cosine_similarity +
    (1 - embedding_importance) * feature_similarity`` after both
    similarities have been min-max scaled.

    Args:
        artist: selected artist name.
        song_title: selected song title.
        embedding_importance: weight in [0, 1] given to lyric-embedding
            similarity; the rest goes to engineered-feature similarity.
        topk: number of recommendations to return.

    Returns:
        Tuple ``(markdown_text, figure)`` for the two Gradio outputs.  When
        the selection cannot be resolved, a short message and the
        un-highlighted base figure are returned instead of raising.
    """
    chosen = info_df[(info_df.artist_name == artist) & (info_df.song_title == song_title)]
    if chosen.empty:
        return 'Please select an artist and one of their songs.', fig
    song_id = chosen['song_id'].values[0]
    if song_id not in feature_df.index or not (embedding_df.id == song_id).any():
        return 'No lyric data is available for that song.', fig

    feature_importance = 1 - embedding_importance
    feature_sim = feature_similarity(song_id)
    embedding_sim = embedding_similarity(song_id)
    # Songs filtered out by feature_similarity come back as NaN and are dropped.
    result = embedding_sim.merge(feature_sim, how='left', on='song_id').dropna()
    result['cosine_similarity'] = _min_max(result['cosine_similarity'])
    result['feature_similarity'] = _min_max(result['feature_similarity'])
    result['score'] = (embedding_importance * result.cosine_similarity
                       + feature_importance * result.feature_similarity)

    excluded_titles = info_df.song_title.str.lower().str.contains(
        '|'.join(EXCLUDE_PHRASES), na=False)
    result = result[~result.song_id.isin(info_df[excluded_titles].song_id)]

    top_ids = result.sort_values('score', ascending=False).head(topk).song_id.to_list()
    # NOTE(review): the separator and the heading below look like they lost
    # HTML markup in the reviewed copy; the text is kept exactly as found.
    body = '\n'.join([decode(x) for x in top_ids])
    figure = plot(artist, song_title)
    return f'\n\nRecommendations\n\n\n{body}\n\n', figure


# 2-D projection of the lyric embeddings.  `embeddings` is used (rather than
# positional column slicing) so the projection sees exactly the columns the
# similarity search uses, wherever the 'id' column sits in the CSV.
out = umap.UMAP(n_neighbors=30, min_dist=0.2).fit_transform(embeddings)
plot_df = pd.DataFrame({'x': out[:, 0], 'y': out[:, 1], 'id': embedding_df.id})
plot_df['x'] = (plot_df['x'] - plot_df['x'].mean()) / plot_df['x'].std()
plot_df['y'] = (plot_df['y'] - plot_df['y'].mean()) / plot_df['y'].std()
plot_df = plot_df.merge(info_df, left_on='id', right_on='song_id')
# Drop projection outliers so the plot is not dominated by empty space.
plot_df = plot_df[(plot_df.x.abs() < 4) & (plot_df.y.abs() < 4)]

fig = _style_projection(
    px.scatter(
        plot_df,
        x='x',
        y='y',
        template='simple_white',
        hover_data=['artist_name', 'song_title'],
    ).update_traces(marker_size=1.5, opacity=0.7)
)

# The heading gets its own line; otherwise Markdown renders the whole first
# paragraph as part of the "###" heading.
ABOUT_MARKDOWN = (
    "### About this space\n"
    "The goal of this space is to provide recommendations for hip-hop/rap songs "
    "strictly by utilizing lyrics. The recommendations are a combination of ranked "
    "similarity scores. We calculate euclidean distances between our engineered "
    "feature vectors for each song, as well as a cosine distance between document "
    "embeddings of the lyrics themselves. A weighted average of these two results in "
    "our final similarity score that we use for recommendation. (feature importance = "
    "(1 - embedding importance)) Additionally, we provide a 2-D projection of all "
    "document embeddings below. After entering a song of your choice, you will see it "
    "as a red dot, allowing you to explore both near and far. This projection reduces "
    "384-dimensional embeddings down to 2-d, allowing visualization. This is done "
    "using Uniform Manifold Approximation and Projection "
    "[(UMAP)](https://umap-learn.readthedocs.io/en/latest/), a very interesting "
    "approach to dimensionality reduction, I encourage you to look into it if you are "
    "interested! \n"
    "([paper](https://arxiv.org/abs/1802.03426)) The engineered features used are the "
    "following: song duration, number of lines, syllables per line, variance in "
    "syllables per line, total unique tokens, lexical diversity (measure of "
    "repetition), sentiment (using nltk VADER), tokens per second, and syllables per "
    "second. **Model used for embedding**: "
    "[all-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2)\n"
    "**Lyrics**: from [genius](https://genius.com/) "
)

app = gr.Blocks()
with app:
    gr.Markdown("# Hip Hop gRadio - A Lyric Based Recommender")
    gr.Markdown(ABOUT_MARKDOWN)
    with gr.Row():
        with gr.Column():
            artist = gr.Dropdown(
                choices=list(info_df.artist_name.unique()),
                value=DEFAULT_ARTIST,
                label='Artist',
            )
            song = gr.Dropdown(
                choices=list(info_df.loc[info_df.artist_name == DEFAULT_ARTIST, 'song_title']),
                label='Song Title',
            )
            slider = gr.Slider(0, 1, value=0.5, label='Embedding Importance')
            but = gr.Button()
        with gr.Column():
            t = gr.Markdown('\n\nRecommendations\n\n')
    with gr.Row():
        p = gr.Plot(fig)

    def artist_songs(artist):
        """Refresh the song dropdown for the newly selected artist."""
        titles = info_df[info_df.artist_name == artist]['song_title'].to_list()
        # Clearing the value avoids leaving the previous artist's song selected.
        return gr.update(choices=titles, value=None)

    artist.change(artist_songs, artist, outputs=song)
    but.click(recommend, inputs=[artist, song, slider], outputs=[t, p])

app.launch()