Spaces:
Running
Running
FoodDesert
committed on
Commit
•
a077145
1
Parent(s):
075d09e
Upload 2 files
Browse files- app.py +50 -7
- requirements.txt +1 -0
app.py
CHANGED
@@ -9,6 +9,9 @@ import re
|
|
9 |
import random
|
10 |
import compress_fasttext
|
11 |
from collections import OrderedDict
|
|
|
|
|
|
|
12 |
|
13 |
|
14 |
faq_content="""
|
@@ -52,6 +55,34 @@ You can read more about TF-IDF on its [Wikipedia page](https://en.wikipedia.org/
|
|
52 |
"""
|
53 |
|
54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
# Load the model and data once at startup
|
56 |
with h5py.File('complete_artist_data.hdf5', 'r') as f:
|
57 |
# Deserialize the vectorizer
|
@@ -99,11 +130,16 @@ def find_similar_tags(test_tags):
|
|
99 |
# Find similar tags and prepare data for dataframe.
|
100 |
results_data = []
|
101 |
for tag in test_tags:
|
102 |
-
|
|
|
103 |
result, seen = [], set()
|
104 |
-
|
105 |
-
|
106 |
-
|
|
|
|
|
|
|
|
|
107 |
else:
|
108 |
for item in similar_words:
|
109 |
similar_word, similarity = item
|
@@ -127,13 +163,20 @@ def find_similar_tags(test_tags):
|
|
127 |
results_data.append(["", word, sim])
|
128 |
results_data.append(["", "", ""]) # Adds a blank line after each group of tags
|
129 |
|
|
|
|
|
130 |
|
131 |
return results_data # Return list of lists for Dataframe
|
132 |
|
133 |
def find_similar_artists(new_tags_string, top_n):
|
134 |
-
|
135 |
-
|
136 |
-
|
|
|
|
|
|
|
|
|
|
|
137 |
|
138 |
X_new_image = vectorizer.transform([','.join(new_image_tags)])
|
139 |
similarities = cosine_similarity(X_new_image, X_artist)[0]
|
|
|
9 |
import random
|
10 |
import compress_fasttext
|
11 |
from collections import OrderedDict
|
12 |
+
from lark import Lark
|
13 |
+
from lark import Token
|
14 |
+
|
15 |
|
16 |
|
17 |
faq_content="""
|
|
|
55 |
"""
|
56 |
|
57 |
|
58 |
+
grammar=r"""
|
59 |
+
!start: (prompt | /[][():]/+)*
|
60 |
+
prompt: (emphasized | plain | comma | WHITESPACE)*
|
61 |
+
!emphasized: "(" prompt ")"
|
62 |
+
| "(" prompt ":" [WHITESPACE] NUMBER [WHITESPACE] ")"
|
63 |
+
comma: ","
|
64 |
+
WHITESPACE: /\s+/
|
65 |
+
plain: /([^,\\\[\]():|]|\\.)+/
|
66 |
+
%import common.SIGNED_NUMBER -> NUMBER
|
67 |
+
"""
|
68 |
+
# Initialize the parser
|
69 |
+
parser = Lark(grammar, start='start')
|
70 |
+
|
71 |
+
|
72 |
+
# Function to extract tags
|
73 |
+
def extract_tags(tree):
    """Collect the plain-text tags found in a parsed prompt tree.

    The tree is walked depth-first, visiting children in order. The value of
    every token whose type is ``'__ANON_1'`` is stripped of surrounding
    whitespace and appended to the result.

    Args:
        tree: Root of the parse result. Each node is either a ``Token`` or an
            object exposing a ``children`` sequence.

    Returns:
        A list of tag strings in the order they were encountered.
    """
    tags = []

    def _traverse(node):
        if isinstance(node, Token):
            # NOTE(review): '__ANON_1' looks like the parser-generated name of
            # the unnamed regexp terminal used by the `plain` grammar rule; it
            # would presumably shift if another unnamed regexp were added
            # ahead of it in the grammar -- confirm before editing the grammar.
            if node.type == '__ANON_1':
                tags.append(node.value.strip())
            return
        # Nodes without `children` are skipped instead of raising
        # AttributeError. This covers a None child, which newer Lark releases
        # are documented to insert for an unmatched optional `[...]` item.
        for child in getattr(node, 'children', ()):
            _traverse(child)

    _traverse(tree)
    return tags
|
84 |
+
|
85 |
+
|
86 |
# Load the model and data once at startup
|
87 |
with h5py.File('complete_artist_data.hdf5', 'r') as f:
|
88 |
# Deserialize the vectorizer
|
|
|
130 |
# Find similar tags and prepare data for dataframe.
|
131 |
results_data = []
|
132 |
for tag in test_tags:
|
133 |
+
modified_tag_for_search = tag.replace(' ','_')
|
134 |
+
similar_words = find_similar_tags.fasttext_small_model.most_similar(modified_tag_for_search)
|
135 |
result, seen = [], set()
|
136 |
+
|
137 |
+
if modified_tag_for_search in find_similar_tags.tag2aliases:
|
138 |
+
if tag in find_similar_tags.tag2aliases and "_" in tag: #Implicitly tell the user that they should get rid of the underscore
|
139 |
+
result.append(modified_tag_for_search.replace('_',' '), 1)
|
140 |
+
seen.add(tag)
|
141 |
+
else: #The user correctly did not put underscores in their tag
|
142 |
+
continue
|
143 |
else:
|
144 |
for item in similar_words:
|
145 |
similar_word, similarity = item
|
|
|
163 |
results_data.append(["", word, sim])
|
164 |
results_data.append(["", "", ""]) # Adds a blank line after each group of tags
|
165 |
|
166 |
+
if not results_data:
|
167 |
+
results_data.append(["No Unknown Tags Found", "", ""])
|
168 |
|
169 |
return results_data # Return list of lists for Dataframe
|
170 |
|
171 |
def find_similar_artists(new_tags_string, top_n):
|
172 |
+
# Parse the prompt
|
173 |
+
parsed = parser.parse(new_tags_string)
|
174 |
+
# Extract tags from the parsed tree
|
175 |
+
new_image_tags = extract_tags(parsed)
|
176 |
+
new_image_tags = [tag.replace('_', ' ').strip() for tag in new_image_tags]
|
177 |
+
|
178 |
+
###unseen_tags = list(set(OrderedDict.fromkeys(new_image_tags)) - set(vectorizer.vocabulary_.keys()))
|
179 |
+
unseen_tags_data = find_similar_tags(new_image_tags)
|
180 |
|
181 |
X_new_image = vectorizer.transform([','.join(new_image_tags)])
|
182 |
similarities = cosine_similarity(X_new_image, X_artist)[0]
|
requirements.txt
CHANGED
@@ -4,3 +4,4 @@ scikit-learn==1.2.2
|
|
4 |
h5py==3.8.0
|
5 |
joblib==1.2.0
|
6 |
compress-fasttext
|
|
|
|
4 |
h5py==3.8.0
|
5 |
joblib==1.2.0
|
6 |
compress-fasttext
|
7 |
+
lark-parser
|