FoodDesert committed on
Commit
a077145
1 Parent(s): 075d09e

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +50 -7
  2. requirements.txt +1 -0
app.py CHANGED
@@ -9,6 +9,9 @@ import re
9
  import random
10
  import compress_fasttext
11
  from collections import OrderedDict
 
 
 
12
 
13
 
14
  faq_content="""
@@ -52,6 +55,34 @@ You can read more about TF-IDF on its [Wikipedia page](https://en.wikipedia.org/
52
  """
53
 
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  # Load the model and data once at startup
56
  with h5py.File('complete_artist_data.hdf5', 'r') as f:
57
  # Deserialize the vectorizer
@@ -99,11 +130,16 @@ def find_similar_tags(test_tags):
99
  # Find similar tags and prepare data for dataframe.
100
  results_data = []
101
  for tag in test_tags:
102
- similar_words = find_similar_tags.fasttext_small_model.most_similar(tag)
 
103
  result, seen = [], set()
104
- if tag in find_similar_tags.tag2aliases:
105
- result.append((tag, 1))
106
- seen.add(tag)
 
 
 
 
107
  else:
108
  for item in similar_words:
109
  similar_word, similarity = item
@@ -127,13 +163,20 @@ def find_similar_tags(test_tags):
127
  results_data.append(["", word, sim])
128
  results_data.append(["", "", ""]) # Adds a blank line after each group of tags
129
 
 
 
130
 
131
  return results_data # Return list of lists for Dataframe
132
 
133
  def find_similar_artists(new_tags_string, top_n):
134
- new_image_tags = [tag.replace('_', ' ').strip() for tag in new_tags_string.split(",")]
135
- unseen_tags = list(set(OrderedDict.fromkeys(new_image_tags)) - set(vectorizer.vocabulary_.keys()))
136
- unseen_tags_data = find_similar_tags(unseen_tags) if unseen_tags else [["No unseen tags", "", ""]]
 
 
 
 
 
137
 
138
  X_new_image = vectorizer.transform([','.join(new_image_tags)])
139
  similarities = cosine_similarity(X_new_image, X_artist)[0]
 
9
  import random
10
  import compress_fasttext
11
  from collections import OrderedDict
12
+ from lark import Lark
13
+ from lark import Token
14
+
15
 
16
 
17
  faq_content="""
 
55
  """
56
 
57
 
58
+ grammar=r"""
59
+ !start: (prompt | /[][():]/+)*
60
+ prompt: (emphasized | plain | comma | WHITESPACE)*
61
+ !emphasized: "(" prompt ")"
62
+ | "(" prompt ":" [WHITESPACE] NUMBER [WHITESPACE] ")"
63
+ comma: ","
64
+ WHITESPACE: /\s+/
65
+ plain: /([^,\\\[\]():|]|\\.)+/
66
+ %import common.SIGNED_NUMBER -> NUMBER
67
+ """
68
+ # Initialize the parser
69
+ parser = Lark(grammar, start='start')
70
+
71
+
72
+ # Function to extract tags
73
def extract_tags(tree, tag_token_type='__ANON_1'):
    """Collect the stripped text of every tag token in a parsed prompt tree.

    The tree is walked depth-first, left to right, so tags are returned in
    the order they appear in the prompt.

    Args:
        tree: Root of the parse result. Any object exposing a ``children``
            sequence is treated as an inner node; everything else is a leaf.
        tag_token_type: Value of a leaf's ``type`` attribute that marks it as
            a tag. The default ``'__ANON_1'`` is an auto-generated terminal
            name (presumably the anonymous regex of the grammar's ``plain``
            rule -- verify if the grammar is edited, since such names are
            positional and can shift).

    Returns:
        A list of tag strings with surrounding whitespace removed.
    """
    tags = []
    # Explicit stack instead of recursion: deeply nested "((((...))))"
    # prompts cannot exhaust the interpreter's recursion limit.
    pending = [tree]
    while pending:
        node = pending.pop()
        children = getattr(node, 'children', None)
        if children is not None:
            # Push in reverse so the left-most child is processed first.
            pending.extend(reversed(children))
        elif getattr(node, 'type', None) == tag_token_type:
            tags.append(node.value.strip())
        # Any other leaf (punctuation tokens, or a None placeholder that the
        # parser may emit for an unmatched optional item) is ignored.
    return tags
84
+
85
+
86
  # Load the model and data once at startup
87
  with h5py.File('complete_artist_data.hdf5', 'r') as f:
88
  # Deserialize the vectorizer
 
130
  # Find similar tags and prepare data for dataframe.
131
  results_data = []
132
  for tag in test_tags:
133
+ modified_tag_for_search = tag.replace(' ','_')
134
+ similar_words = find_similar_tags.fasttext_small_model.most_similar(modified_tag_for_search)
135
  result, seen = [], set()
136
+
137
+ if modified_tag_for_search in find_similar_tags.tag2aliases:
138
+ if tag in find_similar_tags.tag2aliases and "_" in tag: #Implicitly tell the user that they should get rid of the underscore
139
+ result.append(modified_tag_for_search.replace('_',' '), 1)
140
+ seen.add(tag)
141
+ else: #The user correctly did not put underscores in their tag
142
+ continue
143
  else:
144
  for item in similar_words:
145
  similar_word, similarity = item
 
163
  results_data.append(["", word, sim])
164
  results_data.append(["", "", ""]) # Adds a blank line after each group of tags
165
 
166
+ if not results_data:
167
+ results_data.append(["No Unknown Tags Found", "", ""])
168
 
169
  return results_data # Return list of lists for Dataframe
170
 
171
  def find_similar_artists(new_tags_string, top_n):
172
+ # Parse the prompt
173
+ parsed = parser.parse(new_tags_string)
174
+ # Extract tags from the parsed tree
175
+ new_image_tags = extract_tags(parsed)
176
+ new_image_tags = [tag.replace('_', ' ').strip() for tag in new_image_tags]
177
+
178
+ ###unseen_tags = list(set(OrderedDict.fromkeys(new_image_tags)) - set(vectorizer.vocabulary_.keys()))
179
+ unseen_tags_data = find_similar_tags(new_image_tags)
180
 
181
  X_new_image = vectorizer.transform([','.join(new_image_tags)])
182
  similarities = cosine_similarity(X_new_image, X_artist)[0]
requirements.txt CHANGED
@@ -4,3 +4,4 @@ scikit-learn==1.2.2
4
  h5py==3.8.0
5
  joblib==1.2.0
6
  compress-fasttext
 
 
4
  h5py==3.8.0
5
  joblib==1.2.0
6
  compress-fasttext
7
+ lark-parser