Spaces:

FoodDesert
/

Prompt_Squirrel

Running

App Files Files Community

FoodDesert committed on Apr 19, 2024

Commit

b4bf2a9

verified ·

1 Parent(s): 503dc78

Upload 2 files

Browse files

Files changed (2) hide show

app.py +81 -26
tf_idf_files_418.joblib +3 -0

app.py CHANGED Viewed

@@ -21,6 +21,7 @@ import os
 import glob
 import itertools
 from itertools import islice
@@ -159,6 +160,26 @@ def remove_special_tags(original_string):
     removed_tags = [tag for tag in tags if tag in special_tags]
     return ", ".join(remaining_tags), removed_tags
 # Load the model and data once at startup
 with h5py.File('complete_artist_data.hdf5', 'r') as f:
@@ -204,6 +225,24 @@ with open("word_rating_probabilities.csv", 'r', newline='', encoding='utf-8') as
             nsfw_tags.add(word)
 sample_images_directory_path = 'sampleimages'
 def generate_artist_image_tuples(top_artists, image_directory):
     json_files = glob.glob(f'{image_directory}/*.json')
@@ -404,6 +443,7 @@ def construct_pseudo_vector(pseudo_doc_terms, idf_loaded, tag_to_row_loaded):
     # Return the vector as a 2D array for compatibility with SVD transform
     return pseudo_vector.reshape(1, -1)
 def get_top_indices(reduced_pseudo_vector, reduced_matrix):
     # Compute cosine similarities
@@ -415,35 +455,42 @@ def get_top_indices(reduced_pseudo_vector, reduced_matrix):
     # Return the top N indices
     return sorted_indices
 def get_tfidf_reduced_similar_tags(pseudo_doc_terms, allow_nsfw_tags):
-    # Check and load components if not already loaded
-    if not hasattr(get_tfidf_reduced_similar_tags, "components"):
-        get_tfidf_reduced_similar_tags.components = joblib.load('tfidfreducedfiles.joblib')
-    # Access components
-    components = get_tfidf_reduced_similar_tags.components
-    idf_loaded = components['idf']
-    tag_to_row_loaded = components['tag_to_row']
-    reduced_matrix_loaded = components['reduced_matrix']
-    svd_loaded = components['svd_model']
-    # Remaining part of the function
-    pseudo_vector = construct_pseudo_vector(pseudo_doc_terms, idf_loaded, tag_to_row_loaded)
-    reduced_pseudo_vector = svd_loaded.transform(pseudo_vector)
-    # Compute cosine similarities
-    similarities = cosine_similarity(reduced_pseudo_vector, reduced_matrix_loaded).flatten()
-    # Get top N indices based on similarities
-    top_indices_reduced = get_top_indices(reduced_pseudo_vector, reduced_matrix_loaded)
-    # Create the initial tag_similarity_dict
-    tag_similarity_dict = {list(tag_to_row_loaded.keys())[i]: similarities[i] for i in top_indices_reduced}
     if not allow_nsfw_tags:
-        tag_similarity_dict = {tag: similarity for tag, similarity in tag_similarity_dict.items() if tag.replace(' ', '_') not in nsfw_tags}
     sorted_tag_similarity_dict = OrderedDict(sorted(tag_similarity_dict.items(), key=lambda x: x[1], reverse=True))
-    return sorted_tag_similarity_dict
 def create_html_placeholder(title="", content="", placeholder_height=400, placeholder_width="100%"):
@@ -555,6 +602,7 @@ def build_tag_offsets_dicts(new_image_tags_with_positions):
         # Modify the tag
         modified_tag = tag_text.replace('_', ' ').replace('\\(', '(').replace('\\)', ')').strip()
         artist_matrix_tag = tag_text.replace('_', ' ').replace('\\(', '\(').replace('\\)', '\)').strip()
         # Calculate the end position based on the original tag length
         end_pos = start_pos + len(tag_text)
         # Append the structured data for each tag
@@ -564,6 +612,7 @@ def build_tag_offsets_dicts(new_image_tags_with_positions):
             "end_pos": end_pos,
             "modified_tag": modified_tag,
             "artist_matrix_tag": artist_matrix_tag,
             "node_type": nodetype
         })
     return tag_data
@@ -619,8 +668,13 @@ def find_similar_artists(original_tags_string, top_n, similarity_weight, allow_n
         suggested_tags_html_content = "<div class=\"scrollable-content\" style='display: inline-block; margin: 20px; text-align: center;'>"
         suggested_tags_html_content += "<h1>Suggested Tags</h1>"  # Heading for the table
-        suggested_tags = get_tfidf_reduced_similar_tags([item["artist_matrix_tag"] for item in tag_data], allow_nsfw_tags)
-        suggested_tags_filtered = OrderedDict((k, v) for k, v in suggested_tags.items() if k not in [entry["original_tag"] for entry in tag_data])
         topnsuggestions = list(islice(suggested_tags_filtered.items(), 100))
         suggested_tags_html_content += create_html_tables_for_tags("Suggested Tag", topnsuggestions, find_similar_tags.tag2count, find_similar_tags.tag2idwiki)
@@ -658,8 +712,9 @@ with gr.Blocks(css=css) as app:
                 #gr.Image(label=" ", value=image_path, height=155, width=140)
                 #gr.HTML('<div style="text-align: center;"><img src={image_path} alt="Cute Mascot" style="max-height: 100px; background: transparent;"></div><br>')
                 #gr.HTML("<br>" * 2)  # Adjust the number of line breaks ("<br>") as needed to push the button down
-                image_path = os.path.join('mascotimages', "transparentsquirrel.png")
-                with Image.open(image_path) as img:
                     gr.Image(value=img,show_label=False, show_download_button=False, show_share_button=False, height=200)
                 submit_button = gr.Button(variant="primary")
     with gr.Row():

 import glob
 import itertools
 from itertools import islice
+from pathlib import Path
     removed_tags = [tag for tag in tags if tag in special_tags]
     return ", ".join(remaining_tags), removed_tags
+# Define a function to load all necessary components
+def load_model_components(file_path):
+    # Loads the joblib file at file_path and returns the loaded object, adding a
+    # 'row_to_tag' reverse mapping when it contains a 'tag_to_row_index' entry.
+    # Raises FileNotFoundError when file_path does not point to an existing file.
+    # NOTE(review): presumably the loaded object is a dict of model components
+    # (it is used with `in` and item assignment below) — confirm against the file.
+    # Ensure the file path is a Path object for robust path handling
+    file_path = Path(file_path)
+    # Check if the file exists
+    if not file_path.is_file():
+        raise FileNotFoundError(f"The specified joblib file was not found: {file_path}")
+    # Load all the model components from the joblib file
+    model_components = joblib.load(file_path)
+    # Create a reverse mapping from row index to tag
+    # NOTE(review): when 'tag_to_row_index' is absent no 'row_to_tag' entry is
+    # added here, yet get_tfidf_reduced_similar_tags reads
+    # tf_idf_components['row_to_tag'] unconditionally — confirm the joblib file
+    # always provides one of the two keys.
+    if 'tag_to_row_index' in model_components:
+        model_components['row_to_tag'] = {idx: tag for tag, idx in model_components['tag_to_row_index'].items()}
+    return model_components
+# Load all components at the start
+tf_idf_components = load_model_components('tf_idf_files_418.joblib')
 # Load the model and data once at startup
 with h5py.File('complete_artist_data.hdf5', 'r') as f:
             nsfw_tags.add(word)
+# Read the set of valid artists into memory.
+# Artist tags in the CSV start with 'by_'; the prefix is removed before the
+# name is stored in the module-level artist_set.
+artist_set = set()
+with open("fluffyrock_3m.csv", 'r', newline='', encoding='utf-8') as csvfile:
+    reader = csv.reader(csvfile)
+    for row in reader:
+        # csv.reader yields an empty list for blank lines; skip them so that
+        # row[0] cannot raise IndexError.
+        if not row:
+            continue
+        tag_name = row[0]  # Assuming the first column contains the tag names
+        if tag_name.startswith('by_'):
+            # Strip 'by_' from the start of the tag name and add to the set
+            artist_set.add(tag_name.removeprefix('by_'))
+def is_artist(name):
+    """Return True if name (given without the 'by_' prefix) is in artist_set."""
+    return name in artist_set
 sample_images_directory_path = 'sampleimages'
 def generate_artist_image_tuples(top_artists, image_directory):
     json_files = glob.glob(f'{image_directory}/*.json')
     # Return the vector as a 2D array for compatibility with SVD transform
     return pseudo_vector.reshape(1, -1)
 def get_top_indices(reduced_pseudo_vector, reduced_matrix):
     # Compute cosine similarities
     # Return the top N indices
     return sorted_indices
 def get_tfidf_reduced_similar_tags(pseudo_doc_terms, allow_nsfw_tags):
+    # Rank known tags by cosine similarity to a pseudo-document built from
+    # pseudo_doc_terms, using the components loaded into tf_idf_components.
+    # Returns an OrderedDict of display tag name -> similarity, ordered from
+    # highest to lowest similarity. Tags found in nsfw_tags are dropped when
+    # allow_nsfw_tags is false.
+    idf = tf_idf_components['idf']
+    term_to_column_index = tf_idf_components['tag_to_column_index']
+    row_to_tag = tf_idf_components['row_to_tag']
+    reduced_matrix = tf_idf_components['reduced_matrix']
+    svd = tf_idf_components['svd_model']
+    # Construct the TF-IDF vector
+    pseudo_tfidf_vector = construct_pseudo_vector(pseudo_doc_terms, idf, term_to_column_index)
+    # Reduce the dimensionality of the pseudo-document vector for the reduced matrix
+    reduced_pseudo_vector = svd.transform(pseudo_tfidf_vector)
+    # Compute cosine similarities in the reduced space
+    cosine_similarities_reduced = cosine_similarity(reduced_pseudo_vector, reduced_matrix).flatten()
+    # Get the row indices ordered by cosine similarity. np.argsort returns them in
+    # ascending order; the descending ordering is applied by the sorted() call below.
+    top_indices_reduced = np.argsort(cosine_similarities_reduced)
+    # Map indices to tags with their similarities (indices missing from row_to_tag are skipped)
+    tag_similarity_dict = {row_to_tag[i]: cosine_similarities_reduced[i] for i in top_indices_reduced if i in row_to_tag}
     if not allow_nsfw_tags:
+        tag_similarity_dict = {tag: sim for tag, sim in tag_similarity_dict.items() if tag not in nsfw_tags}
+    # Prefix artist names with "by " so they read as artist tags
+    tag_similarity_dict = {"by " + tag if is_artist(tag) else tag: sim for tag, sim in tag_similarity_dict.items()}
+    # Sort by similarity (highest first), then transform tag names for display:
+    # underscores become spaces and parentheses are backslash-escaped
     sorted_tag_similarity_dict = OrderedDict(sorted(tag_similarity_dict.items(), key=lambda x: x[1], reverse=True))
+    transformed_sorted_tag_similarity_dict = OrderedDict(
+        (key.replace('_', ' ').replace('(', '\\(').replace(')', '\\)'), value)
+        for key, value in sorted_tag_similarity_dict.items()
+    )
+    return transformed_sorted_tag_similarity_dict
 def create_html_placeholder(title="", content="", placeholder_height=400, placeholder_width="100%"):
         # Modify the tag
         modified_tag = tag_text.replace('_', ' ').replace('\\(', '(').replace('\\)', ')').strip()
         artist_matrix_tag = tag_text.replace('_', ' ').replace('\\(', '\(').replace('\\)', '\)').strip()
+        tf_idf_matrix_tag = re.sub(r'\\([()])', r'\1', re.sub(r' ', '_', tag_text.strip().removeprefix('by ').removeprefix('by_')))
         # Calculate the end position based on the original tag length
         end_pos = start_pos + len(tag_text)
         # Append the structured data for each tag
             "end_pos": end_pos,
             "modified_tag": modified_tag,
             "artist_matrix_tag": artist_matrix_tag,
+            "tf_idf_matrix_tag": tf_idf_matrix_tag,
             "node_type": nodetype
         })
     return tag_data
         suggested_tags_html_content = "<div class=\"scrollable-content\" style='display: inline-block; margin: 20px; text-align: center;'>"
         suggested_tags_html_content += "<h1>Suggested Tags</h1>"  # Heading for the table
+        suggested_tags = get_tfidf_reduced_similar_tags([item["tf_idf_matrix_tag"] for item in tag_data], allow_nsfw_tags)
+        # Create a set of tags that should be filtered out
+        filter_tags = {entry["original_tag"].strip() for entry in tag_data}
+        # Use this set to filter suggested_tags
+        suggested_tags_filtered = OrderedDict((k, v) for k, v in suggested_tags.items() if k not in filter_tags)
         topnsuggestions = list(islice(suggested_tags_filtered.items(), 100))
         suggested_tags_html_content += create_html_tables_for_tags("Suggested Tag", topnsuggestions, find_similar_tags.tag2count, find_similar_tags.tag2idwiki)
                 #gr.Image(label=" ", value=image_path, height=155, width=140)
                 #gr.HTML('<div style="text-align: center;"><img src={image_path} alt="Cute Mascot" style="max-height: 100px; background: transparent;"></div><br>')
                 #gr.HTML("<br>" * 2)  # Adjust the number of line breaks ("<br>") as needed to push the button down
+                #image_path = os.path.join('mascotimages', "transparentsquirrel.png")
+                random_image_path = os.path.join('mascotimages', random.choice([f for f in os.listdir('mascotimages') if os.path.isfile(os.path.join('mascotimages', f))]))
+                with Image.open(random_image_path) as img:
                     gr.Image(value=img,show_label=False, show_download_button=False, show_share_button=False, height=200)
                 submit_button = gr.Button(variant="primary")
     with gr.Row():

tf_idf_files_418.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1072321ea307c7b1e9518bb02426bede8d181ce17565721094dee674a3712e8c
+size 115989585