Spaces:

datasets-topics
/

topics-generator

Sleeping

App Files Files Community

asoria HF staff committed on Oct 3, 2024

Commit

9b9b3ce

1 Parent(s): 457b45f

Trying to reduce GPU load?

Browse files

Files changed (1) hide show

app.py +26 -8

app.py CHANGED Viewed

@@ -54,14 +54,18 @@ model = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True,
     quantization_config=bnb_config,
     device_map="auto",
 )
 generator = pipeline(
     model=model,
     tokenizer=tokenizer,
     task="text-generation",
     temperature=0.1,
-    max_new_tokens=500,
     repetition_penalty=1.1,
 )
@@ -71,19 +75,31 @@ representation_model = {
     "Llama2": llama2,
 }
 umap_model = UMAP(
-    n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine", random_state=42
 )
 hdbscan_model = HDBSCAN(
-    min_cluster_size=15,
     metric="euclidean",
     cluster_selection_method="eom",
     prediction_data=True,
 )
 reduce_umap_model = UMAP(
-    n_neighbors=15, n_components=2, min_dist=0.0, metric="cosine", random_state=42
 )
@@ -107,8 +123,9 @@ def get_docs_from_parquet(parquet_urls, column, offset, limit):
 # @spaces.GPU
 def calculate_embeddings(docs):
-    return sentence_model.encode(docs, show_progress_bar=True, batch_size=100)
 # @spaces.GPU
@@ -124,7 +141,7 @@ def fit_model(base_model, docs, embeddings):
         # Hyperparameters
         top_n_words=10,
         verbose=True,
-        min_topic_size=15,
     )
     logging.debug("Fitting new model")
     new_model.fit(docs, embeddings)
@@ -185,13 +202,14 @@ def generate_topics(dataset, config, split, column, nested_column):
         # )
         topic_plot = base_model.visualize_barchart()
-        logging.info(f"Topics: {llama2_labels}")
         yield topics_info, topic_plot
         offset += chunk_size
     logging.info("Finished processing all data")
     return topics_info, topic_plot
@@ -229,7 +247,7 @@ with gr.Blocks() as demo:
                 label="Nested text column name", visible=False
             )
-        generate_button = gr.Button("Generate Notebook", variant="primary")
     gr.Markdown("## Datamap")
     topics_plot = gr.Plot()

     trust_remote_code=True,
     quantization_config=bnb_config,
     device_map="auto",
+    offload_folder="offload",  # Offloading part of the model to CPU to save GPU memory
 )
+# Enable gradient checkpointing for memory efficiency during backprop
+model.gradient_checkpointing_enable()
 generator = pipeline(
     model=model,
     tokenizer=tokenizer,
     task="text-generation",
     temperature=0.1,
+    max_new_tokens=200,  # Reduced max_new_tokens to limit memory consumption
     repetition_penalty=1.1,
 )
     "Llama2": llama2,
 }
+# TODO: It should be proportional to the number of rows
+# For small datasets (1-200 rows) it worked fine with 2 neighbors
+N_NEIGHBORS = 15
 umap_model = UMAP(
+    n_neighbors=N_NEIGHBORS,
+    n_components=5,
+    min_dist=0.0,
+    metric="cosine",
+    random_state=42,
 )
 hdbscan_model = HDBSCAN(
+    min_cluster_size=N_NEIGHBORS,
     metric="euclidean",
     cluster_selection_method="eom",
     prediction_data=True,
 )
 reduce_umap_model = UMAP(
+    n_neighbors=N_NEIGHBORS,
+    n_components=2,
+    min_dist=0.0,
+    metric="cosine",
+    random_state=42,
 )
 # @spaces.GPU
+# TODO: Tune the batch size to reduce memory consumption during embedding calculation; which value works best?
 def calculate_embeddings(docs):
+    return sentence_model.encode(docs, show_progress_bar=True, batch_size=32)
 # @spaces.GPU
         # Hyperparameters
         top_n_words=10,
         verbose=True,
+        min_topic_size=15,  # TODO: Should this value be consistent with N_NEIGHBORS?
     )
     logging.debug("Fitting new model")
     new_model.fit(docs, embeddings)
         # )
         topic_plot = base_model.visualize_barchart()
+        logging.info(f"Topics: {repr_model_topics}")
         yield topics_info, topic_plot
         offset += chunk_size
     logging.info("Finished processing all data")
+    cuda.empty_cache()  # Clear cache once, after all chunks have been processed (not per chunk)
     return topics_info, topic_plot
                 label="Nested text column name", visible=False
             )
+        generate_button = gr.Button("Generate Topics", variant="primary")
     gr.Markdown("## Datamap")
     topics_plot = gr.Plot()