Spaces:
Sleeping
Sleeping
Try to fix cuml error
Browse files
app.py
CHANGED
@@ -23,13 +23,21 @@ from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
|
23 |
from bertopic import BERTopic
|
24 |
from bertopic.representation import KeyBERTInspired
|
25 |
from bertopic.representation import TextGeneration
|
26 |
-
from cuml.manifold import UMAP
|
27 |
-
from cuml.cluster import HDBSCAN
|
28 |
from huggingface_hub import HfApi
|
29 |
from sklearn.feature_extraction.text import CountVectorizer
|
30 |
from sentence_transformers import SentenceTransformer
|
31 |
from prompts import REPRESENTATION_PROMPT
|
32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
"""
|
34 |
TODOs:
|
35 |
- Improve representation layer (Try with llamacpp or TextGeneration)
|
@@ -127,11 +135,6 @@ def get_docs_from_parquet(parquet_urls, column, offset, limit):
|
|
127 |
return df[column].tolist()
|
128 |
|
129 |
|
130 |
-
@spaces.GPU
|
131 |
-
def calculate_embeddings(docs):
|
132 |
-
return sentence_model.encode(docs, show_progress_bar=True, batch_size=32)
|
133 |
-
|
134 |
-
|
135 |
def calculate_n_neighbors_and_components(n_rows):
|
136 |
n_neighbors = min(max(n_rows // 20, 15), 100)
|
137 |
n_components = 10 if n_rows > 1000 else 5 # Higher components for larger datasets
|
|
|
23 |
from bertopic import BERTopic
|
24 |
from bertopic.representation import KeyBERTInspired
|
25 |
from bertopic.representation import TextGeneration
|
|
|
|
|
26 |
from huggingface_hub import HfApi
|
27 |
from sklearn.feature_extraction.text import CountVectorizer
|
28 |
from sentence_transformers import SentenceTransformer
|
29 |
from prompts import REPRESENTATION_PROMPT
|
30 |
|
31 |
+
|
32 |
+
@spaces.GPU
|
33 |
+
def calculate_embeddings(docs):
|
34 |
+
return sentence_model.encode(docs, show_progress_bar=True, batch_size=32)
|
35 |
+
|
36 |
+
|
37 |
+
from cuml.manifold import UMAP
|
38 |
+
from cuml.cluster import HDBSCAN
|
39 |
+
|
40 |
+
|
41 |
"""
|
42 |
TODOs:
|
43 |
- Improve representation layer (Try with llamacpp or TextGeneration)
|
|
|
135 |
return df[column].tolist()
|
136 |
|
137 |
|
|
|
|
|
|
|
|
|
|
|
138 |
def calculate_n_neighbors_and_components(n_rows):
|
139 |
n_neighbors = min(max(n_rows // 20, 15), 100)
|
140 |
n_components = 10 if n_rows > 1000 else 5 # Higher components for larger datasets
|