MorenoLaQuatra committed
Commit 18d579a (2 parents: f0cd753, b8d9e31)

Merge branch 'main' of https://huggingface.co/spaces/ALM/CALM into main

Files changed (2):
  1. app.py +19 -13
  2. requirements.txt +1 -0
app.py CHANGED

```diff
@@ -188,7 +188,7 @@ class CLIPDemo:
     def compute_image_embeddings(self, image_paths: list):
         self.image_paths = image_paths
         dataloader = DataLoader(VisionDataset(
-            image_paths=image_paths), batch_size=self.batch_size, num_workers=8)
+            image_paths=image_paths), batch_size=self.batch_size)
         embeddings = []
         with torch.no_grad():
 
@@ -249,17 +249,19 @@ class CLIPDemo:
 def draw_text(
     key,
     plot=False,
+    device=None,
     ):
 
+
     image = Image.open("data/logo.png")
     st.image(image, use_column_width="always")
 
     if 'model' not in st.session_state:
         #with st.spinner('We are orginizing your traks...'):
         text_encoder = AutoModel.from_pretrained(CLIP_TEXT_MODEL_PATH, local_files_only=True)
-        vision_encoder = CLIPVisionModel.from_pretrained(CLIP_VISION_MODEL_PATH, local_files_only=True)
+        vision_encoder = CLIPVisionModel.from_pretrained(CLIP_VISION_MODEL_PATH, local_files_only=True).to(device)
         tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL)
-        model = CLIPDemo(vision_encoder=vision_encoder, text_encoder=text_encoder, tokenizer=tokenizer)
+        model = CLIPDemo(vision_encoder=vision_encoder, text_encoder=text_encoder, tokenizer=tokenizer, device=device)
         model.compute_image_embeddings(glob.glob(SPECTROGRAMS_PATH + "/*.jpeg")[:1000])
         st.session_state["model"] = model
 
@@ -302,18 +304,19 @@ def draw_text(
 def draw_audio(
     key,
     plot=False,
+    device=None,
     ):
 
     image = Image.open("data/logo.png")
     st.image(image, use_column_width="always")
-
+
     if 'model' not in st.session_state:
         #with st.spinner('We are orginizing your traks...'):
         text_encoder = AutoModel.from_pretrained(CLIP_TEXT_MODEL_PATH, local_files_only=True)
-        vision_encoder = CLIPVisionModel.from_pretrained(CLIP_VISION_MODEL_PATH, local_files_only=True)
+        vision_encoder = CLIPVisionModel.from_pretrained(CLIP_VISION_MODEL_PATH, local_files_only=True).to(device)
         tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL)
-        model = CLIPDemo(vision_encoder=vision_encoder, text_encoder=text_encoder, tokenizer=tokenizer)
-        model.compute_image_embeddings(glob.glob(SPECTROGRAMS_PATH+"/*.jpeg")[:5000])
+        model = CLIPDemo(vision_encoder=vision_encoder, text_encoder=text_encoder, tokenizer=tokenizer, device=device)
+        model.compute_image_embeddings(glob.glob(SPECTROGRAMS_PATH+"/*.jpeg")[:1000])
         st.session_state["model"] = model
         #st.session_state['model'] = CLIPDemo(vision_encoder=vision_encoder, text_encoder=text_encoder, tokenizer=tokenizer)
         #st.session_state.model.compute_image_embeddings(glob.glob("/data1/mlaquatra/TSOAI_hack/data/spectrograms/*.jpeg")[:100])
@@ -369,6 +372,7 @@ def draw_audio(
 def draw_camera(
     key,
     plot=False,
+    device=None,
     ):
 
     image = Image.open("data/logo.png")
@@ -377,10 +381,10 @@ def draw_camera(
     if 'model' not in st.session_state:
         #with st.spinner('We are orginizing your traks...'):
         text_encoder = AutoModel.from_pretrained(CLIP_TEXT_MODEL_PATH, local_files_only=True)
-        vision_encoder = CLIPVisionModel.from_pretrained(CLIP_VISION_MODEL_PATH, local_files_only=True)
+        vision_encoder = CLIPVisionModel.from_pretrained(CLIP_VISION_MODEL_PATH, local_files_only=True).to(device)
         tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL)
-        model = CLIPDemo(vision_encoder=vision_encoder, text_encoder=text_encoder, tokenizer=tokenizer)
-        model.compute_image_embeddings(glob.glob(SPECTROGRAMS_PATH + "/*.jpeg")[:5000])
+        model = CLIPDemo(vision_encoder=vision_encoder, text_encoder=text_encoder, tokenizer=tokenizer, device=device)
+        model.compute_image_embeddings(glob.glob(SPECTROGRAMS_PATH + "/*.jpeg")[:1000])
         st.session_state["model"] = model
         #st.session_state['model'] = CLIPDemo(vision_encoder=vision_encoder, text_encoder=text_encoder, tokenizer=tokenizer)
         #st.session_state.model.compute_image_embeddings(glob.glob("/data1/mlaquatra/TSOAI_hack/data/spectrograms/*.jpeg")[:100])
@@ -427,15 +431,17 @@ def draw_camera(
 selected = streamlit_menu(example=3)
 df = pd.read_csv('full_metadata.csv', index_col=False)
 
+device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+
 if selected == "Text":
     # st.title(f"You have selected {selected}")
-    draw_text("text", plot=True)
+    draw_text("text", plot=True, device=device)
 if selected == "Audio":
     # st.title(f"You have selected {selected}")
-    draw_audio("audio", plot=True)
+    draw_audio("audio", plot=True, device=device)
 if selected == "Camera":
     # st.title(f"You have selected {selected}")
-    #draw_camera("camera", plot=True)
+    #draw_camera("camera", plot=True, device=device)
     pass
 
 # with st.sidebar:
```
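Taken together, the app.py changes make the demo device-aware: the main script picks CUDA when a GPU is available, each `draw_*` page accepts a `device` argument, the vision encoder is moved onto that device, and the device is forwarded to `CLIPDemo`. The snippet below is a minimal, self-contained sketch of that pattern, not the app's exact code; the public `openai/clip-vit-base-patch32` checkpoint is only a stand-in for the app's own `CLIP_VISION_MODEL_PATH`.

```python
# Minimal sketch of the device-selection pattern introduced in this commit.
# Assumption: "openai/clip-vit-base-patch32" is used here only as a placeholder
# for the app's fine-tuned checkpoint at CLIP_VISION_MODEL_PATH.
import torch
from transformers import CLIPVisionModel

# Prefer the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

vision_encoder = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
vision_encoder.eval()  # the demo only runs inference, inside torch.no_grad()
```

The same hunks also drop `num_workers=8` from the embedding DataLoader and shrink the pre-computed set from the first 5000 spectrograms to the first 1000, presumably to keep startup time and memory within the Space's limits; caching the resulting `CLIPDemo` in `st.session_state` then avoids reloading the encoders on every Streamlit rerun.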
requirements.txt CHANGED

```diff
@@ -7,6 +7,7 @@ bokeh
 streamlit_bokeh_events
 streamlit-webcam-example
 torch
+torchvision
 numpy
 pandas
 tqdm
```
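requirements.txt now lists torchvision alongside torch. The diff does not show where it is used, but a plausible place is the image preprocessing inside the `VisionDataset` that `compute_image_embeddings` feeds to its DataLoader. The sketch below is a hypothetical reconstruction of such a dataset, assuming torchvision transforms are used to resize and tensorize the spectrogram JPEGs; the real implementation in app.py is not part of this diff and may differ.

```python
# Hypothetical sketch only: a torchvision-based VisionDataset of the kind
# compute_image_embeddings() appears to rely on. The actual class in app.py
# is not shown in this commit and may be implemented differently.
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms

class VisionDataset(Dataset):
    def __init__(self, image_paths):
        self.image_paths = image_paths
        self.preprocess = transforms.Compose([
            transforms.Resize((224, 224)),  # assumed CLIP ViT input size
            transforms.ToTensor(),
        ])

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx]).convert("RGB")
        return self.preprocess(image)
```

If the dataset does import torchvision like this, installing torch without torchvision would make the app fail at import time, which is consistent with adding it as an explicit requirement.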