inie2003 committed on
Commit
81bf6cd
1 Parent(s): 1a748d6

Updated the Hugging Face dataset loading function

Browse files
Files changed (1) hide show
  1. helper.py +14 -11
helper.py CHANGED
@@ -47,20 +47,23 @@ def encode_query(query: Union[str, Image.Image]) -> torch.Tensor:
47
 
48
  def load_hf_datasets(dataset_name):
49
  """
50
- Load Datasets from Hugging Face as DF
51
- ---------------------------------------
52
  dataset_name: str - name of dataset on Hugging Face
53
- ---------------------------------------
54
-
55
- RETURNS: dataset as pandas dataframe
56
  """
57
  dataset = load_dataset(f"quasara-io/{dataset_name}")
58
- # Access only the 'Main' split
59
- main_dataset = dataset['Main']
60
- # Convert to Pandas DataFrame
61
- df = main_dataset.to_pandas()
62
- return df
63
-
 
 
 
 
64
  def get_image_vectors(df):
65
  # Get the image vectors from the dataframe
66
  image_vectors = np.vstack(df['Vector'].to_numpy())
 
47
 
48
  def load_hf_datasets(dataset_name):
49
  """
50
+ Load all splits containing 'Main' from a Hugging Face dataset as a DataFrame
51
+ ---------------------------------------------------------------------------
52
  dataset_name: str - name of dataset on Hugging Face
53
+ ---------------------------------------------------------------------------
54
+ RETURNS: concatenated dataset as a pandas DataFrame
 
55
  """
56
  dataset = load_dataset(f"quasara-io/{dataset_name}")
57
+
58
+ # Filter splits that contain the word 'Main'
59
+ main_splits = [split for split in dataset if 'Main' in split]
60
+
61
+ # Load and concatenate all splits containing 'Main' into a single DataFrame
62
+ df_list = [dataset[split].to_pandas() for split in main_splits]
63
+ combined_df = pd.concat(df_list, ignore_index=True)
64
+
65
+ return combined_df
66
+
67
  def get_image_vectors(df):
68
  # Get the image vectors from the dataframe
69
  image_vectors = np.vstack(df['Vector'].to_numpy())