Update the Hugging Face dataset loading function
Browse files
helper.py
CHANGED
@@ -47,20 +47,23 @@ def encode_query(query: Union[str, Image.Image]) -> torch.Tensor:
|
|
47 |
|
48 |
def load_hf_datasets(dataset_name):
|
49 |
"""
|
50 |
-
Load
|
51 |
-
|
52 |
dataset_name: str - name of dataset on Hugging Face
|
53 |
-
|
54 |
-
|
55 |
-
RETURNS: dataset as pandas dataframe
|
56 |
"""
|
57 |
dataset = load_dataset(f"quasara-io/{dataset_name}")
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
|
|
|
|
|
|
|
|
64 |
def get_image_vectors(df):
|
65 |
# Get the image vectors from the dataframe
|
66 |
image_vectors = np.vstack(df['Vector'].to_numpy())
|
|
|
47 |
|
48 |
def load_hf_datasets(dataset_name: str) -> pd.DataFrame:
    """
    Load all splits containing 'Main' from a Hugging Face dataset as a DataFrame
    ---------------------------------------------------------------------------
    dataset_name: str - name of dataset on Hugging Face (looked up under the
                        'quasara-io/' namespace)
    ---------------------------------------------------------------------------
    RETURNS: concatenated dataset as a pandas DataFrame
    RAISES: ValueError if no split name of the dataset contains 'Main'
    """
    dataset = load_dataset(f"quasara-io/{dataset_name}")

    # Keep only the splits whose name contains the word 'Main' (case-sensitive)
    main_splits = [split for split in dataset if 'Main' in split]
    if not main_splits:
        # pd.concat([]) would fail with an opaque "No objects to concatenate";
        # raise the same exception type with an actionable message instead.
        raise ValueError(
            f"Dataset 'quasara-io/{dataset_name}' has no split containing "
            f"'Main'; available splits: {list(dataset)}"
        )

    # Convert each matching split to a DataFrame and stack them row-wise,
    # renumbering the index so it is contiguous across splits
    df_list = [dataset[split].to_pandas() for split in main_splits]
    combined_df = pd.concat(df_list, ignore_index=True)

    return combined_df
|
66 |
+
|
67 |
def get_image_vectors(df):
|
68 |
# Get the image vectors from the dataframe
|
69 |
image_vectors = np.vstack(df['Vector'].to_numpy())
|