Dragunflie-420
commited on
Create datasets
Browse files
datasets
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Tutorial: Using the FMAHiphop Dataset
|
2 |
+
|
3 |
+
# Installation Instructions:
|
4 |
+
# Before running this script, make sure you have the required libraries installed.
|
5 |
+
# You can install them using pip:
|
6 |
+
#
|
7 |
+
# pip install datasets pandas matplotlib
|
8 |
+
#
|
9 |
+
# If you're using a Jupyter notebook, you can run this in a cell:
|
10 |
+
# !pip install datasets pandas matplotlib
|
11 |
+
|
12 |
+
# First, let's import the necessary libraries
|
13 |
+
try:
    from datasets import load_dataset
except ImportError:
    # Tell the user how to install the missing dependency, then stop with a
    # non-zero status. SystemExit is raised directly because the exit()
    # builtin is added by the site module for interactive use and is not
    # guaranteed to be available in every interpreter (e.g. `python -S`).
    print("The 'datasets' library is not installed. Please install it using: pip install datasets")
    raise SystemExit(1)
|
18 |
+
|
19 |
+
try:
    import pandas as pd
except ImportError:
    # Tell the user how to install the missing dependency, then stop with a
    # non-zero status. SystemExit is raised directly because the exit()
    # builtin is added by the site module for interactive use and is not
    # guaranteed to be available in every interpreter (e.g. `python -S`).
    print("The 'pandas' library is not installed. Please install it using: pip install pandas")
    raise SystemExit(1)
|
24 |
+
|
25 |
+
try:
    import matplotlib.pyplot as plt
except ImportError:
    # Tell the user how to install the missing dependency, then stop with a
    # non-zero status. SystemExit is raised directly because the exit()
    # builtin is added by the site module for interactive use and is not
    # guaranteed to be available in every interpreter (e.g. `python -S`).
    print("The 'matplotlib' library is not installed. Please install it using: pip install matplotlib")
    raise SystemExit(1)
|
30 |
+
|
31 |
+
# Load the dataset.
# Any failure here is reported to the user and ends the script, since every
# later step needs `ds`. The broad `except Exception` is deliberate: this is
# the script's top-level boundary and the error is printed, not swallowed.
try:
    ds = load_dataset("Nkumar5/FMAHiphop")
except Exception as e:
    print(f"Error loading the dataset: {e}")
    print("Please check your internet connection and ensure the dataset name is correct.")
    # Raise SystemExit rather than calling exit(): the exit() builtin comes
    # from the site module and is intended for interactive sessions only.
    raise SystemExit(1)
|
38 |
+
|
39 |
+
# Print the loaded dataset object to get an overview of what was returned.
print("Dataset structure:")
print(ds)

# Show the feature definitions attached to the 'train' split.
train_split = ds['train']
print("\nFeatures in the training set:")
print(train_split.features)

# Keep the first training record around; the audio section further down
# reads it again through `first_example`.
first_example = train_split[0]
print("\nFirst example:")
print(first_example)
|
51 |
+
|
52 |
+
# Convert the training split to a pandas DataFrame for easier manipulation.
# Dataset.to_pandas() is the conversion the datasets library provides;
# pd.DataFrame(ds['train']) would instead build the frame by iterating the
# split one row at a time.
# NOTE(review): complex columns (e.g. audio) may be represented differently
# by to_pandas() than by row iteration -- confirm if you read them from `df`.
df = ds['train'].to_pandas()

# Basic statistics of the dataset
print("\nDataset statistics:")
print(df.describe())
|
58 |
+
|
59 |
+
# Report the ten most frequent values of the 'artist' column, or say so
# when the DataFrame has no such column.
if 'artist' not in df.columns:
    print("\nNo 'artist' column found in the dataset.")
else:
    print("\nTop 10 artists by track count:")
    artist_counts = df['artist'].value_counts()
    print(artist_counts.head(10))
|
65 |
+
|
66 |
+
# Plot a 30-bin histogram of the 'tempo' column, or say so when the
# DataFrame has no such column.
if 'tempo' not in df.columns:
    print("\nNo 'tempo' column found in the dataset.")
else:
    plt.figure(figsize=(10, 6))
    tempo_values = df['tempo']
    tempo_values.hist(bins=30)
    plt.title('Distribution of Tempos in FMAHiphop Dataset')
    plt.xlabel('Tempo (BPM)')
    plt.ylabel('Count')
    plt.show()
|
76 |
+
|
77 |
+
# Show the array shape and sampling rate of the first example's audio
# entry, or say so when the example has no 'audio' key.
if 'audio' not in first_example:
    print("\nNo 'audio' data found in the dataset examples.")
else:
    audio_entry = first_example['audio']
    print("\nAudio data shape:", audio_entry['array'].shape)
    print("Audio sampling rate:", audio_entry['sampling_rate'])
|
83 |
+
|
84 |
+
# Note: To play audio in a Jupyter notebook, you can use:
|
85 |
+
# from IPython.display import Audio
|
86 |
+
# Audio(first_example['audio']['array'], rate=first_example['audio']['sampling_rate'])
|
87 |
+
|
88 |
+
# Closing remarks, printed one per line.
closing_lines = (
    "\nThis tutorial provides a basic exploration of the FMAHiphop dataset.",
    "You can expand on this to perform more advanced analyses or machine learning tasks.",
)
for closing_line in closing_lines:
    print(closing_line)
|