# Tutorial: Using the FMAHiphop Dataset
#
# Installation Instructions:
# Before running this script, make sure you have the required libraries installed.
# You can install them using pip:
#
#     pip install datasets pandas matplotlib
#
# If you're using a Jupyter notebook, you can run this in a cell:
#
#     !pip install datasets pandas matplotlib
#
# The script is intentionally kept flat (no main() wrapper): `ds`, `df` and
# `first_example` stay available as module-level names, which is what the
# notebook snippet at the bottom of this file relies on.

import sys

# First, let's import the necessary libraries.
# Each third-party import is guarded on its own so a missing package produces
# a targeted installation hint instead of a raw traceback.
# sys.exit() is used instead of the bare exit() helper: exit() is injected by
# the `site` module for interactive sessions and is not guaranteed to exist in
# every runtime (e.g. `python -S` or frozen applications).
try:
    from datasets import load_dataset
except ImportError:
    print(
        "The 'datasets' library is not installed. Please install it using: pip install datasets",
        file=sys.stderr,
    )
    sys.exit(1)

try:
    import pandas as pd
except ImportError:
    print(
        "The 'pandas' library is not installed. Please install it using: pip install pandas",
        file=sys.stderr,
    )
    sys.exit(1)

try:
    import matplotlib.pyplot as plt
except ImportError:
    print(
        "The 'matplotlib' library is not installed. Please install it using: pip install matplotlib",
        file=sys.stderr,
    )
    sys.exit(1)

# Load the dataset.
# The broad `except Exception` is deliberate: this is the script's top-level
# boundary, and any failure here is reported to the user before exiting with
# a non-zero status.
try:
    ds = load_dataset("Nkumar5/FMAHiphop")
except Exception as e:
    print(f"Error loading the dataset: {e}", file=sys.stderr)
    print(
        "Please check your internet connection and ensure the dataset name is correct.",
        file=sys.stderr,
    )
    sys.exit(1)

# Let's explore the dataset structure
print("Dataset structure:")
print(ds)

# Look at the features of the training set
print("\nFeatures in the training set:")
print(ds['train'].features)

# Get the first example from the training set
first_example = ds['train'][0]
print("\nFirst example:")
print(first_example)

# Convert the dataset to a pandas DataFrame for easier manipulation.
# NOTE(review): `Dataset.to_pandas()` may be a more direct conversion --
# confirm the resulting columns match before switching.
df = pd.DataFrame(ds['train'])

# Basic statistics of the dataset
print("\nDataset statistics:")
print(df.describe())

# If there's an 'artist' column, let's see the most common artists
if 'artist' in df.columns:
    print("\nTop 10 artists by track count:")
    print(df['artist'].value_counts().head(10))
else:
    print("\nNo 'artist' column found in the dataset.")

# If there's a 'tempo' column, let's visualize the distribution of tempos
if 'tempo' in df.columns:
    plt.figure(figsize=(10, 6))
    df['tempo'].hist(bins=30)
    plt.title('Distribution of Tempos in FMAHiphop Dataset')
    plt.xlabel('Tempo (BPM)')
    plt.ylabel('Count')
    plt.show()
else:
    print("\nNo 'tempo' column found in the dataset.")

# Example of how to access audio data (if available).
# NOTE(review): assumes the 'audio' entry is a mapping with an 'array' that
# exposes `.shape` and a 'sampling_rate' value -- confirm against the
# dataset's features printed above.
if 'audio' in first_example:
    print("\nAudio data shape:", first_example['audio']['array'].shape)
    print("Audio sampling rate:", first_example['audio']['sampling_rate'])
else:
    print("\nNo 'audio' data found in the dataset examples.")

# Note: To play audio in a Jupyter notebook, you can use:
# from IPython.display import Audio
# Audio(first_example['audio']['array'], rate=first_example['audio']['sampling_rate'])

print("\nThis tutorial provides a basic exploration of the FMAHiphop dataset.")
print("You can expand on this to perform more advanced analyses or machine learning tasks.")