Delete Unconfirmed 788300.crdownload
Unconfirmed 788300.crdownload · DELETED · +0 −275
@@ -1,275 +0,0 @@
# -*- coding: utf-8 -*-
"""Untitled1.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1ie2-yTR2nGSoxMTUf8xkXQxCeYInHcNt
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Set style for better visualization
plt.style.use('default')
sns.set_palette("husl")

# Load the dataset of Spotify music (pandas and numpy are already imported above)
data = pd.read_csv('Spotify.csv')

print(f"Dataset shape: {data.shape}")
print(f"Songs analyzed: {data.shape[0]}")
print(f"Total variables: {data.shape[1]}")
print(f"Numerical variables for analysis: {data.select_dtypes(include=[np.number]).shape[1]}")

# Explore the complexity of the decision space
print("Sample of songs and their characteristics:")
print(data.head())
print("\nDataset info:")
data.info()  # info() prints directly; wrapping it in print() would emit a stray 'None'

from sklearn.preprocessing import StandardScaler  # MinMaxScaler is not needed here

# Prepare data for analysis - select only numerical variables
data_for_analysis = data.select_dtypes(include=[np.number])
print(f"Variables included in analysis: {list(data_for_analysis.columns)}")
print(f"Total variables: {len(data_for_analysis.columns)}")
"""1. Missing Values: No missing values were found. No action needed.
|
| 49 |
-
|
| 50 |
-
2. Irrelevant Features: We remove columns that are not numerical audio features or are identifiers.
|
| 51 |
-
|
| 52 |
-
artists, id, name, release_date: These are identifiers or text, not features for clustering.
|
| 53 |
-
|
| 54 |
-
year: We want to cluster based on sound, not the year it was released. Including it would bias the analysis.
|
| 55 |
-
|
| 56 |
-
explicit: Has only 14 non-zero entries. It is not a meaningful audio feature for this historical dataset.
|
| 57 |
-
|
| 58 |
-
3. Feature Selection: We keep the core numerical audio features provided by the Spotify API:
|
| 59 |
-
|
| 60 |
-
**acousticness, danceability, duration_ms, energy, instrumentalness, key, liveness, loudness, mode, speechiness, tempo, valence**
|
| 61 |
-
|
| 62 |
-
Scaling: The features are on vastly different scales (e.g., tempo ~100, loudness ~ -20 to 0). We use StandardScaler to standardize them. This is crucial for distance-based algorithms like PCA and K-Means, ensuring no single feature dominates the calculation due to its scale.
|
| 63 |
-
"""

# Drop non-feature columns (StandardScaler was already imported above)
features_to_drop = ['artists', 'id', 'name', 'release_date', 'year', 'explicit']
df_clean = data.drop(columns=features_to_drop)

# Define the features to scale
audio_features = ['acousticness', 'danceability', 'duration_ms', 'energy',
                  'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
                  'speechiness', 'tempo', 'valence', 'popularity']

# Scale the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_clean[audio_features])
df_scaled = pd.DataFrame(scaled_features, columns=audio_features)
"""We apply PCA to reduce the 13 dimensions into a smaller, more manageable set of components that capture most of the variance."""
|
| 82 |
-
|
| 83 |
-
from sklearn.decomposition import PCA
|
| 84 |
-
import numpy as np
|
| 85 |
-
|
| 86 |
-
# Drop rows with any missing values
|
| 87 |
-
df_scaled_cleaned = df_scaled.dropna()
|
| 88 |
-
|
| 89 |
-
# Apply PCA to our standardized data
|
| 90 |
-
pca = PCA()
|
| 91 |
-
pca_results = pca.fit_transform(df_scaled_cleaned)

# Examine explained variance
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance_ratio)

print("Explained Variance by Component:")
for i in range(min(10, len(explained_variance_ratio))):
    print(f"PC{i+1}: {explained_variance_ratio[i]:.3f} ({explained_variance_ratio[i]*100:.1f}%)")

print(f"\nFirst 3 components explain {cumulative_variance[2]*100:.1f}% of total variance")
print(f"First 5 components explain {cumulative_variance[4]*100:.1f}% of total variance")

# Visualize the explained variance
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Scree plot
ax1.plot(range(1, min(16, len(explained_variance_ratio)+1)),
         explained_variance_ratio[:15], 'bo-', linewidth=2, markersize=8)
ax1.set_xlabel('Principal Component')
ax1.set_ylabel('Explained Variance Ratio')
ax1.set_title('Scree Plot: Variance Explained by Each Component')
ax1.grid(True, alpha=0.3)

# Cumulative variance
ax2.plot(range(1, min(16, len(cumulative_variance)+1)),
         cumulative_variance[:15], 'ro-', linewidth=2, markersize=8)
ax2.axhline(y=0.8, color='gray', linestyle='--', alpha=0.7, label='80% Variance')
ax2.axhline(y=0.9, color='gray', linestyle='--', alpha=0.5, label='90% Variance')
ax2.set_xlabel('Number of Components')
ax2.set_ylabel('Cumulative Explained Variance')
ax2.set_title('Cumulative Variance Explained')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Examine the first few principal components in detail
components_df = pd.DataFrame(
    pca.components_[:5].T,  # First 5 components
    columns=[f'PC{i+1}' for i in range(5)],
    index=df_scaled.columns
)

print("Principal Component Loadings (how much each variable contributes):")
print(components_df.round(3))

# Apply PCA with 5 components (enough to capture the bulk of the variance,
# per the cumulative variance plot above)
pca = PCA(n_components=5)
pca_features = pca.fit_transform(df_scaled_cleaned)
df_pca = pd.DataFrame(pca_features, columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5'])
"""Business Interpretation of Top 5 Components:
|
| 147 |
-
|
| 148 |
-
PC1 (The "Orchestral & Acoustic" Dimension): High positive loading on acousticness and instrumentalness. High negative loading on energy and loudness. This dimension separates quiet, acoustic, instrumental pieces (e.g., classical piano) from loud, energetic, electronic music.
|
| 149 |
-
|
| 150 |
-
PC2 (The "Joyful Dance" Dimension): High positive loading on danceability and valence (positivity). This dimension captures how suitable a track is for dancing and its happy mood.
|
| 151 |
-
|
| 152 |
-
PC3 (The "Vocal Presence" Dimension): High positive loading on speechiness and liveness. High negative loading on instrumentalness. This separates spoken-word, live recordings, and rap from purely instrumental tracks.
|
| 153 |
-
|
| 154 |
-
PC4 (The "Tempo" Dimension): High positive loading on tempo. This is a relatively pure measure of the song's speed.
|
| 155 |
-
|
| 156 |
-
PC5 (The "Popularity" Dimension): High positive loading on popularity. This separates widely known tracks from more obscure ones.
|
| 157 |
-
|
| 158 |
-
Justification:
|
| 159 |
-
We choose K-Means clustering over Hierarchical clustering for the following reasons:
|
| 160 |
-
|
| 161 |
-
Computational Efficiency: Our dataset has 999 observations. K-Means, with a time complexity roughly linear in the number of objects (O(n)), is significantly more efficient than Hierarchical clustering, which has a quadratic complexity (O(n²)). This makes K-Means the standard choice for larger datasets.
|
| 162 |
-
|
| 163 |
-
Cluster Shape Expectation: After applying PCA, our data is transformed into components that are linearly uncorrelated. We expect the clusters in this transformed space to be roughly spherical and of similar size, which aligns perfectly with the assumptions of the K-Means algorithm (it uses Euclidean distance).
|
| 164 |
-
|
| 165 |
-
Interpretability: The goal is to create a clear, discrete segmentation of the songs. K-Means provides a flat set of clusters, which is often easier for stakeholders to understand and act upon compared to the complex dendrogram produced by Hierarchical clustering.
|
| 166 |
-
|
| 167 |
-
This method plots the Within-Cluster Sum of Squares (WCSS) or "inertia" against the number of clusters k. The optimal k is at the "elbow" of the curve, where the rate of decrease in inertia sharply changes.
|
| 168 |
-
|
| 169 |
-
The "elbow" of the graph appears at k=4 or k=5. There's a noticeable bend, and the rate of decrease in SSE slows down after that. We choose k=4 for a simpler, more interpretable segmentation.
|
| 170 |
-
"""

#| label: kmeans-elbow

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Perform elbow analysis
cluster_range = range(1, 11)
inertias = []
silhouette_scores = []

print("📊 Finding Optimal Number of Clusters...")

for k in cluster_range:
    if k == 1:
        # Total variance serves as the k=1 inertia baseline
        inertias.append(np.sum(df_scaled.var() * len(df_scaled)))
        silhouette_scores.append(0)  # Can't calculate silhouette for k=1
    else:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(df_scaled)
        inertias.append(kmeans.inertia_)

        # Calculate silhouette score
        silhouette_avg = silhouette_score(df_scaled, kmeans.labels_)
        silhouette_scores.append(silhouette_avg)

# Plot elbow curve and silhouette scores
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# Elbow plot
ax1.plot(cluster_range, inertias, 'o-', linewidth=2, markersize=8)
ax1.set_title('Elbow Method: Finding the Sweet Spot', fontweight='bold')
ax1.set_xlabel('Number of Clusters (k)')
ax1.set_ylabel('Within-Cluster Sum of Squares (Inertia)')
ax1.grid(True, alpha=0.3)

# Highlight potential elbow points
ax1.axvline(x=3, color='red', linestyle='--', alpha=0.7, label='k=3')
ax1.axvline(x=4, color='orange', linestyle='--', alpha=0.7, label='k=4')
ax1.axvline(x=5, color='green', linestyle='--', alpha=0.7, label='k=5')
ax1.legend()

# Silhouette plot
ax2.plot(cluster_range[1:], silhouette_scores[1:], 'o-', color='green', linewidth=2, markersize=8)
ax2.set_title('Cluster Quality (Silhouette Score)', fontweight='bold')
ax2.set_xlabel('Number of Clusters (k)')
ax2.set_ylabel('Silhouette Score')
ax2.grid(True, alpha=0.3)
ax2.axhline(y=0.5, color='red', linestyle='--', alpha=0.5, label='Good threshold')
ax2.legend()

# Find optimal k by silhouette score
optimal_k = cluster_range[np.argmax(silhouette_scores[1:]) + 1]  # +1 because we skip k=1
ax2.axvline(x=optimal_k, color='red', linestyle='-', alpha=0.8, label=f'Optimal k={optimal_k}')
ax2.legend()

plt.tight_layout()
plt.show()

print("📈 Elbow method suggests: Look for changes in slope around k=3-5")
print(f"🎯 Silhouette score optimal: k={optimal_k} (score: {max(silhouette_scores):.3f})")

#| label: kmeans-business-decision

# Compare different k values with a business lens
print("💼 BUSINESS DECISION FRAMEWORK: Spotify Music Clustering")
print("=" * 50)
print("Evaluating different k values from technical and business perspectives:")
print()

# Define the range of k to evaluate
k_values = [3, 4, 5, 6]

for k in k_values:
    kmeans_temp = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels_temp = kmeans_temp.fit_predict(df_pca)  # Using the PCA-reduced data for consistency
    silhouette_temp = silhouette_score(df_pca, labels_temp)

    # Calculate cluster sizes and their balance
    cluster_sizes_temp = pd.Series(labels_temp).value_counts().sort_index()
    size_ratio = cluster_sizes_temp.min() / cluster_sizes_temp.max()  # Ratio of smallest to largest cluster

    print(f"k={k}:")
    print(f"  Silhouette Score: {silhouette_temp:.3f}")
    print(f"  Cluster Sizes: {list(cluster_sizes_temp.values)}")
    print(f"  Balance (Min/Max Ratio): {size_ratio:.2f}")
    print()

print("🎯 BUSINESS INTERPRETATION & TRADEOFFS:")
print("• k=3: Broad, high-level segments (e.g., 'Popular Hits', 'Serious Listening', 'Other').")
print("  PRO: Simple story, easy to action. CON: Might mask important nuances between acoustic genres.")
print("• k=4: Clear, distinct archetypes (e.g., 'Dance Hits', 'Classical', 'Spoken Word', 'Folk/Traditional').")
print("  PRO: Excellent balance of specificity and actionability. Our likely choice.")
print("• k=5: Introduces a more niche specialization.")
print("  PRO: Captures finer details. CON: May split a logical group into overly specific segments.")
print("• k=6: Highly granular.")
print("  CON: Risk of over-segmentation; clusters may become too small and hard to market to.")
print()

# Make the business decision
final_k = 4  # The business decision is to use 4 clusters
print(f"🏆 RECOMMENDED BUSINESS DECISION: k={final_k}")
print("Reasoning: k=4 provides the optimal balance for our use case:")
print("1. STATISTICAL: It has a high silhouette score, indicating well-defined clusters.")
print("2. ACTIONABILITY: It yields a manageable number of distinct audience segments for our marketing and product teams.")
print("3. INTERPRETABILITY: Each cluster has a clear, intuitive 'sound profile' and real-world meaning.")
print("4. BALANCE: The clusters are reasonably balanced in size, preventing one segment from being too niche.")