Hamna97 committed (verified)
Commit c459c6c · 1 Parent(s): 80f1f5a

Delete Unconfirmed 788300.crdownload

Files changed (1)
  1. Unconfirmed 788300.crdownload +0 -275
Unconfirmed 788300.crdownload DELETED
@@ -1,275 +0,0 @@
# -*- coding: utf-8 -*-
"""Untitled1.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1ie2-yTR2nGSoxMTUf8xkXQxCeYInHcNt
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Set style for better visualization
plt.style.use('default')
sns.set_palette("husl")

# Load the Spotify dataset (pandas and numpy are already imported above)
data = pd.read_csv('Spotify.csv')

print(f"Dataset shape: {data.shape}")
print(f"Songs analyzed: {data.shape[0]}")
print(f"Total variables: {data.shape[1]}")
print(f"Numerical variables for analysis: {data.select_dtypes(include=[np.number]).shape[1]}")

# Explore the complexity of the decision space
print("Sample of songs and their characteristics:")
print(data.head())
print("\nDataset info:")
data.info()  # info() prints its report directly; wrapping it in print() would also print 'None'

from sklearn.preprocessing import StandardScaler

# Prepare data for analysis - select only the numerical variables
data_for_analysis = data.select_dtypes(include=[np.number])
print(f"Variables included in analysis: {list(data_for_analysis.columns)}")
print(f"Total variables: {len(data_for_analysis.columns)}")

"""1. Missing Values: No missing values were found, so no imputation is needed.

2. Irrelevant Features: We remove columns that are not numerical audio features or that act as identifiers:

   - artists, id, name, release_date: identifiers or free text, not features for clustering.
   - year: we want to cluster on sound, not on release year; including it would bias the analysis.
   - explicit: has only 14 non-zero entries, so it is not a meaningful audio feature for this historical dataset.

3. Feature Selection: We keep the core numerical audio features provided by the Spotify API, plus popularity (13 features in total):

**acousticness, danceability, duration_ms, energy, instrumentalness, key, liveness, loudness, mode, speechiness, tempo, valence, popularity**

4. Scaling: The features are on vastly different scales (e.g., tempo ~100, loudness roughly -20 to 0). We use StandardScaler to standardize them. This is crucial for distance-based methods like PCA and K-Means, ensuring no single feature dominates the calculation simply because of its scale.
"""

# Drop non-feature columns (identifiers, text, year, and the nearly-constant 'explicit' flag)
features_to_drop = ['artists', 'id', 'name', 'release_date', 'year', 'explicit']
df_clean = data.drop(columns=features_to_drop)

# Define the features to scale
audio_features = ['acousticness', 'danceability', 'duration_ms', 'energy',
                  'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
                  'speechiness', 'tempo', 'valence', 'popularity']

# Standardize the features (zero mean, unit variance)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_clean[audio_features])
df_scaled = pd.DataFrame(scaled_features, columns=audio_features)

"""We apply PCA to reduce the 13 standardized features to a smaller, more manageable set of components that capture most of the variance."""

from sklearn.decomposition import PCA

# Drop any rows with missing values (none are expected, see above)
df_scaled_cleaned = df_scaled.dropna()

# Apply PCA to the standardized data, keeping all components for now
pca = PCA()
pca_results = pca.fit_transform(df_scaled_cleaned)

# Examine explained variance
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance_ratio)

print("Explained Variance by Component:")
for i in range(min(10, len(explained_variance_ratio))):
    print(f"PC{i+1}: {explained_variance_ratio[i]:.3f} ({explained_variance_ratio[i]*100:.1f}%)")

print(f"\nFirst 3 components explain {cumulative_variance[2]*100:.1f}% of total variance")
print(f"First 5 components explain {cumulative_variance[4]*100:.1f}% of total variance")

# Visualize the explained variance
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Scree plot
ax1.plot(range(1, min(16, len(explained_variance_ratio) + 1)),
         explained_variance_ratio[:15], 'bo-', linewidth=2, markersize=8)
ax1.set_xlabel('Principal Component')
ax1.set_ylabel('Explained Variance Ratio')
ax1.set_title('Scree Plot: Variance Explained by Each Component')
ax1.grid(True, alpha=0.3)

# Cumulative variance
ax2.plot(range(1, min(16, len(cumulative_variance) + 1)),
         cumulative_variance[:15], 'ro-', linewidth=2, markersize=8)
ax2.axhline(y=0.8, color='gray', linestyle='--', alpha=0.7, label='80% Variance')
ax2.axhline(y=0.9, color='gray', linestyle='--', alpha=0.5, label='90% Variance')
ax2.set_xlabel('Number of Components')
ax2.set_ylabel('Cumulative Explained Variance')
ax2.set_title('Cumulative Variance Explained')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Examine the first five principal components in detail
components_df = pd.DataFrame(
    pca.components_[:5].T,  # loadings of the first 5 components
    columns=[f'PC{i+1}' for i in range(5)],
    index=df_scaled.columns
)

print("Principal Component Loadings (how much each variable contributes):")
print(components_df.round(3))

# Re-fit PCA keeping only the first 5 components and build the reduced feature set
pca = PCA(n_components=5)
pca_features = pca.fit_transform(df_scaled_cleaned)
df_pca = pd.DataFrame(pca_features, columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5'])

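# Quick sketch to support the interpretation in the next cell: list each component's
# strongest positive and negative loadings. It assumes components_df from the cell above;
# showing the top three features per sign is an arbitrary, illustrative choice.
for pc in components_df.columns:
    loadings = components_df[pc].sort_values()
    top_pos = loadings.tail(3)[::-1]  # largest positive loadings, descending
    top_neg = loadings.head(3)        # most negative loadings, ascending
    print(f"{pc} (+): " + ", ".join(f"{feat} ({val:.2f})" for feat, val in top_pos.items()))
    print(f"{pc} (-): " + ", ".join(f"{feat} ({val:.2f})" for feat, val in top_neg.items()))
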
"""Business Interpretation of the Top 5 Components:

PC1 (the "Orchestral & Acoustic" dimension): High positive loadings on acousticness and instrumentalness; high negative loadings on energy and loudness. This dimension separates quiet, acoustic, instrumental pieces (e.g., classical piano) from loud, energetic, electronic music.

PC2 (the "Joyful Dance" dimension): High positive loadings on danceability and valence (positivity). This dimension captures how suitable a track is for dancing and how happy it sounds.

PC3 (the "Vocal Presence" dimension): High positive loadings on speechiness and liveness; high negative loading on instrumentalness. This separates spoken word, live recordings, and rap from purely instrumental tracks.

PC4 (the "Tempo" dimension): High positive loading on tempo. This is a relatively pure measure of a song's speed.

PC5 (the "Popularity" dimension): High positive loading on popularity. This separates widely known tracks from more obscure ones.

Justification of the clustering algorithm (K-Means over hierarchical clustering):

Computational efficiency: Our dataset has 999 observations. K-Means scales roughly linearly with the number of songs, while agglomerative hierarchical clustering needs at least O(n²) time and memory to work with pairwise distances. Both are feasible at this size, but K-Means scales far better as the catalogue grows. (A quick cross-check against hierarchical clustering on the PCA scores follows this cell.)

Cluster shape expectation: After PCA the components are linearly uncorrelated, and we expect clusters in this space to be roughly spherical and of similar size, which matches the assumptions of K-Means (it minimizes Euclidean distances to centroids).

Interpretability: The goal is a clear, discrete segmentation of the songs. K-Means yields a flat set of clusters, which is usually easier for stakeholders to act on than the dendrogram produced by hierarchical clustering.

Choosing k with the elbow method: We plot the Within-Cluster Sum of Squares (WCSS, or "inertia") against the number of clusters k. The optimal k lies at the "elbow" of the curve, where the rate of decrease in inertia changes sharply.

The elbow of our curve appears at k=4 or k=5: there is a noticeable bend, and the decrease in WCSS slows down after that. We choose k=4 for a simpler, more interpretable segmentation.
"""

#| label: kmeans-elbow

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Perform elbow analysis on the scaled features (the k comparison further below uses df_pca)
cluster_range = range(1, 11)
inertias = []
silhouette_scores = []

print("📊 Finding Optimal Number of Clusters...")

for k in cluster_range:
    if k == 1:
        # With one cluster, the inertia is the total sum of squares around the global mean
        inertias.append(((df_scaled - df_scaled.mean()) ** 2).sum().sum())
        silhouette_scores.append(0)  # Silhouette is undefined for k=1
    else:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(df_scaled)
        inertias.append(kmeans.inertia_)

        # Silhouette score: how well separated the clusters are (-1 to 1, higher is better)
        silhouette_avg = silhouette_score(df_scaled, kmeans.labels_)
        silhouette_scores.append(silhouette_avg)

# Plot elbow curve and silhouette scores
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# Elbow plot
ax1.plot(cluster_range, inertias, 'o-', linewidth=2, markersize=8)
ax1.set_title('Elbow Method: Finding the Sweet Spot', fontweight='bold')
ax1.set_xlabel('Number of Clusters (k)')
ax1.set_ylabel('Within-Cluster Sum of Squares (Inertia)')
ax1.grid(True, alpha=0.3)

# Highlight potential elbow points
ax1.axvline(x=3, color='red', linestyle='--', alpha=0.7, label='k=3')
ax1.axvline(x=4, color='orange', linestyle='--', alpha=0.7, label='k=4')
ax1.axvline(x=5, color='green', linestyle='--', alpha=0.7, label='k=5')
ax1.legend()

# Silhouette plot (k=1 is skipped because silhouette is undefined there)
ax2.plot(cluster_range[1:], silhouette_scores[1:], 'o-', color='green', linewidth=2, markersize=8)
ax2.set_title('Cluster Quality (Silhouette Score)', fontweight='bold')
ax2.set_xlabel('Number of Clusters (k)')
ax2.set_ylabel('Silhouette Score')
ax2.grid(True, alpha=0.3)
ax2.axhline(y=0.5, color='red', linestyle='--', alpha=0.5, label='Good threshold')

# Find the k with the best silhouette score and mark it
optimal_k = cluster_range[np.argmax(silhouette_scores[1:]) + 1]  # +1 because we skip k=1
ax2.axvline(x=optimal_k, color='red', linestyle='-', alpha=0.8, label=f'Optimal k={optimal_k}')
ax2.legend()

plt.tight_layout()
plt.show()

print("📈 Elbow method suggests: look for changes in slope around k=3-5")
print(f"🎯 Silhouette score optimal: k={optimal_k} (score: {max(silhouette_scores):.3f})")

#| label: kmeans-business-decision

# Compare different k values through a business lens
print("💼 BUSINESS DECISION FRAMEWORK: Spotify Music Clustering")
print("=" * 50)
print("Evaluating different k values from technical and business perspectives:")
print()

# Define the range of k to evaluate
k_values = [3, 4, 5, 6]

for k in k_values:
    kmeans_temp = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels_temp = kmeans_temp.fit_predict(df_pca)  # using the PCA-reduced data (the elbow diagnostic above used the full scaled features)
    silhouette_temp = silhouette_score(df_pca, labels_temp)

    # Calculate cluster sizes and how balanced they are
    cluster_sizes_temp = pd.Series(labels_temp).value_counts().sort_index()
    size_ratio = cluster_sizes_temp.min() / cluster_sizes_temp.max()  # smallest vs. largest cluster

    print(f"k={k}:")
    print(f"  Silhouette Score: {silhouette_temp:.3f}")
    print(f"  Cluster Sizes: {list(cluster_sizes_temp.values)}")
    print(f"  Balance (Min/Max Ratio): {size_ratio:.2f}")
    print()

print("🎯 BUSINESS INTERPRETATION & TRADEOFFS:")
print("• k=3: Broad, high-level segments (e.g., 'Popular Hits', 'Serious Listening', 'Other').")
print("  PRO: Simple story, easy to action. CON: Might mask important nuances between acoustic genres.")
print("• k=4: Clear, distinct archetypes (e.g., 'Dance Hits', 'Classical', 'Spoken Word', 'Folk/Traditional').")
print("  PRO: Excellent balance of specificity and actionability. Our likely choice.")
print("• k=5: Introduces a more niche specialization.")
print("  PRO: Captures finer detail. CON: May split a logical group into overly specific segments.")
print("• k=6: Highly granular.")
print("  CON: Risk of over-segmentation; clusters may become too small to market to effectively.")
print()

# Make the business decision
final_k = 4  # The business decision is to use 4 clusters
print(f"🏆 RECOMMENDED BUSINESS DECISION: k={final_k}")
print("Reasoning: k=4 provides the best balance for our use case:")
print("1. STATISTICAL: It has a high silhouette score, indicating well-defined clusters.")
print("2. ACTIONABILITY: It yields a manageable number of distinct audience segments for our marketing and product teams.")
print("3. INTERPRETABILITY: Each cluster has a clear, intuitive 'sound profile' and real-world meaning.")
print("4. BALANCE: The clusters are reasonably balanced in size, so no segment is too niche.")

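# Minimal sketch of acting on the k=4 decision: fit the final model on the PCA scores and
# inspect the resulting segments. The names final_kmeans and df_segmented are illustrative;
# this assumes df_pca, final_k, and the imports from the cells above.
final_kmeans = KMeans(n_clusters=final_k, random_state=42, n_init=10)
df_segmented = df_pca.copy()
df_segmented['cluster'] = final_kmeans.fit_predict(df_pca)

print("Songs per cluster:")
print(df_segmented['cluster'].value_counts().sort_index())

# Mean PCA coordinates per cluster give a rough 'sound profile' for each segment
print(df_segmented.groupby('cluster').mean().round(2))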