kaggle-top50

The top50 data is a set of music data from 👉 Kaggle.

There are 50 songs and 13 variables to be explored

The new knowledge

The data set is already quite clean and needs very little preprocessing, so this exercise is mainly about learning how to draw a variety of charts:

  • histogram
  • Histogram + polyline
  • Heat map
  • The pie chart
  • Contour map

attribute


The analysis process

Import libraries and packages

import pandas as pd

import numpy as np

import matplotlib as mpl

import matplotlib.pyplot as plt

from scipy import stats

import squarify as sq

from pandas.plotting import scatter_matrix

import seaborn as sns

import sklearn 

import warnings

warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler, LabelEncoder  # Preprocessing module

from sklearn.linear_model import LinearRegression  # Linear regression

from sklearn.model_selection import train_test_split,cross_val_score, KFold   # Data separation, cross validation, K-fold validation

from sklearn import metrics   # Metrics module (model evaluation scores)

from sklearn.metrics import confusion_matrix, classification_report  # Confusion matrix, classification report

%matplotlib inline





# Font setup intended to support Chinese characters in figure text: select the
# sans-serif family and point it at the SimHei font.
mpl.rcParams.update({
    "font.family": "sans-serif",
    "font.sans-serif": u'SimHei',
})

Copy the code

The data view

# Absolute path of the top50 CSV file on the author's machine.
filename = '/Users/piqianchao/data-visualization/top50.csv'

# Load the CSV into a DataFrame.
#   encoding  - chosen to avoid a UnicodeError while decoding the file
#   engine    - use the pure-Python parser
#   index_col - treat the first CSV column as the row index rather than as
#               an attribute of the songs
data = pd.read_csv(
    filename,
    encoding="ISO-8859-1",
    engine='python',
    index_col=0,
)

# Preview the first rows (displayed when run as a notebook cell).
data.head()

Copy the code

Rename property

# Rename the dotted column headers to cleaner names, in place.
# ``rename`` silently ignores keys that do not match an existing column, so
# the keys must equal the CSV headers exactly.
# NOTE(review): the headers are assumed to be 'Loudness..dB..' and
# 'Acousticness..' (no embedded spaces) -- confirm against data.columns.
data.rename(
    columns={
        'Track.Name': 'track_name',
        'Artist.Name': 'artist_name',
        'Beats.Per.Minute': 'beats_per_minute',
        'Loudness..dB..': 'Loudness(dB)',
        'Valence.': 'Valence',
        'Length.': 'Length',
        'Acousticness..': 'Acousticness',
        'Speechiness.': 'Speechiness',
    },
    inplace=True,
)

Copy the code

Calculating the number of songs of each genre

# Group the songs by genre and count how many songs fall into each genre.
# The result is a Series indexed by genre name.
popular_genre = data.groupby('Genre').size()
print(popular_genre)

# Genre of every song (one entry per row), converted to a plain Python list.
genre_list = data['Genre'].values.tolist()

Copy the code

Calculating the number of songs by each of the artists

# Number of songs per artist, as a Series indexed by artist name.
popular_artist = data.groupby('artist_name').size()
print(popular_artist)

# Artist of every song (one entry per row), as a plain Python list.
artist_list = data['artist_name'].tolist()

Copy the code

View statistics about attributes

# Display at most three decimal places.  The fully qualified option name is
# used because the bare key 'precision' can match more than one option on
# recent pandas versions.
pd.set_option('display.precision', 3)

# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns.
data.describe()

Copy the code

Finding out the skew for each attribute

Find the skew for each attribute

# Skewness coefficient of every numeric attribute.  numeric_only=True skips
# the text columns (track, artist, genre), which would otherwise make
# DataFrame.skew raise on pandas >= 2.0.
skew = data.skew(numeric_only=True)
print(skew)

Copy the code

# Take the Liveness values as a flat (1-D) ndarray: scipy's Box-Cox transform
# only accepts one-dimensional input, so the column is selected as a Series
# rather than as a one-column DataFrame.
transform = np.asarray(data['Liveness'].values)
print(type(transform))

# boxcox returns (transformed_values, fitted_lambda); keep only the values.
data_transform = stats.boxcox(transform)[0]

# Histogram of the raw data.
plt.hist(data['Liveness'], bins=10)
plt.title("original data")
plt.show()

# Histogram of the data after the skew-correcting transform.
plt.hist(data_transform, bins=10)
plt.title("skew corrected data")
plt.show()

Copy the code

How to overlay a density (KDE) curve on a histogram

# Flat (1-D) array of Popularity values; scipy's Box-Cox transform only
# accepts one-dimensional input.
transform1 = np.asarray(data['Popularity'].values)

# boxcox returns (transformed_values, fitted_lambda); keep only the values.
data_transform1 = stats.boxcox(transform1)[0]

# Histogram of the raw data with a kernel-density curve drawn on top.
sns.distplot(
    data['Popularity'],
    bins=10,
    kde=True,
    kde_kws={"color": "k", "lw": 2, "label": "KDE"},
    color='blue',
)
plt.title("original data")
plt.show()

# Same plot for the skew-corrected data.
sns.distplot(
    data_transform1,
    bins=10,
    kde=True,
    kde_kws={"color": "k", "lw": 2, "label": "KDE"},
    color='green',
)
plt.title("skew corrected data")
plt.show()

Copy the code

Bar graph to see the number of songs of each genre

# Bar chart of the number of songs in each genre.
fig, ax = plt.subplots(figsize=(30, 12))  # canvas size as (width, height)

# One bar position per genre group.
length = np.arange(len(popular_genre))
plt.bar(length, popular_genre, color='g', edgecolor='black', alpha=0.7)

# Label every bar with the genre it counts.  The labels come from the index
# of popular_genre so that they line up one-to-one with the bars; the
# per-song genre list has one entry per row and does not match the bars.
plt.xticks(length, popular_genre.index)

plt.title("Most popular genre", fontsize=28)
plt.xlabel("Genre", fontsize=25)
plt.ylabel("Number On Songs", fontsize=25)
plt.show()

Copy the code

Correlation coefficients

How to compute the correlation coefficients

# Allow up to 100 characters per printed line.
pd.set_option('display.width', 100)
# Display at most three decimal places (fully qualified key; the bare
# 'precision' can match more than one option on recent pandas versions).
pd.set_option('display.precision', 3)

# Correlation matrix of the numeric attributes.
# method: 'pearson'  - linear correlation,
#         'kendall'  - rank correlation for ordinal data,
#         'spearman' - rank correlation for non-linear / non-normal data.
# The text columns are dropped first because a correlation cannot be computed
# for them.
correclation = data.select_dtypes(include='number').corr(method='spearman')
print(correclation)

Copy the code

8.2 Draw a heat map from the correlation coefficients

# Annotated heat map of the correlation matrix, with the colour scale fixed
# to the [-1, 1] range.
plt.figure(figsize=(10, 10))  # (width, height)
plt.title("Correclation heatmap")
sns.heatmap(correclation, annot=True, vmin=-1, vmax=1, cmap="GnBu_r", center=1)

Copy the code

barh of most popular artists

# Horizontal bar chart of the number of songs per artist.
fig, ax = plt.subplots(figsize=(12, 12))  # (width, height)

# One bar position per artist group.
length = np.arange(len(popular_artist))
plt.barh(length, popular_artist, color='r', edgecolor='black', alpha=0.7)

# Label every bar with the artist it counts.  The labels come from the index
# of popular_artist so that they line up one-to-one with the bars; the
# per-song artist list has one entry per row and does not match the bars.
plt.yticks(length, popular_artist.index)

plt.title("Most popular artists", fontsize=18)
plt.ylabel("Artists", fontsize=18)
plt.xlabel("Number of songs", fontsize=16)
plt.show()

Copy the code

Analysing the relationship between energy and loudness

# Scatter plot of Energy against Loudness with a fitted regression line.
# plt.subplots returns a (figure, axes) pair, which is what ``fig`` holds.
fig = plt.subplots(figsize=(10, 10))
sns.regplot(x='Energy', y='Loudness(dB)', data=data, color='black')

Copy the code

Dependence between energy and popularity

# Energy against Popularity: regression line (no confidence band) with
# bivariate density contours drawn on the same axes.
fig = plt.subplots(figsize=(10, 10))
plt.title('Dependence between energy and popularity')
sns.regplot(x='Energy', y='Popularity', ci=None, data=data)

# x and y are passed by keyword: current seaborn releases no longer accept
# them as positional arguments.
sns.kdeplot(x=data.Energy, y=data.Popularity)

Copy the code

# Treemap of the genres, each rectangle sized by its number of songs.
plt.figure(figsize=(14, 8))  # (width, height)

# Sizes and labels are both taken from value_counts() so that every rectangle
# is labelled with the genre it actually represents.  unique() returns genres
# in order of first appearance, which does not match the count-sorted sizes.
sq.plot(
    sizes=data.Genre.value_counts(),
    label=data.Genre.value_counts().index,
    alpha=0.8,
)
plt.axis('off')
plt.show()

Copy the code

Pie chart

Create a pie chart based on each artist and the number of songs

# Pie chart of each artist's share of the songs.
labels = data.artist_name.value_counts().index   # one label per slice
sizes = data.artist_name.value_counts().values   # number of songs per slice

# Slice colours; matplotlib cycles through the list when there are more
# slices than colours.
colors = ['red', 'yellowgreen', 'lightcoral', 'lightskyblue',
          'cyan', 'green', 'black', 'yellow']

plt.figure(figsize=(10, 10))  # (width, height)

# autopct prints every slice's percentage with one decimal place; it has to
# be passed to plt.pie itself to have any effect.
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%')
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

Copy the code

Linear Regression

Data build and TTS

# Build training sets and test sets

# Feature matrix (five attributes) and target vector (Popularity).
x = data.loc[:, ['Energy', 'Danceability', 'Length',
                 'Loudness(dB)', 'Acousticness']].values
y = data.loc[:, 'Popularity'].values

# Hold out 30% of the songs as the test set.  No random_state is given, so
# the split (and every score derived from it) differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

# Fit an ordinary least-squares model on the training set.
reg = LinearRegression()
reg.fit(X_train, y_train)

Copy the code

To predict

# Make a prediction, a comparison between the actual value and the predicted value

# Predict the popularity of the held-out songs and print the predictions
# side by side with the actual values.
y_pred = reg.predict(X_test)

data_output = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred))
print(data_output)

Copy the code

MAE: mean absolute error; MSE: mean squared error

# Error metrics of the predictions on the test set.
print("MAE", metrics.mean_absolute_error(y_test, y_pred))
print("MSE", metrics.mean_squared_error(y_test, y_pred))
print("Root MSE:", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

# Predicted values against actual test values: star markers joined by a
# dashed line in the order of the test samples.
plt.figure(figsize=(10, 10))  # (width, height)
plt.plot(y_pred, y_test, color='black', linestyle='dashed',
         marker='*', markerfacecolor='red', markersize=10)
plt.title("Error analsis")
plt.xlabel("Predicted values")
plt.ylabel("Test values")

Copy the code

Cross validation

# Two-feature matrix and target vector.
# NOTE(review): x and y are re-bound here but are not used below; the
# cross-validation still runs on X_train / y_train from the earlier split.
# Confirm which data set was intended.
x = data.loc[:, ['Energy', 'Danceability']].values
y = data.loc[:, 'Popularity'].values

reg = LinearRegression()

# 5-fold cross-validation on the training set.  The scorer returns the
# *negated* MSE of every fold, so the values are <= 0.
mse = cross_val_score(reg, X_train, y_train,
                      scoring='neg_mean_squared_error', cv=5)
mean_mse = np.mean(mse)
print(mean_mse)

# Difference between the test-set MSE and the (sign-corrected) mean
# cross-validated MSE.
diff = metrics.mean_squared_error(y_test, y_pred) - abs(mean_mse)
print(diff)

Copy the code