“This is the 28th day of my participation in the Gwen Challenge in November. Check out the details: The Last Gwen Challenge in 2021.”

EDA on the dataset: juejin.cn/post/703488…

The analysis and mining below focuses on forecasting the purchase amount (Purchase) for each user–product record.

Dataset split setup

The training and test sets are split at a ratio of 7:3, and the resulting distributions are visualized.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split # Data segmentation
import warnings  # ignore warnings
warnings.filterwarnings('ignore')
plt.style.use('ggplot')
# Set displayable Chinese font for drawing
plt.rcParams['font.sans-serif'] = 'Microsoft YaHei'

df = pd.read_csv("product_data.csv")
X = df.drop(["Purchase"],axis=1)
y = df.Purchase
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)
# Training set and test set sizes
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)
# Check the consistency of the split by plotting the distribution of Purchase with histograms
colors = ['salmon', 'lightskyblue', '#FFF68F', 'palegreen', 'lightpink', 'silver', 'burlywood', 'plum', 'rosybrown']
plt.figure(figsize=(12, 5))

ax1 = plt.subplot(1, 3, 1)
ax1.hist(y_train, color=colors[0])
ax1.set_title("Training set distribution")

ax2 = plt.subplot(1, 3, 2)
ax2.hist(y_test, color=colors[1])
ax2.set_title("Test set distribution")

ax3 = plt.subplot(1, 3, 3)
ax3.hist(y_train, color=colors[0])
ax3.hist(y_test, color=colors[1])
ax3.set_title("Training set vs. test set distribution")
plt.show()

df_train = pd.concat([X_train,y_train],axis=1)
df_test = pd.concat([X_test,y_test],axis=1)
df_train.to_csv("./data/train.csv",index=False)
df_test.to_csv("./data/test.csv",index=False)

Output: (376303, 11) (161274, 11) (376303,) (161274,)

Data cleaning

# Import the base packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as ppf
import warnings  # ignore warnings
warnings.filterwarnings('ignore')
# Set displayable Chinese font for drawing
plt.rcParams['font.sans-serif'] = 'Microsoft YaHei'
plt.style.use('ggplot')
# Import package to do feature engineering
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.preprocessing import LabelEncoder  # label encoding
from sklearn.preprocessing import RobustScaler, StandardScaler  # outlier-robust scaling and standardization
from sklearn.pipeline import Pipeline, make_pipeline  # build pipelines
from scipy.stats import skew  # skewness
from sklearn import impute

# Concatenate the training and test tables
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
full = pd.concat([train,test],ignore_index=True)
full.head()

Data cleaning covers three tasks: 1. filling missing values, 2. correcting erroneous values, 3. resolving inconsistent values.
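Before filling anything in, it is worth confirming which columns actually contain missing values. A minimal sketch, assuming the concatenated full DataFrame built above:

# Count missing values per column and their share of all rows
missing = full.isnull().sum()
missing_ratio = missing / len(full)
print(pd.concat([missing, missing_ratio], axis=1, keys=["missing", "ratio"]).sort_values("missing", ascending=False))

For this dataset, only Product_Category_2 and Product_Category_3 are expected to show non-zero counts.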

Missing value handling

Start by dropping the ID columns and the Purchase column.

full.drop(["User_ID"."Product_ID"."Purchase"],axis=1,inplace=True)
full.Product_Category_2.value_counts()

Product_Category_2 and Product_Category_3 are categorical variables, so their missing values are first filled with the mode.

# full[col].mode()[0] gives the most frequent value of the column
cols1 = ["Product_Category_2", "Product_Category_3"]
for col in cols1:
    full[col].fillna(full[col].mode()[0],inplace=True)

String variables are encoded as numeric variables

String variables such as gender, age, city level, and years in current city are converted to category codes. Gender: 'F': 0, 'M': 1. Age: '0-17': 0, '18-25': 1, '26-35': 2, '36-45': 3, '46-50': 4, '51-55': 5, '55+': 6. City level: 'A': 0, 'B': 1, 'C': 2. Years in current city: '0': 0, '1': 1, '2': 2, '3': 3, '4+': 4.

# gender
full.Gender = full.Gender.map({'F': 0, 'M': 1}).astype(int)
# age
full.Age = full.Age.map({'0-17': 0, '18-25': 1, '26-35': 2, '36-45': 3,
                         '46-50': 4, '51-55': 5, '55+': 6}).astype(int)
# City level
full.City_Category = full.City_Category.map({'A': 0, 'B': 1, 'C': 2}).astype(int)
# Years in current city
full.Stay_In_Current_City_Years = full.Stay_In_Current_City_Years.map(
    {'0': 0, '1': 1, '2': 2, '3': 3, '4+': 4}).astype(int)

Converting data types

Convert the encoded columns to integer type (int64).

cols1 = ["Gender"."Age"."Occupation"."City_Category"."Stay_In_Current_City_Years"."Marital_Status"."Product_Category_1",]
for col in cols1:
    full[col] = full[col].astype(np.int64)

Split and standardize

Standardization

n_train = train.shape[0]  # number of rows in the training set
X = full[:n_train]        # processed training set
test_X = full[n_train:]   # rows after n_train form the test set

y = train.Purchase
X_scaled = StandardScaler().fit(X).transform(X)  # fit and transform
y_log = np.log(train.Purchase)  # log transform: the target becomes closer to a normal distribution

# Get the test set
test_X_scaled = StandardScaler().fit_transform(test_X)
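The comment about the log transform can be checked numerically: skewness near 0 indicates a roughly symmetric distribution. A minimal sketch using the skew function imported above:

# Compare the skewness of the raw and log-transformed target
print("skew(Purchase):     %.3f" % skew(train.Purchase))
print("skew(log Purchase): %.3f" % skew(np.log(train.Purchase)))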

Feature selection

Lasso regression is used for feature selection.

from sklearn.linear_model import Lasso  # use the trained model's coefficients as feature importances (wrapper-style feature selection)
lasso=Lasso(alpha=0.001)
lasso.fit(X_scaled,y_log)

FI_lasso = pd.DataFrame({"Feature Importance":lasso.coef_}, index=full.columns)# Index and importance in dataframe format

FI_lasso.sort_values("Feature Importance",ascending=False)# Sort from highest to lowest

FI_lasso[FI_lasso["Feature Importance"] != 0].sort_values("Feature Importance").plot(kind="barh", color="salmon", figsize=(10, 8))
plt.title("Feature importance from feature selection")
plt.xticks(rotation=90)
plt.show()  # display the plot

The feature importances do not differ much, so all features are kept.

Feature construction

Because the initial predictions were poor, the data were re-analyzed and additional features were constructed.

# Average purchase amount per product
avg_purchase_per_product = pd.DataFrame(full.groupby(['Product_ID'])['Purchase'].mean())
avg_purchase_per_product.reset_index(inplace=True)
# Average purchase amount per user
avg_purchase_per_user = pd.DataFrame(full.groupby(['User_ID'])['Purchase'].mean())
avg_purchase_per_user.reset_index(inplace=True)
# Number of times each product was purchased
product_count = pd.DataFrame(full['Product_ID'].value_counts())
product_count.reset_index(inplace=True)
product_count = product_count.rename(columns={'index': 'Product_ID', 'Product_ID': 'Product_count'})
# Add the three new features
full['avg_purchase_per_product'] = full['Product_ID'].map(avg_purchase_per_product.set_index('Product_ID')['Purchase'])
full['product_count'] = full['Product_ID'].map(product_count.set_index('Product_ID')['Product_count'])
full['avg_purchase_per_user'] = full['User_ID'].map(avg_purchase_per_user.set_index('User_ID')['Purchase'])
# Count how many category columns are filled in (treat "missing" categories as information)
conditions = [
    (full['Product_Category_1'] != 0) & (full['Product_Category_2'] == 0) & (full['Product_Category_3'] == 0),
    (full['Product_Category_1'] != 0) & (full['Product_Category_2'] != 0) & (full['Product_Category_3'] == 0),
    (full['Product_Category_1'] != 0) & (full['Product_Category_2'] != 0) & (full['Product_Category_3'] != 0)]
choices = [1, 2, 3]
# Add the item category count
full['Category_Count'] = np.select(conditions, choices, default=0)

With only the preprocessing above, the models for predicting the user purchase amount performed poorly, whether as single models or as multi-model fusions. Since it was hard to improve the algorithms themselves, we turned to feature engineering at the data level. The original dataset has few feature dimensions and their importance is weak, so the strategy is to create new features with high correlation to the target. Four feature dimensions are added in total; they are constructed as follows (a small illustrative sketch follows the list):

1. Category_Count: number of category fields filled in per record

Both the heatmap from the visual EDA and the coefficients from feature selection show that the product category columns correlate strongly with the target, which means they are worth mining further. So far we have treated Product_Category_2 and Product_Category_3 as missing values, but it is just as likely that the consumer simply did not buy anything from those categories, so the number of filled-in categories per record is used as a counting feature.

2. Product_count: number of times each Product_ID was purchased

3. avg_purchase_per_user: average purchase amount for each User_ID

4. avg_purchase_per_product: average purchase amount for each Product_ID

We want to mine more of the hidden information about the products, and Product_ID was previously an unused feature. The dataset has more than 500,000 rows but only a few thousand users and products, so we count how many times each Product_ID appears and compute the average purchase amount per User_ID and per Product_ID.
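To make the construction concrete, here is a minimal sketch of the same groupby/map pattern on a tiny made-up DataFrame; the column names mirror the real data, but the values are purely illustrative:

# Toy data: three users buying two products (illustrative values only)
toy = pd.DataFrame({
    "User_ID":    [1, 1, 2, 3, 3],
    "Product_ID": [10, 20, 10, 10, 20],
    "Purchase":   [100, 300, 200, 150, 250],
})
# Average purchase amount per product and per user
toy["avg_purchase_per_product"] = toy["Product_ID"].map(toy.groupby("Product_ID")["Purchase"].mean())
toy["avg_purchase_per_user"] = toy["User_ID"].map(toy.groupby("User_ID")["Purchase"].mean())
# Number of times each product was bought
toy["product_count"] = toy["Product_ID"].map(toy["Product_ID"].value_counts())
print(toy)

The full pipeline above does the same thing, only routed through intermediate DataFrames with reset_index before mapping back.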

Sales forecasting – a regression problem

# Import the base packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings## Ignore warnings
warnings.filterwarnings('ignore')
# Set displayable Chinese font for drawing
plt.rcParams['font.sans-serif'] = 'Microsoft YaHei'
plt.style.use('ggplot')

# Data preprocessing function
def Data_Clearing(full):
    # Strip the string prefix from the product ID
    full['Product_ID'] = full['Product_ID'].str.slice(2).astype(int)
    # Fill the missing values with zero
    cols1 = ["Product_Category_2", "Product_Category_3"]
    for col in cols1:
        full[col].fillna(0, inplace=True)
    # gender
    full.Gender = full.Gender.map({'F': 0, 'M': 1}).astype(int)
    # age
    full.Age = full.Age.map({'0-17': 17, '18-25': 25, '26-35': 35, '36-45': 45,
                             '46-50': 50, '51-55': 55, '55+': 60}).astype(int)
    # City level
    full.City_Category = full.City_Category.map({'A': 0, 'B': 1, 'C': 2}).astype(int)
    # Years in current city
    full.Stay_In_Current_City_Years = full.Stay_In_Current_City_Years.map(
        {'0': 0, '1': 1, '2': 2, '3': 3, '4+': 4}).astype(int)
    # Feature generation
    
    # Average purchase amount per product
    avg_purchase_per_product = pd.DataFrame(full.groupby(['Product_ID'])['Purchase'].mean())
    avg_purchase_per_product.reset_index(inplace=True)
    # Average purchase amount per user
    avg_purchase_per_user = pd.DataFrame(full.groupby(['User_ID'])['Purchase'].mean())
    avg_purchase_per_user.reset_index(inplace=True)
    # Number of times each product was purchased
    product_count = pd.DataFrame(full['Product_ID'].value_counts())
    product_count.reset_index(inplace=True)
    product_count = product_count.rename(columns={'index': 'Product_ID', 'Product_ID': 'Product_count'})
    # Add the three new features
    full['avg_purchase_per_product'] = full['Product_ID'].map(avg_purchase_per_product.set_index('Product_ID')['Purchase'])
    full['product_count'] = full['Product_ID'].map(product_count.set_index('Product_ID')['Product_count'])
    full['avg_purchase_per_user'] = full['User_ID'].map(avg_purchase_per_user.set_index('User_ID')['Purchase'])
    # Count how many category columns are filled in (treat "missing" categories as information)
    conditions = [
        (full['Product_Category_1'] != 0) & (full['Product_Category_2'] == 0) & (full['Product_Category_3'] == 0),
        (full['Product_Category_1'] != 0) & (full['Product_Category_2'] != 0) & (full['Product_Category_3'] == 0),
        (full['Product_Category_1'] != 0) & (full['Product_Category_2'] != 0) & (full['Product_Category_3'] != 0)]
    choices = [1, 2, 3]
    # Add the item category count
    full['Category_Count'] = np.select(conditions, choices, default=0)

    return full
# Read the dataset
train = pd.read_csv("./product_data.csv")
# Data cleaning and processing
data = Data_Clearing(train)

Model selection

from sklearn.model_selection import cross_val_score, GridSearchCV, KFold,train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor
import time

# Evaluation metrics: RMSE and R2. Smaller RMSE is better; R2 closer to 1 is better.
from sklearn.metrics import mean_squared_error as MSE ,r2_score as R2

# Evaluation function
def print_metrics(predict):
    mse = MSE(y_true=y_test, y_pred=predict)
    rmse = np.sqrt(MSE(y_true=y_test, y_pred=predict))
    r2 = R2(y_true=y_test, y_pred=predict)
    print("MSE:", mse)
    print("RMSE:", rmse)
    print("R2:", r2)
    
# Divide training set and test set
X=train.drop('Purchase',axis=1)
y=train['Purchase']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42,shuffle=True)

models = [LinearRegression(),
          Ridge(),
          Lasso(alpha=0.01,max_iter=10000),
          RandomForestRegressor(),
          GradientBoostingRegressor(),
          ElasticNet(),
          SGDRegressor(),
          BayesianRidge(),
          ExtraTreesRegressor(),
          XGBRegressor()]
# define the storage list
pred_df = pd.DataFrame()
pred_df["predict"] = y_test
rmses = []
r2s = []

all_start = time.time()
# ======================== timing starts here
names = ["LR", "Ridge", "Lasso", "RF", "GBR", "Ela", "SGD", "Bay", "Extra", "XGB"]
for name, model in zip(names, models):
    pre_start = time.time()
    # ============================================
    print("%s" % name)
    # Train the model on the training split
    now_model = model.fit(X_train, y_train)
    model_predict = now_model.predict(X_test)
    # Get the RMSE and R2 scores
    rmse = np.sqrt(MSE(y_true=y_test, y_pred=model_predict))
    r2 = R2(y_true=y_test, y_pred=model_predict)
    # Save RMSE, R2 and the model's predictions
    pred_df["predict_" + name] = model_predict
    rmses.append(rmse)
    r2s.append(r2)
    # Print the results
    print_metrics(model_predict)
    elapsed = time.time() - pre_start
    # ============================================
    print("Time used:", elapsed)
    print("-" * 20)
# ======================== total time
print("-" * 20)
elapsed = time.time() - all_start
print("Time used:", elapsed)

The RMSE and R2 of the SGD model are so extreme that they distort the chart, so we cap its values while still showing that SGD performs poorly.

site = rmses.index(max(rmses))
rmses[site] = 4999
r2s[site] = -2.0

# Add canvas, set canvas size
plt.figure(figsize=(10, 8), dpi=600)

plt.barh(range(len(rmses)),rmses,height=0.7, color='salmon', alpha=0.8) # Draw from the bottom up
plt.yticks(range(len(rmses)),names)
plt.xlim(0, 5500)

plt.xlabel(u"RMSE")
plt.ylabel(u'Models')
plt.title(u"RMSE")

# Annotate each bar with its RMSE value
for x_, y_ in enumerate(rmses):
    plt.text(y_, x_ - 0.1, '%s' % y_)

plt.show()

Model fusion

from sklearn import ensemble,linear_model
from mlxtend.regressor import StackingCVRegressor,StackingRegressor
# Stacking strategy for model blending

# the first layer
clf1_1 = ensemble.RandomForestRegressor()
clf1_2 = ensemble.ExtraTreesRegressor()
# the second layer
clf2 = XGBRegressor()

np.random.seed(42)
stack = StackingRegressor(regressors=[clf1_1,clf1_2],meta_regressor=clf2)

start = time.time()
# ======================== Calculate the time
print("%s" % "stack")
names.append("stack")

now_model = stack.fit(X_train, y_train)
model_predict = now_model.predict(X_test)

rmse = np.sqrt(MSE(y_true=y_test,y_pred=model_predict))
r2 = R2(y_true=y_test,y_pred=model_predict)

pred_df["predict_stack"] = model_predict
rmses.append(rmse)
r2s.append(r2)

print_metrics(model_predict)
print("-"*20)
# ======================== Calculate the time
elapsed = time.time() - start
print("Time used:",elapsed)
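StackingCVRegressor was imported above but never used; it fits the meta-model on out-of-fold predictions of the base learners, which usually leaks less than fitting a plain stack on the same rows. A minimal sketch with the same base and meta learners (cv=5 is an assumed fold count, not a tuned value):

# Out-of-fold stacking: base models are cross-validated, the meta-model only sees held-out predictions
stack_cv = StackingCVRegressor(regressors=[clf1_1, clf1_2], meta_regressor=clf2, cv=5)
stack_cv.fit(X_train.values, y_train.values)  # mlxtend is happiest with plain numpy arrays
print_metrics(stack_cv.predict(X_test.values))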

# Add canvas, set canvas size
plt.figure(figsize=(12, 8), dpi=600)

plt.barh(range(11), rmses, height=0.7, color='salmon', alpha=0.8) # Draw from the bottom up
plt.yticks(range(11), names)
plt.xlim(0, 5500)

plt.xlabel(u"RMSE")
plt.ylabel(u'Models')
plt.title(u"RMSE")

# Annotate each bar with its RMSE value
for x_, y_ in enumerate(rmses):
    plt.text(y_ + 0.025, x_ - 0.1, '%s' % y_)

plt.show()

# Add canvas, set canvas size
plt.figure(figsize=(12, 8), dpi=600)

plt.barh(range(11), r2s, height=0.7, color='salmon', alpha=0.8) # Draw from the bottom up
plt.yticks(range(11), names)
plt.xlim(-2.5, 2.5)

plt.xlabel(u"R2")
plt.ylabel(u'Models')
plt.title(u"R2")

# Annotate each bar with its R2 value
for x_, y_ in enumerate(r2s):
    plt.text(y_ + 0.025, x_ - 0.1, '%s' % y_)

plt.show()

As the charts show, the purchase amounts predicted by the fused model are fairly close to the actual values.
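One quick way to see how close they are is to overlay the distribution of the stacked model's predictions on the actual purchase amounts; a minimal sketch using the pred_df table collected above:

# Overlay actual vs. predicted purchase amounts for the stacked model
plt.figure(figsize=(10, 6))
plt.hist(pred_df["predict"], bins=50, alpha=0.6, color="salmon", label="actual")
plt.hist(pred_df["predict_stack"], bins=50, alpha=0.6, color="lightskyblue", label="stacked model")
plt.xlabel("Purchase")
plt.legend()
plt.show()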