Machine learning from zero to sklearn · 0.1.1 · Linear fitting
- code:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Import the sklearn pieces we need
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
class Args:
    seed = 1234
    data_file = "sample_data.csv"
    num_samples = 100
    train_size = 0.75
    test_size = 0.25
    num_epochs = 100

args = Args()
# Set random seeds to ensure reproducibility of results
np.random.seed(args.seed)
# Generate data
def generate_data(num_samples):
    """y = 3.65x + 10 plus uniform noise drawn from [-20, 20)."""
    X = np.array(range(num_samples))
    random_noise = np.random.uniform(-20, 20, size=num_samples)
    y = 3.65 * X + 10 + random_noise  # add some noise
    return X, y
# Generate random but linear data
X, y = generate_data(args.num_samples)
data = np.vstack([X, y]).T
df = pd.DataFrame(data, columns=['X', 'y'])
print(df.head())
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    df["X"].values.reshape(-1, 1), df["y"], test_size=args.test_size,
    random_state=args.seed)
print ("X_train:", X_train.shape)
print ("y_train:", y_train.shape)
print ("X_test:", X_test.shape)
print ("y_test:", y_test.shape)
# Fit the scalers on the training set only, so no test-set statistics leak in
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train.values.reshape(-1, 1))
# Apply the fitted scalers to both the training and the test set
standardized_X_train = X_scaler.transform(X_train)
standardized_y_train = y_scaler.transform(y_train.values.reshape(-1, 1)).ravel()
standardized_X_test = X_scaler.transform(X_test)
standardized_y_test = y_scaler.transform(y_test.values.reshape(-1, 1)).ravel()
print ("mean:", np.mean(standardized_X_train, axis=0),
np.mean(standardized_y_train, axis=0)) # mean should be ~0
print ("std:", np.std(standardized_X_train, axis=0),
np.std(standardized_y_train, axis=0)) # std should be 1
# The model: plain SGD on squared loss with no regularization
# (newer scikit-learn releases spell this loss "squared_error")
lm = SGDRegressor(loss="squared_loss", penalty="none", max_iter=args.num_epochs)
print(lm)
lm.fit(X=standardized_X_train, y=standardized_y_train)
# Map predictions back to the original scale: y = z * std_y + mean_y
pred_train = (lm.predict(standardized_X_train) * np.sqrt(y_scaler.var_)) + y_scaler.mean_
pred_test = (lm.predict(standardized_X_test) * np.sqrt(y_scaler.var_)) + y_scaler.mean_
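# Equivalently, assuming the same fitted y_scaler, the scaler can invert
# its own transform (a sketch, not a behavior change):
# pred_test = y_scaler.inverse_transform(
#     lm.predict(standardized_X_test).reshape(-1, 1)).ravel()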
# Resize the figure
plt.figure(figsize=(16, 9))
# Draw the training data
plt.subplot(1, 2, 1)
plt.title("Train")
plt.scatter(X_train, y_train, label="y_train")
plt.plot(X_train, pred_train, color="red", linewidth=1, linestyle="-", label="lm")
plt.legend(loc='lower right')
# Draw the test data
plt.subplot(1, 2, 2)
plt.title("Test")
plt.scatter(X_test, y_test, label="y_test")
plt.plot(X_test, pred_test, color="red", linewidth=1, linestyle="-", label="lm")
plt.legend(loc='lower right')
plt.show()
- bash out:
     X          y
0  0.0  18.534351
1  1.0   2.339222
2  2.0  14.809110
3  3.0  32.364343
4  4.0  35.799032
X_train: (75, 1)
y_train: (75,)
X_test: (25, 1)
y_test: (25,)
mean: [8.22952817e-17] 3.1798637796972193e-16
std: [1.] 1.0000000000000002
SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=100,
       n_iter=None, n_iter_no_change=5, penalty='none', power_t=0.25,
       random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)
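Because the model was trained on standardized data, its coef_ and intercept_ are in standardized units. A minimal sketch (assuming the fitted lm, X_scaler, and y_scaler above are still in scope) that maps them back to the original scale for comparison with the generating line y = 3.65x + 10:
- code:
# Undo the standardization algebraically:
# y = sigma_y * (w * (x - mu_x) / sigma_x + b) + mu_y
w_std, b_std = lm.coef_[0], lm.intercept_[0]
slope = w_std * y_scaler.scale_[0] / X_scaler.scale_[0]
intercept = y_scaler.mean_[0] + y_scaler.scale_[0] * b_std - slope * X_scaler.mean_[0]
print("recovered line: y = %.2fx + %.2f" % (slope, intercept))
# expect values near the true slope 3.65 and intercept 10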
- image out: two side-by-side panels, "Train" and "Test", each showing the data as a scatter with the fitted line lm in red.
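The plots give a visual check; a numeric one is just as quick. A short sketch (again assuming the variables above are still in scope) that scores the fit with scikit-learn's mean_squared_error:
- code:
from sklearn.metrics import mean_squared_error

# pred_train/pred_test are already back on the original scale,
# so they can be compared to the raw labels directly
train_mse = mean_squared_error(y_train, pred_train)
test_mse = mean_squared_error(y_test, pred_test)
print("train MSE: %.2f, test MSE: %.2f" % (train_mse, test_mse))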