Chapter 2: Supervised Learning

2021-07-09 10:41 | Category: Python machine learning

# Decision boundaries of the k-nearest-neighbors model for different values of n_neighbors
import matplotlib.pyplot as plt
import mglearn
from sklearn.neighbors import KNeighborsClassifier

x,y = mglearn.datasets.make_forge()

fig,axes = plt.subplots(1,3)
for n_neighbors,ax in zip([1,3,9],axes):
    # the boundaries are drawn using the full forge dataset
    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(x,y)
    mglearn.plots.plot_2d_separator(clf,x,fill=True,eps=0.5,ax=ax,alpha=.4)
    mglearn.discrete_scatter(x[:,0],x[:,1],y,ax=ax)
    ax.set_title("{} neighbor(s)".format(n_neighbors))
    ax.set_xlabel("feature 0")
    ax.set_ylabel("feature 1")
axes[0].legend(loc=3)
plt.show()




# Training vs. test set accuracy as a function of n_neighbors
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

cancer = load_breast_cancer()
x_train,x_test,y_train,y_test = train_test_split(
    cancer.data,cancer.target,stratify = cancer.target, random_state=66)
training_accuracy = []
test_accuracy = []
# try n_neighbors from 1 to 10
neighbors_settings = range(1,11)
for n_neighbors in neighbors_settings:
    # build the model
    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(x_train,y_train)
    # record training set accuracy
    training_accuracy.append(clf.score(x_train,y_train))
    # record generalization (test set) accuracy
    test_accuracy.append(clf.score(x_test,y_test))
plt.plot(neighbors_settings, training_accuracy, label = "training accuracy")
plt.plot(neighbors_settings, test_accuracy,label = "test accuracy")
plt.ylabel("Accuracy")
plt.xlabel("n_neighbors")
plt.legend()
plt.show()


# k-nearest neighbors regression
import matplotlib.pyplot as plt
import mglearn

mglearn.plots.plot_knn_regression(n_neighbors=3)  # regression with 3 neighbors
mglearn.plots.plot_knn_regression(n_neighbors=1)  # regression with a single neighbor
plt.show()

# k-NN regression is implemented in scikit-learn's KNeighborsRegressor
from sklearn.neighbors import KNeighborsRegressor
import mglearn
from sklearn.model_selection import train_test_split

x,y = mglearn.datasets.make_wave(n_samples = 40)
# split the wave dataset into a training set and a test set
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)
# instantiate the model and set the number of neighbors to 3
reg = KNeighborsRegressor(n_neighbors=3)
# fit the model using the training data and training targets
reg.fit(x_train,y_train)
print("Test set predictions:\n{}".format(reg.predict(x_test)))
print("\n")
print("Test set R^2:{:.2f}".format(reg.score(x_test,y_test)))

Output:
Test set predictions:
[-0.05396539  0.35686046  1.13671923 -1.89415682 -1.13881398 -1.63113382
  0.35686046  0.91241374 -0.44680446 -1.13881398]

Test set R^2:0.83




# Analyzing KNeighborsRegressor
from sklearn.neighbors import KNeighborsRegressor
import mglearn
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

x,y = mglearn.datasets.make_wave(n_samples = 40)
# split the wave dataset into a training set and a test set
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)

fig,axes = plt.subplots(1,3)

# create 1,000 data points, evenly spaced between -3 and 3
line = np.linspace(-3,3,1000).reshape(-1,1)

for n_neighbors,ax in zip([1,3,9], axes):
    # make predictions using 1, 3, and 9 neighbors
    reg = KNeighborsRegressor(n_neighbors = n_neighbors)
    reg.fit(x_train,y_train)

    ax.plot(line,reg.predict(line))
    ax.plot(x_train,y_train,"^",c=mglearn.cm2(0),markersize=8)
    ax.plot(x_test,y_test,"v" ,c=mglearn.cm2(1),markersize=8)

    ax.set_title(
        "{} neighbor(s)\n train score:{:.2f} test score:{:.2f}".format(
            n_neighbors,
            reg.score(x_train,y_train),
            reg.score(x_test,y_test)))
    ax.set_xlabel("Feature")
    ax.set_ylabel("Target")

# the legend only needs to be drawn once, so add it outside the loop
axes[0].legend(["Model predictions","Training data/target",
               "Test data/target"],loc="best")
plt.show()

# Linear models for regression
import mglearn
import matplotlib.pyplot as plt
mglearn.plots.plot_linear_regression_wave()
plt.show()
Output: w[0]: 0.393906  b: -0.031804

# Linear regression (ordinary least squares)
# underfits on this low-dimensional dataset
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import mglearn
x,y = mglearn.datasets.make_wave(n_samples=60)
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=41)
lr = LinearRegression().fit(x_train,y_train)
print("slope","lr.coef_:{}".format(lr.coef_))
print("intercept","lr.intercept_:{}".format(lr.intercept_))
# check training and test set performance
print("Training set score:{:.2f}".format(lr.score(x_train,y_train)))
print("Test set score:{:.2f}".format(lr.score(x_test,y_test)))

Output:
slope lr.coef_:[0.4548843]
intercept lr.intercept_:-0.06621850388598466
Training set score:0.64
Test set score:0.74



# Linear regression on the more complex extended Boston Housing dataset
# overfits: large gap between training and test performance
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import mglearn

x,y = mglearn.datasets.load_extended_boston()
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)
lr = LinearRegression().fit(x_train,y_train)
print("Training set score:{:.2f}".format(lr.score(x_train,y_train)))
print("Test set score:{:.2f}".format(lr.score(x_test,y_test)))
Output:
Training set score:0.95
Test set score:0.61



# Ridge regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
import mglearn
x,y = mglearn.datasets.load_extended_boston()
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)
ridge = Ridge().fit(x_train,y_train)
print("Training set score:{:.2f}".format(ridge.score(x_train,y_train)))
print("Test set score:{:.2f}".format(ridge.score(x_test,y_test)))




# Ridge regression: the effect of alpha
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import mglearn
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge

x,y = mglearn.datasets.load_extended_boston()
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)

lr = LinearRegression().fit(x_train,y_train)

ridge = Ridge().fit(x_train,y_train)
print("Training set score:{:.2f}".format(ridge.score(x_train,y_train)))
print("Test set score:{:.2f}".format(ridge.score(x_test,y_test)))

ridge10 = Ridge(alpha = 10).fit(x_train,y_train)  # the default is alpha=1.0
print("="*10)
print("Training set score:{:.2f}".format(ridge10.score(x_train,y_train)))
print("Test set score:{:.2f}".format(ridge10.score(x_test,y_test)))

ridge01 = Ridge(alpha = 0.1).fit(x_train,y_train)
print("="*10)
print("Training set score:{:.2f}".format(ridge01.score(x_train,y_train)))
print("Test set score:{:.2f}".format(ridge01.score(x_test,y_test)))

plt.plot(ridge.coef_,"s",label = "Ridge alpha=1")
plt.plot(ridge10.coef_,"s",label = "Ridge alpha=10")
plt.plot(ridge01.coef_,"s",label = "Ridge alpha=0.1")

plt.plot(lr.coef_,"o",label="LinearRegression")

plt.xlabel("Coefficient index")
plt.ylabel("Coefficient magnitude")

plt.hlines(0,0,len(lr.coef_))
plt.ylim(-25,25)

plt.legend()
plt.show()


Plot the learning curve:
import mglearn
import matplotlib.pyplot as plt
mglearn.plots.plot_ridge_n_samples()
plt.show()



# Lasso regression
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
import mglearn
import matplotlib.pyplot as plt
import numpy as np

x,y = mglearn.datasets.load_extended_boston()
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)

lasso = Lasso().fit(x_train, y_train)
print("Training set score: {:.2f}".format(lasso.score(x_train, y_train)))
print("Test set score: {:.2f}".format(lasso.score(x_test, y_test)))
print("Number of features used:", np.sum(lasso.coef_ != 0))

Output:
Training set score: 0.29
Test set score: 0.21
Number of features used: 4

lasso001 = Lasso(alpha=0.01, max_iter=100000).fit(x_train, y_train)
print("Training set score: {:.2f}".format(lasso001.score(x_train, y_train)))
print("Test set score: {:.2f}".format(lasso001.score(x_test, y_test)))
print("Number of features used:", np.sum(lasso001.coef_ != 0))
Output:
Training set score: 0.90
Test set score: 0.77
Number of features used: 33


lasso00001 = Lasso(alpha=0.0001, max_iter=100000).fit(x_train, y_train)
print("Training set score: {:.2f}".format(lasso00001.score(x_train, y_train)))
print("Test set score: {:.2f}".format(lasso00001.score(x_test, y_test)))
print("Number of features used:", np.sum(lasso00001.coef_ != 0))
Output:
Training set score: 0.95
Test set score: 0.64
Number of features used: 96

from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
import mglearn
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import Ridge


x,y = mglearn.datasets.load_extended_boston()
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)

lasso = Lasso().fit(x_train, y_train)
print("Training set score: {:.2f}".format(lasso.score(x_train, y_train)))
print("Test set score: {:.2f}".format(lasso.score(x_test, y_test)))
print("Number of features used:", np.sum(lasso.coef_ != 0))

lasso001 = Lasso(alpha=0.01, max_iter=100000).fit(x_train, y_train)
print("Training set score: {:.2f}".format(lasso001.score(x_train, y_train)))
print("Test set score: {:.2f}".format(lasso001.score(x_test, y_test)))
print("Number of features used:", np.sum(lasso001.coef_ != 0))

lasso00001 = Lasso(alpha=0.0001, max_iter=100000).fit(x_train, y_train)
print("Training set score: {:.2f}".format(lasso00001.score(x_train, y_train)))
print("Test set score: {:.2f}".format(lasso00001.score(x_test, y_test)))
print("Number of features used:", np.sum(lasso00001.coef_ != 0))

plt.plot(lasso.coef_, 's', label="Lasso alpha=1")
plt.plot(lasso001.coef_, '^', label="Lasso alpha=0.01")
plt.plot(lasso00001.coef_, 'v', label="Lasso alpha=0.0001")

ridge01 = Ridge(alpha = 0.1).fit(x_train,y_train)  # Ridge for comparison
plt.plot(ridge01.coef_, 'o', label="Ridge alpha=0.1")
plt.legend(ncol=2, loc=(0, 1.05))
plt.ylim(-25, 25)
plt.xlabel("Coefficient index")
plt.ylabel("Coefficient magnitude")
plt.show()

# Linear models for classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import mglearn
import matplotlib.pyplot as plt

x, y = mglearn.datasets.make_forge()
fig, axes = plt.subplots(1, 2, figsize=(10, 3))

for model, ax in zip([LinearSVC(), LogisticRegression()], axes):
    clf = model.fit(x, y)
    mglearn.plots.plot_2d_separator(clf, x, fill=False, eps=0.5,
                                    ax=ax, alpha=.7)
    mglearn.discrete_scatter(x[:, 0], x[:, 1], y, ax=ax)
    ax.set_title(clf.__class__.__name__)
    ax.set_xlabel("Feature 0")
    ax.set_ylabel("Feature 1")
axes[0].legend()

# illustrate the effect of the regularization parameter C for LinearSVC
mglearn.plots.plot_linear_svc_regularization()
plt.show()

# Logistic regression on the Breast Cancer dataset: the effect of C

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt


cancer = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)
logreg = LogisticRegression().fit(x_train, y_train)
print("Training set score: {:.3f}".format(logreg.score(x_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(x_test, y_test)))


logreg100 = LogisticRegression(C=100).fit(x_train, y_train)
print("Training set score: {:.3f}".format(logreg100.score(x_train, y_train)))
print("Test set score: {:.3f}".format(logreg100.score(x_test, y_test)))


logreg001 = LogisticRegression(C=0.01).fit(x_train, y_train)
print("Training set score: {:.3f}".format(logreg001.score(x_train, y_train)))
print("Test set score: {:.3f}".format(logreg001.score(x_test, y_test)))


plt.plot(logreg.coef_.T, 'o', label="C=1")
plt.plot(logreg100.coef_.T, '^', label="C=100")
plt.plot(logreg001.coef_.T, 'v', label="C=0.01")
plt.xticks(range(cancer.data.shape[1]), cancer.feature_names, rotation=90)
xlims = plt.xlim()
plt.hlines(0, xlims[0], xlims[1])
plt.xlim(xlims)
plt.ylim(-5, 5)
plt.xlabel("Feature")
plt.ylabel("Coefficient magnitude")
plt.legend()
plt.show()

Output (default C=1 model):
Training set score: 0.946
Test set score: 0.958


import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)


for C, marker in zip([0.001, 1, 100], ['o', '^', 'v']):
    lr_l1 = LogisticRegression(C=C, solver='liblinear', penalty="l1").fit(x_train, y_train)
    print("Training accuracy of l1 logreg with C={:.3f}: {:.2f}".format(
          C, lr_l1.score(x_train, y_train)))
    print("Test accuracy of l1 logreg with C={:.3f}: {:.2f}".format(
          C, lr_l1.score(x_test, y_test)))
    plt.plot(lr_l1.coef_.T, marker, label="C={:.3f}".format(C))

plt.xticks(range(cancer.data.shape[1]), cancer.feature_names, rotation=90)
xlims = plt.xlim()
plt.hlines(0, xlims[0], xlims[1])
plt.xlim(xlims)
plt.xlabel("Feature")
plt.ylabel("Coefficient magnitude")

plt.ylim(-5, 5)
plt.legend(loc=3)
plt.show()
Output:
Training accuracy of l1 logreg with C=0.001: 0.91
Test accuracy of l1 logreg with C=0.001: 0.92
Training accuracy of l1 logreg with C=1.000: 0.96
Test accuracy of l1 logreg with C=1.000: 0.96
Training accuracy of l1 logreg with C=100.000: 0.99
Test accuracy of l1 logreg with C=100.000: 0.98

# Linear models for multiclass classification
import matplotlib.pyplot as plt
import mglearn

from sklearn.datasets import make_blobs

X, y = make_blobs(random_state=42)
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.legend(["Class 0", "Class 1", "Class 2"])
plt.show()

# Linear models for multiclass classification: one-vs-rest lines
import matplotlib.pyplot as plt
import mglearn
from sklearn.svm import LinearSVC
from sklearn.datasets import make_blobs
import numpy as np
X, y = make_blobs(random_state=42)


linear_svm = LinearSVC().fit(X, y)
print("Coefficient shape: ", linear_svm.coef_.shape)
print("Intercept shape: ", linear_svm.intercept_.shape)

mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
line = np.linspace(-15, 15)
for coef, intercept, color in zip(linear_svm.coef_, linear_svm.intercept_,
                                  mglearn.cm3.colors):
    plt.plot(line, -(line * coef[0] + intercept) / coef[1], c=color)
plt.ylim(-10, 15)
plt.xlim(-10, 8)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.legend(['Class 0', 'Class 1', 'Class 2', 'Line class 0', 'Line class 1',
            'Line class 2'], loc=(1.01, 0.3))
plt.show()
Output:
Coefficient shape:  (3, 2)
Intercept shape:  (3,)
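
A quick way to verify the one-vs-rest logic (my addition, not from the notes): each row of decision_function holds one score per binary classifier, and the predicted class is the one with the largest score. A minimal sketch, assuming the linear_svm fitted above:

import numpy as np

# one score per class for every sample (one-vs-rest)
scores = linear_svm.decision_function(X)
print(scores.shape)  # (100, 3)
# the class with the highest score is the prediction
print(np.all(scores.argmax(axis=1) == linear_svm.predict(X)))  # expect True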

# Linear models for multiclass classification: decision regions
import matplotlib.pyplot as plt
import mglearn
from sklearn.svm import LinearSVC
from sklearn.datasets import make_blobs
import numpy as np
X, y = make_blobs(random_state=42)

linear_svm = LinearSVC().fit(X, y)
print("Coefficient shape: ", linear_svm.coef_.shape)
print("Intercept shape: ", linear_svm.intercept_.shape)

mglearn.plots.plot_2d_classification(linear_svm, X, fill=True, alpha=.7)
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
line = np.linspace(-15, 15)
for coef, intercept, color in zip(linear_svm.coef_, linear_svm.intercept_,
                                  mglearn.cm3.colors):
    plt.plot(line, -(line * coef[0] + intercept) / coef[1], c=color)
plt.legend(['Class 0', 'Class 1', 'Class 2', 'Line class 0', 'Line class 1',
            'Line class 2'], loc=(1.01, 0.3))
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.show()

# Naive Bayes classifiers

# GaussianNB is mostly used on high-dimensional continuous data;
# MultinomialNB and BernoulliNB are used for sparse count data.

import numpy as np

X = np.array([[0, 1, 0, 1],
              [1, 0, 1, 1],
              [0, 0, 0, 1],
              [1, 0, 1, 0]])
y = np.array([0, 1, 0, 1])

counts = {}
for label in np.unique(y):
    # iterate over each class
    # count (sum) entries of 1 per feature
    print("y == label:",y == label)
    print("X[y == label]\n",X[y == label])
    print("sum:",X[y == label].sum(axis=0))
    counts[label] = X[y == label].sum(axis=0)
print("Feature counts:\n", counts)

Output:
y == label: [ True False  True False]
X[y == label]
 [[0 1 0 1]
 [0 0 0 1]]
sum: [0 1 0 2]
y == label: [False  True False  True]
X[y == label]
 [[1 0 1 1]
 [1 0 1 0]]
sum: [2 0 2 1]
Feature counts:
 {0: array([0, 1, 0, 2]), 1: array([2, 0, 2, 1])}
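
To connect the counting illustration back to an actual classifier (my addition, not in the original notes), a minimal GaussianNB fit on the continuous Breast Cancer features might look like this:

# Sketch (my addition): GaussianNB on continuous data,
# matching the note that it suits high-dimensional continuous features.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)
nb = GaussianNB().fit(X_train, y_train)
print("Training set score: {:.3f}".format(nb.score(X_train, y_train)))
print("Test set score: {:.3f}".format(nb.score(X_test, y_test)))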



# Decision trees
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

Output:
Accuracy on training set: 1.000
Accuracy on test set: 0.937

# limit the depth to 4
tree = DecisionTreeClassifier(max_depth=4, random_state=0)
tree.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

Output:
Accuracy on training set: 0.988
Accuracy on test set: 0.951

# Visualizing the decision tree
from sklearn.tree import export_graphviz
export_graphviz(tree, out_file="tree.dot", class_names=["malignant", "benign"],
                feature_names=cancer.feature_names, impurity=False, filled=True)

import graphviz
with open("tree.dot") as f:
    dot_graph = f.read()
print(graphviz.Source(dot_graph))

The decision tree visualization didn't render here! (print() only shows the
graphviz.Source object; see the sketch below.)
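
Two possible fixes (my suggestions, not from the original notes): a graphviz.Source object only renders when displayed in a Jupyter notebook, so in a plain script either write the graph to a file with .render(), or skip graphviz entirely and use sklearn's matplotlib-based plot_tree (available since scikit-learn 0.21):

# Sketch (my suggestion): render the graph to a file instead of printing it
import graphviz
with open("tree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph).render("tree", format="png")  # writes tree.png

# Alternative without graphviz (scikit-learn >= 0.21)
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
plot_tree(tree, feature_names=cancer.feature_names,
          class_names=["malignant", "benign"], filled=True)
plt.show()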



# Feature importance in trees
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()

X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)

tree = DecisionTreeClassifier(max_depth=4, random_state=0)
tree.fit(X_train, y_train)

print("Feature importances:")
print(tree.feature_importances_)
Output:
Feature importances:
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.01019737 0.04839825
 0.         0.         0.0024156  0.         0.         0.
 0.         0.         0.72682851 0.0458159  0.         0.
 0.0141577  0.         0.018188   0.1221132  0.01188548 0.        ]



# Plotting feature importances
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)

tree = DecisionTreeClassifier(max_depth=4, random_state=0)
tree.fit(X_train, y_train)

print("Feature importances:")
print(tree.feature_importances_)

def plot_feature_importances_cancer(model):
    n_features = cancer.data.shape[1]
    plt.barh(np.arange(n_features),
             model.feature_importances_,
             align='center')
    plt.yticks(np.arange(n_features), cancer.feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    plt.ylim(-1, n_features)

plot_feature_importances_cancer(tree)
plt.show()

# a tree whose feature is not monotonically related to the class
import mglearn
tree = mglearn.plots.plot_tree_not_monotone()
print(tree)

Output:

Feature importances: [0. 1.]

digraph Tree {

node [shape=box, style="filled", color="black"] ;

0 [label="X[1] <= -5.814\nsamples = 100\nvalue = [50, 50]", fillcolor="#ffffff"] ;

1 [label="samples = 25\nvalue = [25, 0]", fillcolor="#e58139"] ;

0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;

2 [label="X[1] <= 5.348\nsamples = 75\nvalue = [25, 50]", fillcolor="#9ccef2"] ;

0 -> 2 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;

3 [label="samples = 50\nvalue = [0, 50]", fillcolor="#399de5"] ;

2 -> 3 ;

4 [label="samples = 25\nvalue = [25, 0]", fillcolor="#e58139"] ;

2 -> 4 ;

}



# Decision tree regression: historical RAM prices
import pandas as pd
import os
import mglearn
import matplotlib.pyplot as plt

ram_prices = pd.read_csv(os.path.join(mglearn.datasets.DATA_PATH, "ram_price.csv"))
# plot on a log y-scale
plt.semilogy(ram_prices.date, ram_prices.price)

plt.xlabel("Year")
plt.ylabel("Price in $/Mbyte")
plt.show()

import os
import pandas as pd
import mglearn
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

ram_prices = pd.read_csv(os.path.join(mglearn.datasets.DATA_PATH, "ram_price.csv"))
print(ram_prices)

# use historical data to forecast prices after the year 2000
data_train = ram_prices[ram_prices.date < 2000]
data_test  = ram_prices[ram_prices.date >= 2000]

# predict prices based on date
# (.values converts the Series to an array; newer pandas versions
# disallow indexing a Series with np.newaxis directly)
X_train = data_train.date.values[:, np.newaxis]
# we use a log-transform to get a simpler relationship of data to target
y_train = np.log(data_train.price)

tree = DecisionTreeRegressor(max_depth=3).fit(X_train, y_train)
linear_reg = LinearRegression().fit(X_train, y_train)

# predict on all data
X_all = ram_prices.date.values[:, np.newaxis]

pred_tree = tree.predict(X_all)
pred_lr = linear_reg.predict(X_all)

# undo the log-transform
price_tree = np.exp(pred_tree)
price_lr = np.exp(pred_lr)

plt.semilogy(data_train.date, data_train.price, label="Training data")
plt.semilogy(data_test.date, data_test.price, label="Test data")
plt.semilogy(ram_prices.date, price_tree, label="Tree prediction")
plt.semilogy(ram_prices.date, price_lr, label="Linear prediction")
plt.legend()
plt.show()

Using numpy.newaxis:

import numpy as np

x = np.arange(3)
print("x[np.newaxis]\n",x[np.newaxis])
print("x[np.newaxis,np.newaxis]\n",x[np.newaxis,np.newaxis])
print("x[:, np.newaxis]\n",x[:, np.newaxis])
print("x[:, np.newaxis, np.newaxis]\n",x[:, np.newaxis, np.newaxis])
print("*"*10)

y = np.arange(12).reshape(4,3)
print("y:\n",y)
print("y[:, 1]\n",y[:, 1])
print("y[:, 1, np.newaxis]\n",y[:, 1, np.newaxis])
print("y[1, :, np.newaxis]\n",y[1, :, np.newaxis])

Output:
x[np.newaxis]
 [[0 1 2]]
x[np.newaxis,np.newaxis]
 [[[0 1 2]]]
x[:, np.newaxis]
 [[0]
 [1]
 [2]]
x[:, np.newaxis, np.newaxis]
 [[[0]]

 [[1]]

 [[2]]]
**********
y:
 [[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]
y[:, 1]
 [ 1  4  7 10]
y[:, 1, np.newaxis]
 [[ 1]
 [ 4]
 [ 7]
 [10]]
y[1, :, np.newaxis]
 [[3]
 [4]
 [5]]

The enumerate function (returns index-value pairs):
seasons = ['Spring', 'Summer', 'Fall', 'Winter']
print(list(enumerate(seasons)))
print(list(enumerate(seasons, start=1)))       # index starts at 1
Output:
[(0, 'Spring'), (1, 'Summer'), (2, 'Fall'), (3, 'Winter')]
[(1, 'Spring'), (2, 'Summer'), (3, 'Fall'), (4, 'Winter')]


Usage of zip:
a1=[1,2,3]
a2=[4,5,6]
a3=[7,8,9]
a4=["a","b","c","d"]
zip1=zip(a1,a2,a3)
print(zip1)
for i in zip1:
    print(i)

Output:
<zip object at 0x000001D1CA798600>
(1, 4, 7)
(2, 5, 8)
(3, 6, 9)
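
A side note (my addition): zip stops at the shortest input, which is why the 4-element a4 above contributes nothing unless zipped explicitly:

print(list(zip(a1, a4)))   # [(1, 'a'), (2, 'b'), (3, 'c')] - truncated to 3 pairs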

# ravel(): flatten an array into 1-D
import numpy as np  # import the numpy module

a, b = np.mgrid[1:4:1, 2:3:1]  # generate arrays a, b with mgrid()
print("a:",a,"\n","b:",b)  # print a, b

Aftera = a.ravel()  # flatten array a into 1-D with ravel()
print("Aftera:",Aftera)  # print the result
Output:
a: [[1]
 [2]
 [3]] 
 b: [[2]
 [2]
 [2]]
Aftera: [1 2 3]



# Random forests
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import mglearn

X, y = make_moons(n_samples=100, noise=0.25, random_state=3)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=42)
# 5 trees
forest = RandomForestClassifier(n_estimators=5, random_state=2)
forest.fit(X_train, y_train)


# print(forest.estimators_)

fig, axes = plt.subplots(2, 3, figsize=(20, 10))
for i, (ax, tree) in enumerate(zip(axes.ravel(), forest.estimators_)):
    ax.set_title("Tree {}".format(i))
    mglearn.plots.plot_tree_partition(X_train, y_train, tree, ax=ax)
    
mglearn.plots.plot_2d_separator(forest,
                                X_train,
                                fill=True,
                                ax=axes[-1, -1],
                                alpha=.4)
axes[-1, -1].set_title("Random Forest")
mglearn.discrete_scatter(X_train[:, 0], X_train[:, 1], y_train)
plt.show()

print(i, ax, tree)  # placed inside the loop, this prints each axis and tree:
Output:
0 AxesSubplot(0.125,0.53;0.227941x0.35) DecisionTreeClassifier(max_features='auto', random_state=1872583848)
1 AxesSubplot(0.398529,0.53;0.227941x0.35) DecisionTreeClassifier(max_features='auto', random_state=794921487)
2 AxesSubplot(0.672059,0.53;0.227941x0.35) DecisionTreeClassifier(max_features='auto', random_state=111352301)
3 AxesSubplot(0.125,0.11;0.227941x0.35) DecisionTreeClassifier(max_features='auto', random_state=1853453896)
4 AxesSubplot(0.398529,0.11;0.227941x0.35) DecisionTreeClassifier(max_features='auto', random_state=213298710)
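
For a more realistic ensemble (my addition, following the book's next example), one would use many more trees; a sketch on the Breast Cancer data, assuming default settings otherwise:

# Sketch (my addition): a 100-tree random forest on the cancer dataset
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)
forest = RandomForestClassifier(n_estimators=100, random_state=0)
forest.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(forest.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(forest.score(X_test, y_test)))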


Notes on the enumerate() function:

1. enumerate() is a built-in Python function.

2. "enumerate" means to count off items one by one, pairing each with its index.

list1=['Lady','Man','Girl','Boy'] 
print(list(enumerate(list1)))

list1=['Lady','Man','Girl','Boy'] 
print(list(enumerate(list1,1)))

# string
a='abcde' 
print(list(enumerate(a)))  

# tuple
tup=('A','B','C','D') 
print(list(enumerate(tup,1)))  

# dict (iterates over keys, not values)
dict={'A':2,'B':4,'C':8} 
print(list(enumerate(dict,1)))

# a plain for loop
i = 0 
seq = ['one', 'two', 'three'] 
for element in seq:     
    print (i, seq[i])     
    i +=1 
print("*"*20,"\n")

# with enumerate()
seq = ['one', 'two', 'three'] 
for i, element in enumerate(seq):      
    print (i, element)
Output:
[(0, 'Lady'), (1, 'Man'), (2, 'Girl'), (3, 'Boy')]
[(1, 'Lady'), (2, 'Man'), (3, 'Girl'), (4, 'Boy')]
[(0, 'a'), (1, 'b'), (2, 'c'), (3, 'd'), (4, 'e')]
[(1, 'A'), (2, 'B'), (3, 'C'), (4, 'D')]
[(1, 'A'), (2, 'B'), (3, 'C')]
0 one
1 two
2 three
******************** 

0 one
1 two
2 three



Usage of make_blobs in sklearn:
data, label = make_blobs(n_features=2, n_samples=100, centers=3, random_state=3, cluster_std=[0.8, 2, 5])
  • n_features: the number of features per sample (default 2)
  • n_samples: the total number of samples (default 100)
  • centers: the number of cluster centers, i.e. the number of label classes (default 3)
  • random_state: the random seed, which fixes the generated data (default None)
  • cluster_std: the standard deviation of each cluster (default 1.0)
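
A quick shape check of the call described above (my addition):

from sklearn.datasets import make_blobs
data, label = make_blobs(n_features=2, n_samples=100, centers=3,
                         random_state=3, cluster_std=[0.8, 2, 5])
print(data.shape)    # (100, 2): 100 samples, 2 features
print(label.shape)   # (100,): one of 3 cluster labels per sample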



# Gradient boosted regression trees (gradient boosting machines)
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import GradientBoostingClassifier

cancer = load_breast_cancer()

X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)

gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train, y_train)

print("Accuracy on training set: {:.3f}".format(gbrt.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(gbrt.score(X_test, y_test)))


gbrt = GradientBoostingClassifier(random_state=0, learning_rate=0.01)
gbrt.fit(X_train, y_train)

print("Accuracy on training set: {:.3f}".format(gbrt.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(gbrt.score(X_test, y_test)))




# Kernelized support vector machines
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
import mglearn

X, y = make_blobs(centers=4, random_state=8)
y = y % 2

print(X.shape,y.shape)

mglearn.discrete_scatter(X[:, 0], X[:, 1], y)

plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.show()


from sklearn.svm import LinearSVC
from sklearn.datasets import make_blobs
import mglearn
import matplotlib.pyplot as plt

X, y = make_blobs(centers=4, random_state=8)
y = y % 2


linear_svm = LinearSVC().fit(X, y)

mglearn.plots.plot_2d_separator(linear_svm, X)
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.show()

np.hstack stacks the arrays in the argument tuple horizontally (column-wise):
import numpy as np
 
arr1 = np.array([[1,3], [2,4] ])
arr2 = np.array([[1,4], [2,6] ])
res = np.hstack((arr1, arr2))

print(arr1)
print(arr2)
print (res)
Output:
[[1 3]
 [2 4]]
[[1 4]
 [2 6]]
[[1 3 1 4]
 [2 4 2 6]]
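
This is exactly the operation the book uses next in the kernel-SVM discussion: stacking a squared feature onto X so the two classes become linearly separable in 3-D. A minimal sketch of that step (my reconstruction, not from the original notes):

# Sketch (my reconstruction): append feature1**2 as a third feature
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.datasets import make_blobs

X, y = make_blobs(centers=4, random_state=8)
y = y % 2
X_new = np.hstack([X, X[:, 1:] ** 2])   # shape (100, 3)
print(X_new.shape)
linear_svm_3d = LinearSVC().fit(X_new, y)
print("Training accuracy: {:.2f}".format(linear_svm_3d.score(X_new, y)))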







