# Decision boundaries of the k-nearest-neighbors model for different values of n_neighbors
import matplotlib.pyplot as plt
import mglearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
x,y = mglearn.datasets.make_forge()
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)
fig,axes = plt.subplots(1,3)
for n_neighbors, ax in zip([1, 3, 9], axes):
    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(x, y)
    mglearn.plots.plot_2d_separator(clf, x, fill=True, eps=0.5, ax=ax, alpha=.4)
    mglearn.discrete_scatter(x[:, 0], x[:, 1], y, ax=ax)
    ax.set_title("{} neighbor(s)".format(n_neighbors))
    ax.set_xlabel("feature 0")
    ax.set_ylabel("feature 1")
axes[0].legend(loc=3)
plt.show()

# Training-set and test-set accuracy as a function of n_neighbors
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
cancer = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=66)
training_accuracy = []
test_accuracy = []
# n_neighbors ranges from 1 to 10
neighbors_settings = range(1,11)
for n_neighbors in neighbors_settings:
    # build the model
    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(x_train, y_train)
    # record training-set accuracy
    training_accuracy.append(clf.score(x_train, y_train))
    # record generalization (test-set) accuracy
    test_accuracy.append(clf.score(x_test, y_test))
plt.plot(neighbors_settings, training_accuracy, label="training accuracy")
plt.plot(neighbors_settings, test_accuracy, label="test accuracy")
plt.ylabel("Accuracy")
plt.xlabel("n_neighbors")
plt.legend()
plt.show()

# k-nearest neighbors regression
import matplotlib.pyplot as plt
import mglearn
mglearn.plots.plot_knn_regression(n_neighbors=3)  # regression with 3 nearest neighbors
mglearn.plots.plot_knn_regression(n_neighbors=1)  # regression with a single nearest neighbor
plt.show()


# k-NN regression is implemented in scikit-learn's KNeighborsRegressor
from sklearn.neighbors import KNeighborsRegressor
import mglearn
from sklearn.model_selection import train_test_split
x,y = mglearn.datasets.make_wave(n_samples = 40)
# split the wave dataset into training and test sets
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)
# instantiate the model and set the number of neighbors to 3
reg = KNeighborsRegressor(n_neighbors=3)
# fit the model using the training data and training targets
reg.fit(x_train,y_train)
print("Test set predictions:\n{}".format(reg.predict(x_test)))
print("\n")
print("Test set R^2:{:.2f}".format(reg.score(x_test,y_test)))
Output:
Test set predictions:
[-0.05396539 0.35686046 1.13671923 -1.89415682 -1.13881398 -1.63113382
0.35686046 0.91241374 -0.44680446 -1.13881398]
Test set R^2:0.83
# Analyzing KNeighborsRegressor
from sklearn.neighbors import KNeighborsRegressor
import mglearn
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
x,y = mglearn.datasets.make_wave(n_samples = 40)
# split the wave dataset into training and test sets
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)
fig,axes = plt.subplots(1,3)
# create 1,000 data points, evenly spaced between -3 and 3
line = np.linspace(-3,3,1000).reshape(-1,1)
for n_neighbors, ax in zip([1, 3, 9], axes):
    # make predictions using 1, 3, and 9 neighbors respectively
    reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    reg.fit(x_train, y_train)
    ax.plot(line, reg.predict(line))
    ax.plot(x_train, y_train, "^", c=mglearn.cm2(0), markersize=8)
    ax.plot(x_test, y_test, "v", c=mglearn.cm2(1), markersize=8)
    ax.set_title(
        "{} neighbor(s)\n train score:{:.2f} test score:{:.2f}".format(
            n_neighbors,
            reg.score(x_train, y_train),
            reg.score(x_test, y_test)))
    ax.set_xlabel("Feature")
    ax.set_ylabel("Target")
axes[0].legend(["Model predictions","Training data/target",
"Test data/target"],loc="best")
plt.show()
# Linear models for regression
import mglearn
import matplotlib.pyplot as plt
mglearn.plots.plot_linear_regression_wave()
plt.show()
Output: w[0]: 0.393906 b: -0.031804
# Linear regression (ordinary least squares)
# underfitting
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import mglearn
x,y = mglearn.datasets.make_wave(n_samples=60)
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=41)
lr = LinearRegression().fit(x_train,y_train)
print("斜率","lr.coef_:{}".format(lr.coef_))
print("截距","lr.intercept_:{}".format(lr.intercept_))
#查看训练集和测试集性能
print("Training set score:{:.2f}".format(lr.score(x_train,y_train)))
print("Test set score:{:.2f}".format(lr.score(x_test,y_test)))
Output:
slope lr.coef_:[0.4548843]
intercept lr.intercept_:-0.06621850388598466
Training set score:0.64
Test set score:0.74
# Linear regression on the more complex extended Boston housing dataset
# overfitting
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import mglearn
x,y = mglearn.datasets.load_extended_boston()
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)
lr = LinearRegression().fit(x_train,y_train)
print("Training set score:{:.2f}".format(lr.score(x_train,y_train)))
print("Test set score:{:.2f}".format(lr.score(x_test,y_test)))
Output:
Training set score:0.95
Test set score:0.61
# Ridge regression
from sklearn.model_selection import train_test_split
import mglearn
x,y = mglearn.datasets.load_extended_boston()
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)
from sklearn.linear_model import Ridge
ridge = Ridge().fit(x_train,y_train)
print("Train set score:{:.2f}".format(ridge.score(x_train,y_train)))
print("Test set score:{:.2f}".format(ridge.score(x_test,y_test)))
# Ridge regression: the effect of alpha on the coefficients
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import mglearn
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
x,y = mglearn.datasets.load_extended_boston()
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)
lr = LinearRegression().fit(x_train,y_train)
ridge = Ridge().fit(x_train,y_train)
print("Train set score:{:.2f}".format(ridge.score(x_train,y_train)))
print("Test set score:{:.2f}".format(ridge.score(x_test,y_test)))
ridge10 = Ridge(alpha=10).fit(x_train, y_train)  # default is alpha=1.0
print("="*10)
print("Training set score:{:.2f}".format(ridge10.score(x_train, y_train)))
print("Test set score:{:.2f}".format(ridge10.score(x_test, y_test)))
ridge01 = Ridge(alpha=0.1).fit(x_train, y_train)  # default is alpha=1.0
print("="*10)
print("Training set score:{:.2f}".format(ridge01.score(x_train, y_train)))
print("Test set score:{:.2f}".format(ridge01.score(x_test, y_test)))
plt.plot(ridge.coef_, "s", label="Ridge alpha=1")
plt.plot(ridge10.coef_, "s", label="Ridge alpha=10")
plt.plot(ridge01.coef_, "s", label="Ridge alpha=0.1")
plt.plot(lr.coef_,"o",label="LinearRegression")
plt.xlabel("Coefficient index")
plt.ylabel("Coefficient magnitude")
plt.hlines(0,0,len(lr.coef_))
plt.ylim(-25,25)
plt.legend()
plt.show()
Plot the learning curve:
import mglearn
import matplotlib.pyplot as plt
mglearn.plots.plot_ridge_n_samples()
plt.show()
# Lasso regression
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
import mglearn
import matplotlib.pyplot as plt
import numpy as np
x,y = mglearn.datasets.load_extended_boston()
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)
lasso = Lasso().fit(x_train, y_train)
print("Training set score: {:.2f}".format(lasso.score(x_train, y_train)))
print("Test set score: {:.2f}".format(lasso.score(x_test, y_test)))
print("Number of features used:", np.sum(lasso.coef_ != 0))
Output:
Training set score: 0.29
Test set score: 0.21
Number of features used: 4
lasso001 = Lasso(alpha=0.01, max_iter=100000).fit(x_train, y_train)
print("Training set score: {:.2f}".format(lasso001.score(x_train, y_train)))
print("Test set score: {:.2f}".format(lasso001.score(x_test, y_test)))
print("Number of features used:", np.sum(lasso001.coef_ != 0))
Output:
Training set score: 0.90
Test set score: 0.77
Number of features used: 33
lasso00001 = Lasso(alpha=0.0001, max_iter=100000).fit(x_train, y_train)
print("Training set score: {:.2f}".format(lasso00001.score(x_train, y_train)))
print("Test set score: {:.2f}".format(lasso00001.score(x_test, y_test)))
print("Number of features used:", np.sum(lasso00001.coef_ != 0))
Output:
Training set score: 0.95
Test set score: 0.64
Number of features used: 96
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
import mglearn
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import Ridge
x,y = mglearn.datasets.load_extended_boston()
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)
lasso = Lasso().fit(x_train, y_train)
print("Training set score: {:.2f}".format(lasso.score(x_train, y_train)))
print("Test set score: {:.2f}".format(lasso.score(x_test, y_test)))
print("Number of features used:", np.sum(lasso.coef_ != 0))
lasso001 = Lasso(alpha=0.01, max_iter=100000).fit(x_train, y_train)
print("Training set score: {:.2f}".format(lasso001.score(x_train, y_train)))
print("Test set score: {:.2f}".format(lasso001.score(x_test, y_test)))
print("Number of features used:", np.sum(lasso001.coef_ != 0))
lasso00001 = Lasso(alpha=0.0001, max_iter=100000).fit(x_train, y_train)
print("Training set score: {:.2f}".format(lasso00001.score(x_train, y_train)))
print("Test set score: {:.2f}".format(lasso00001.score(x_test, y_test)))
print("Number of features used:", np.sum(lasso00001.coef_ != 0))
plt.plot(lasso.coef_, 's', label="Lasso alpha=1")
plt.plot(lasso001.coef_, '^', label="Lasso alpha=0.01")
plt.plot(lasso00001.coef_, 'v', label="Lasso alpha=0.0001")
ridge01 = Ridge(alpha=0.1).fit(x_train, y_train)  # default is alpha=1.0
plt.plot(ridge01.coef_, 'o', label="Ridge alpha=0.1")
plt.legend(ncol=2, loc=(0, 1.05))
plt.ylim(-25, 25)
plt.xlabel("Coefficient index")
plt.ylabel("Coefficient magnitude")
plt.show()

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import mglearn
import matplotlib.pyplot as plt
x, y = mglearn.datasets.make_forge()
fig, axes = plt.subplots(1, 2, figsize=(10, 3))
for model, ax in zip([LinearSVC(), LogisticRegression()], axes):
    clf = model.fit(x, y)
    mglearn.plots.plot_2d_separator(clf, x, fill=False, eps=0.5,
                                    ax=ax, alpha=.7)
    mglearn.discrete_scatter(x[:, 0], x[:, 1], y, ax=ax)
    ax.set_title(clf.__class__.__name__)
    ax.set_xlabel("Feature 0")
    ax.set_ylabel("Feature 1")
axes[0].legend()
mglearn.plots.plot_linear_svc_regularization()
plt.show()

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
cancer = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(
cancer.data, cancer.target, stratify=cancer.target, random_state=42)
logreg = LogisticRegression().fit(x_train, y_train)
print("Training set score: {:.3f}".format(logreg.score(x_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(x_test, y_test)))
logreg100 = LogisticRegression(C=100).fit(x_train, y_train)
print("Training set score: {:.3f}".format(logreg100.score(x_train, y_train)))
print("Test set score: {:.3f}".format(logreg100.score(x_test, y_test)))
logreg001 = LogisticRegression(C=0.01).fit(x_train, y_train)
print("Training set score: {:.3f}".format(logreg001.score(x_train, y_train)))
print("Test set score: {:.3f}".format(logreg001.score(x_test, y_test)))
plt.plot(logreg.coef_.T, 'o', label="C=1")
plt.plot(logreg100.coef_.T, '^', label="C=100")
plt.plot(logreg001.coef_.T, 'v', label="C=0.01")
plt.xticks(range(cancer.data.shape[1]), cancer.feature_names, rotation=90)
xlims = plt.xlim()
plt.hlines(0, xlims[0], xlims[1])
plt.xlim(xlims)
plt.ylim(-5, 5)
plt.xlabel("Feature")
plt.ylabel("Coefficient magnitude")
plt.legend()
plt.show()
Output:
Training set score: 0.946
Test set score: 0.958

import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
cancer = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(
cancer.data, cancer.target, stratify=cancer.target, random_state=42)
for C, marker in zip([0.001, 1, 100], ['o', '^', 'v']):
    lr_l1 = LogisticRegression(C=C, solver='liblinear', penalty="l1").fit(x_train, y_train)
    print("Training accuracy of l1 logreg with C={:.3f}: {:.2f}".format(
        C, lr_l1.score(x_train, y_train)))
    print("Test accuracy of l1 logreg with C={:.3f}: {:.2f}".format(
        C, lr_l1.score(x_test, y_test)))
    plt.plot(lr_l1.coef_.T, marker, label="C={:.3f}".format(C))
plt.xticks(range(cancer.data.shape[1]), cancer.feature_names, rotation=90)
xlims = plt.xlim()
plt.hlines(0, xlims[0], xlims[1])
plt.xlim(xlims)
plt.xlabel("Feature")
plt.ylabel("Coefficient magnitude")
plt.ylim(-5, 5)
plt.legend(loc=3)
plt.show()
Output:
Training accuracy of l1 logreg with C=0.001: 0.91
Test accuracy of l1 logreg with C=0.001: 0.92
Training accuracy of l1 logreg with C=1.000: 0.96
Test accuracy of l1 logreg with C=1.000: 0.96
Training accuracy of l1 logreg with C=100.000: 0.99
Test accuracy of l1 logreg with C=100.000: 0.98
# Linear models for multiclass classification
import matplotlib.pyplot as plt
import mglearn
from sklearn.datasets import make_blobs
X, y = make_blobs(random_state=42)
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.legend(["Class 0", "Class 1", "Class 2"])
plt.show()

# Linear models for multiclass classification
import matplotlib.pyplot as plt
import mglearn
from sklearn.svm import LinearSVC
from sklearn.datasets import make_blobs
import numpy as np
X, y = make_blobs(random_state=42)
linear_svm = LinearSVC().fit(X, y)
print("Coefficient shape: ", linear_svm.coef_.shape)
print("Intercept shape: ", linear_svm.intercept_.shape)
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
line = np.linspace(-15, 15)
for coef, intercept, color in zip(linear_svm.coef_, linear_svm.intercept_,
                                  mglearn.cm3.colors):
    plt.plot(line, -(line * coef[0] + intercept) / coef[1], c=color)
plt.ylim(-10, 15)
plt.xlim(-10, 8)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.legend(['Class 0', 'Class 1', 'Class 2', 'Line class 0', 'Line class 1',
            'Line class 2'], loc=(1.01, 0.3))
plt.show()
Output:
Coefficient shape: (3, 2)
Intercept shape: (3,)
# Linear models for multiclass classification
import matplotlib.pyplot as plt
import mglearn
from sklearn.svm import LinearSVC
from sklearn.datasets import make_blobs
import numpy as np
X, y = make_blobs(random_state=42)
linear_svm = LinearSVC().fit(X, y)
print("Coefficient shape: ", linear_svm.coef_.shape)
print("Intercept shape: ", linear_svm.intercept_.shape)
mglearn.plots.plot_2d_classification(linear_svm, X, fill=True, alpha=.7)
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
line = np.linspace(-15, 15)
for coef, intercept, color in zip(linear_svm.coef_, linear_svm.intercept_,
                                  mglearn.cm3.colors):
    plt.plot(line, -(line * coef[0] + intercept) / coef[1], c=color)
plt.legend(['Class 0', 'Class 1', 'Class 2', 'Line class 0', 'Line class 1',
'Line class 2'], loc=(1.01, 0.3))
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.show()

# Naive Bayes classifiers
# GaussianNB is mainly applied to high-dimensional (continuous) data; MultinomialNB and BernoulliNB are used for sparse count data.
import numpy as np
X = np.array([[0, 1, 0, 1],
              [1, 0, 1, 1],
              [0, 0, 0, 1],
              [1, 0, 1, 0]])
y = np.array([0, 1, 0, 1])
counts = {}
for label in np.unique(y):
    # iterate over each class
    # count (sum) entries of 1 per feature
    print("y == label:", y == label)
    print("X[y == label]\n", X[y == label])
    print("sum:", X[y == label].sum(axis=0))
    counts[label] = X[y == label].sum(axis=0)
print("Feature counts:\n", counts)
Output:
y == label: [ True False True False]
X[y == label]
[[0 1 0 1]
[0 0 0 1]]
sum: [0 1 0 2]
y == label: [False True False True]
X[y == label]
[[1 0 1 1]
[1 0 1 0]]
sum: [2 0 2 1]
Feature counts:
{0: array([0, 1, 0, 2]), 1: array([2, 0, 2, 1])}
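For reference, scikit-learn's BernoulliNB works directly on this kind of binary data; a minimal sketch (reusing the X and y defined above), whose learned per-class statistics correspond to the feature counts computed manually:
from sklearn.naive_bayes import BernoulliNB
# fit BernoulliNB on the same toy binary data
clf = BernoulliNB().fit(X, y)
# predict the class of a new binary feature vector (here identical to the first training sample)
print(clf.predict(np.array([[0, 1, 0, 1]])))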
# Decision trees
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
cancer.data, cancer.target, stratify=cancer.target, random_state=42)
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))
Output:
Accuracy on training set: 1.000
Accuracy on test set: 0.937
# set the maximum depth to 4
tree = DecisionTreeClassifier(max_depth=4, random_state=0)
tree.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))
Output:
Accuracy on training set: 0.988
Accuracy on test set: 0.951
## Visualizing the decision tree
from sklearn.tree import export_graphviz
export_graphviz(tree, out_file="tree.dot", class_names=["malignant", "benign"],
feature_names=cancer.feature_names, impurity=False, filled=True)
import graphviz
with open("tree.dot") as f:
    dot_graph = f.read()
# in a Jupyter notebook, graphviz.Source(dot_graph) on its own renders the tree;
# print() only shows the underlying dot source text
print(graphviz.Source(dot_graph))
Could not get the decision-tree visualization output to work!
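If the graphviz rendering fails, an alternative sketch (assuming scikit-learn >= 0.21, whose sklearn.tree.plot_tree draws the tree with matplotlib and needs no graphviz install):
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 6))
# draw the fitted tree directly, with the same options as export_graphviz above
plot_tree(tree, class_names=["malignant", "benign"],
          feature_names=cancer.feature_names, impurity=False, filled=True)
plt.show()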
# Feature importances of the tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
cancer.data, cancer.target, stratify=cancer.target, random_state=42)
tree = DecisionTreeClassifier(max_depth=4, random_state=0)
tree.fit(X_train, y_train)
print("Feature importances:")
print(tree.feature_importances_)
Output:
Feature importances:
[0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0.01019737 0.04839825
0. 0. 0.0024156 0. 0. 0.
0. 0. 0.72682851 0.0458159 0. 0.
0.0141577 0. 0.018188 0.1221132 0.01188548 0. ]
# Plotting the feature importances
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
cancer.data, cancer.target, stratify=cancer.target, random_state=42)
tree = DecisionTreeClassifier(max_depth=4, random_state=0)
tree.fit(X_train, y_train)
print("Feature importances:")
print(tree.feature_importances_)
def plot_feature_importances_cancer(model):
    n_features = cancer.data.shape[1]
    plt.barh(np.arange(n_features),
             model.feature_importances_,
             align='center')
    plt.yticks(np.arange(n_features), cancer.feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    plt.ylim(-1, n_features)
plot_feature_importances_cancer(tree)
plt.show()

import mglearn
tree = mglearn.plots.plot_tree_not_monotone()
print(tree)
Output:
Feature importances: [0. 1.]
digraph Tree {
node [shape=box, style="filled", color="black"] ;
0 [label="X[1] <= -5.814\nsamples = 100\nvalue = [50, 50]", fillcolor="#ffffff"] ;
1 [label="samples = 25\nvalue = [25, 0]", fillcolor="#e58139"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="X[1] <= 5.348\nsamples = 75\nvalue = [25, 50]", fillcolor="#9ccef2"] ;
0 -> 2 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;
3 [label="samples = 50\nvalue = [0, 50]", fillcolor="#399de5"] ;
2 -> 3 ;
4 [label="samples = 25\nvalue = [25, 0]", fillcolor="#e58139"] ;
2 -> 4 ;
}
# Historical RAM prices (ram_price.csv)
import pandas as pd
import os
import mglearn
import matplotlib.pyplot as plt
ram_prices = pd.read_csv(os.path.join(mglearn.datasets.DATA_PATH, "ram_price.csv"))
# plot with a logarithmic y-axis
plt.semilogy(ram_prices.date, ram_prices.price)
plt.xlabel("Year")
plt.ylabel("Price in $/Mbyte")
plt.show()

import os
import pandas as pd
import mglearn
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
ram_prices = pd.read_csv(os.path.join(mglearn.datasets.DATA_PATH, "ram_price.csv"))
print(ram_prices)
# use historical data to forecast prices after the year 2000
data_train = ram_prices[ram_prices.date < 2000]
data_test = ram_prices[ram_prices.date >= 2000]
# predict prices based on date
X_train = data_train.date[:, np.newaxis]
# we use a log-transform to get a simpler relationship of data to target
y_train = np.log(data_train.price)
tree = DecisionTreeRegressor(max_depth=3).fit(X_train, y_train)
linear_reg = LinearRegression().fit(X_train, y_train)
# predict on all data
X_all = ram_prices.date[:, np.newaxis]
pred_tree = tree.predict(X_all)
pred_lr = linear_reg.predict(X_all)
# undo log-transform
price_tree = np.exp(pred_tree)
price_lr = np.exp(pred_lr)
plt.semilogy(data_train.date, data_train.price, label="Training data")
plt.semilogy(data_test.date, data_test.price, label="Test data")
plt.semilogy(ram_prices.date, price_tree, label="Tree prediction")
plt.semilogy(ram_prices.date, price_lr, label="Linear prediction")
plt.legend()
plt.show()
Usage of numpy.newaxis:
import numpy as np
x = np.arange(3)
print("x[np.newaxis]\n",x[np.newaxis])
print("x[np.newaxis,np.newaxis]\n",x[np.newaxis,np.newaxis])
print("x[:, np.newaxis]\n",x[:, np.newaxis])
print("x[:, np.newaxis, np.newaxis]\n",x[:, np.newaxis, np.newaxis])
print("*"*10)
y = np.arange(12).reshape(4,3)
print("y:\n",y)
print("y[:, 1]\n",y[:, 1])
print("y[:, 1, np.newaxis]\n",y[:, 1, np.newaxis])
print("y[1, :, np.newaxis]\n",y[1, :, np.newaxis])
Output:
x[np.newaxis]
[[0 1 2]]
x[np.newaxis,np.newaxis]
[[[0 1 2]]]
x[:, np.newaxis]
[[0]
[1]
[2]]
x[:, np.newaxis, np.newaxis]
[[[0]]
[[1]]
[[2]]]
**********
y:
[[ 0 1 2]
[ 3 4 5]
[ 6 7 8]
[ 9 10 11]]
y[:, 1]
[ 1 4 7 10]
y[:, 1, np.newaxis]
[[ 1]
[ 4]
[ 7]
[10]]
y[1, :, np.newaxis]
[[3]
[4]
[5]]
The enumerate function (returns index-value pairs):
seasons = ['Spring', 'Summer', 'Fall', 'Winter']
print(list(enumerate(seasons)))
print(list(enumerate(seasons, start=1)))  # index starts at 1
Output:
[(0, 'Spring'), (1, 'Summer'), (2, 'Fall'), (3, 'Winter')]
[(1, 'Spring'), (2, 'Summer'), (3, 'Fall'), (4, 'Winter')]
Usage of zip:
a1=[1,2,3]
a2=[4,5,6]
a3=[7,8,9]
a4=["a","b","c","d"]
zip1=zip(a1,a2,a3)
print(zip1)
for i in zip1:
    print(i)
Output:
<zip object at 0x000001D1CA798600>
(1, 4, 7)
(2, 5, 8)
(3, 6, 9)
# ravel() flattens an array into one dimension
import numpy as np                 # import the numpy module
a, b = np.mgrid[1:4:1, 2:3:1]      # generate arithmetic-progression arrays a and b with mgrid()
print("a:", a, "\n", "b:", b)      # print a and b
Aftera = a.ravel()                 # flatten array a into one dimension with ravel()
print("Aftera:", Aftera)           # print the flattened array
Output:
a: [[1]
[2]
[3]]
b: [[2]
[2]
[2]]
Aftera: [1 2 3]
# Random forests
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import mglearn
X, y = make_moons(n_samples=100, noise=0.25, random_state=3)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
random_state=42)
# 5 trees
forest = RandomForestClassifier(n_estimators=5, random_state=2)
forest.fit(X_train, y_train)
# print(forest.estimators_)
fig, axes = plt.subplots(2, 3, figsize=(20, 10))
for i, (ax, tree) in enumerate(zip(axes.ravel(), forest.estimators_)):
    ax.set_title("Tree {}".format(i))
    mglearn.plots.plot_tree_partition(X_train, y_train, tree, ax=ax)
mglearn.plots.plot_2d_separator(forest, X_train, fill=True,
                                ax=axes[-1, -1], alpha=.4)
axes[-1, -1].set_title("Random Forest")
mglearn.discrete_scatter(X_train[:, 0], X_train[:, 1], y_train)
plt.show()

# placed inside the loop above, this prints each subplot and tree:
print(i, ax, tree)
Output:
0 AxesSubplot(0.125,0.53;0.227941x0.35) DecisionTreeClassifier(max_features='auto', random_state=1872583848)
1 AxesSubplot(0.398529,0.53;0.227941x0.35) DecisionTreeClassifier(max_features='auto', random_state=794921487)
2 AxesSubplot(0.672059,0.53;0.227941x0.35) DecisionTreeClassifier(max_features='auto', random_state=111352301)
3 AxesSubplot(0.125,0.11;0.227941x0.35) DecisionTreeClassifier(max_features='auto', random_state=1853453896)
4 AxesSubplot(0.398529,0.11;0.227941x0.35) DecisionTreeClassifier(max_features='auto', random_state=213298710)
Notes on enumerate():
1. enumerate() is a built-in Python function.
2. enumerate means to enumerate or list items one by one.
list1=['Lady','Man','Girl','Boy']
print(list(enumerate(list1)))
list1=['Lady','Man','Girl','Boy']
print(list(enumerate(list1,1)))
# string
a='abcde'
print(list(enumerate(a)))
# tuple
tup=('A','B','C','D')
print(list(enumerate(tup,1)))
# dict (iterates over the keys, not the values)
d = {'A': 2, 'B': 4, 'C': 8}
print(list(enumerate(d, 1)))
# plain for loop
i = 0
seq = ['one', 'two', 'three']
for element in seq:
    print(i, seq[i])
    i += 1
print("*"*20,"\n")
# enumerate()
seq = ['one', 'two', 'three']
for i, element in enumerate(seq):
    print(i, element)
Output:
[(0, 'Lady'), (1, 'Man'), (2, 'Girl'), (3, 'Boy')]
[(1, 'Lady'), (2, 'Man'), (3, 'Girl'), (4, 'Boy')]
[(0, 'a'), (1, 'b'), (2, 'c'), (3, 'd'), (4, 'e')]
[(1, 'A'), (2, 'B'), (3, 'C'), (4, 'D')]
[(1, 'A'), (2, 'B'), (3, 'C')]
0 one
1 two
2 three
********************
0 one
1 two
2 three
Usage of make_blobs in sklearn (a runnable sketch follows the parameter list):
data, label = make_blobs(n_features=2, n_samples=100, centers=3, random_state=3, cluster_std=[0.8, 2, 5])
- n_features: number of features per sample (default 2)
- n_samples: total number of samples (default 100)
- centers: number of cluster centers, i.e. the number of label classes (default 3)
- random_state: random seed that fixes the generated data (default None)
- cluster_std: standard deviation of each cluster (default 1.0)
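A minimal usage sketch under the parameters above:
from sklearn.datasets import make_blobs
data, label = make_blobs(n_features=2, n_samples=100, centers=3,
                         random_state=3, cluster_std=[0.8, 2, 5])
print(data.shape)   # (100, 2): 100 samples with 2 features each
print(label.shape)  # (100,): one cluster label in {0, 1, 2} per sample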
# Gradient boosted regression trees (gradient boosting)
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import GradientBoostingClassifier
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
cancer.data, cancer.target, random_state=0)
gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(gbrt.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(gbrt.score(X_test, y_test)))
gbrt = GradientBoostingClassifier(random_state=0, learning_rate=0.01)
gbrt.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(gbrt.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(gbrt.score(X_test, y_test)))
# Kernelized support vector machines
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
import mglearn
X, y = make_blobs(centers=4, random_state=8)
y = y % 2
print(X.shape,y.shape)
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.show()
from sklearn.svm import LinearSVC
from sklearn.datasets import make_blobs
import mglearn
import matplotlib.pyplot as plt
X, y = make_blobs(centers=4, random_state=8)
y = y % 2
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
linear_svm = LinearSVC().fit(X, y)
mglearn.plots.plot_2d_separator(linear_svm, X)
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.show()
np.hstack stacks the arrays in the argument tuple horizontally (column-wise):
import numpy as np
arr1 = np.array([[1,3], [2,4] ])
arr2 = np.array([[1,4], [2,6] ])
res = np.hstack((arr1, arr2))
print(arr1)
print(arr2)
print (res)
Output:
[[1 3]
[2 4]]
[[1 4]
[2 6]]
[[1 3 1 4]
[2 4 2 6]]
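In the kernel-SVM example above, this is the operation that appends a derived feature (for instance the square of feature 1) so the blobs data can become linearly separable in a higher-dimensional space; a minimal sketch reusing X from the make_blobs block:
import numpy as np
# append feature1 ** 2 as a third feature
X_new = np.hstack([X, X[:, 1:] ** 2])
print(X.shape, X_new.shape)  # (100, 2) (100, 3)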