立即注册 登录
气象家园 返回首页

主啊的个人空间 https://bbs.06climate.com/?69999 [收藏] [复制] [分享] [RSS]

日志

第二章 监督学习

已有 49 次阅读2021-7-9 10:41 |个人分类:python机器学习| 机器学习

#不同n_neighbors值的k近邻模型的决策边界

import matplotlib.pyplot as plt

import mglearn

from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split

# Decision boundaries of k-nearest-neighbour classifiers on the forge data
# for several values of n_neighbors.
x, y = mglearn.datasets.make_forge()
# The split is computed here, but the classifiers below are fitted on all of x, y.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

fig, axes = plt.subplots(1, 3)
neighbor_counts = [1, 3, 9]
for ax, n_neighbors in zip(axes, neighbor_counts):
    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(x, y)
    mglearn.plots.plot_2d_separator(clf, x, fill=True, eps=0.5, ax=ax, alpha=.4)
    mglearn.discrete_scatter(x[:, 0], x[:, 1], y, ax=ax)
    ax.set_title("{} neighbor(s)".format(n_neighbors))
    ax.set_xlabel("feature 0")
    ax.set_ylabel("feature 1")

# Only the first panel carries a legend.
first_axis = axes[0]
first_axis.legend(loc=3)
plt.show()




#以n_neighbors为自变量,对比训练集和测试集的精度

from sklearn.datasets import load_breast_cancer

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt

# Compare training and test accuracy of k-NN on the breast cancer data
# as a function of n_neighbors.
cancer = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=66)

training_accuracy = []
test_accuracy = []

# Try n_neighbors from 1 to 10.
neighbors_settings = range(1, 11)

for n_neighbors in neighbors_settings:
    # Build the model.
    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(x_train, y_train)
    # Record the training-set accuracy.
    training_accuracy.append(clf.score(x_train, y_train))
    # Record the generalization (test-set) accuracy.
    test_accuracy.append(clf.score(x_test, y_test))

plt.plot(neighbors_settings, training_accuracy, label="training accuracy")
# The legend label used to read "teat accuracy" (typo).
plt.plot(neighbors_settings, test_accuracy, label="test accuracy")
plt.ylabel("Accuracy")
plt.xlabel("n_neighbors")
plt.legend()
plt.show()


# k近邻

import matplotlib.pyplot as plt

import mglearn

# Draw mglearn's k-NN regression illustration with three neighbours first,
# then with a single neighbour.
for k in (3, 1):
    mglearn.plots.plot_knn_regression(n_neighbors=k)
plt.show()

# k近邻在scikit-learn的KNeighborsRegressor中实现

from sklearn.neighbors import KNeighborsRegressor

import mglearn

from sklearn.model_selection import train_test_split

# k-NN regression with scikit-learn's KNeighborsRegressor on the wave data.
x, y = mglearn.datasets.make_wave(n_samples=40)

# Split the wave data set into a training and a test part.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

# Instantiate the model with three neighbours and fit it on the
# training data and training targets.
reg = KNeighborsRegressor(n_neighbors=3)
reg.fit(x_train, y_train)

test_predictions = reg.predict(x_test)
test_r2 = reg.score(x_test, y_test)

print("Test set predictions:\n{}".format(test_predictions))
print("\n")
print("Test set R^2:{:.2f}".format(test_r2))

输出:

Test set predictions:

[-0.05396539  0.35686046  1.13671923 -1.89415682 -1.13881398 -1.63113382

  0.35686046  0.91241374 -0.44680446 -1.13881398]


Test set R^2:0.83




#分析KneighborsRegressor

from sklearn.neighbors import KNeighborsRegressor

import mglearn

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

import numpy as np


# Compare KNeighborsRegressor predictions for 1, 3 and 9 neighbours.
x, y = mglearn.datasets.make_wave(n_samples=40)

# Split the wave data set into a training and a test part.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

fig, axes = plt.subplots(1, 3)

# 1000 evenly spaced points between -3 and 3, shaped as a single column.
line = np.linspace(-3, 3, 1000).reshape(-1, 1)

for n_neighbors, ax in zip([1, 3, 9], axes):
    # Predict with 1, 3 and 9 neighbours in turn.
    reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    reg.fit(x_train, y_train)

    ax.plot(line, reg.predict(line))
    ax.plot(x_train, y_train, "^", c=mglearn.cm2(0), markersize=8)
    ax.plot(x_test, y_test, "v", c=mglearn.cm2(1), markersize=8)

    ax.set_title(
        "{} neighbor(s)\n train score:{:.2f} test score:{:.2f}".format(
            n_neighbors,
            reg.score(x_train, y_train),
            reg.score(x_test, y_test)))
    ax.set_xlabel("Feature")
    ax.set_ylabel("Target")

# The legend belongs to the first panel only, so it is built once after the
# loop instead of being rebuilt on every iteration.
axes[0].legend(["Model predictions", "Training data/target",
                "Test data/target"], loc="best")

plt.show()

    

#用于回归的线性模型
import mglearn
import matplotlib.pyplot as plt
# Draw mglearn's linear-regression illustration for the wave data, then show it.
mglearn.plots.plot_linear_regression_wave()
plt.show()
输出:w[0]: 0.393906  b: -0.031804

#线性回归(最小二乘法)
#欠拟合
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import mglearn
# Ordinary least squares on the wave data (the notes label this the
# underfitting case).
x, y = mglearn.datasets.make_wave(n_samples=60)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=41)

lr = LinearRegression()
lr.fit(x_train, y_train)

# First label is "slope", second is "intercept".
print("斜率", "lr.coef_:{}".format(lr.coef_))
print("截距", "lr.intercept_:{}".format(lr.intercept_))

# Compare performance on the training and the test set.
train_score = lr.score(x_train, y_train)
test_score = lr.score(x_test, y_test)
print("Training set score:{:.2f}".format(train_score))
print("Test set score:{:.2f}".format(test_score))

输出:
斜率 lr.coef_:[0.4548843]
截距 lr.intercept_:-0.06621850388598466
Training set score:0.64
Test set score:0.74



#线性回归,使用复杂的波士顿房价数据集
#过拟合
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import mglearn

# Linear regression on the extended Boston housing data (the notes label
# this the overfitting case).
x, y = mglearn.datasets.load_extended_boston()
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

lr = LinearRegression()
lr.fit(x_train, y_train)

train_score = lr.score(x_train, y_train)
test_score = lr.score(x_test, y_test)
print("Training set score:{:.2f}".format(train_score))
print("Test set score:{:.2f}".format(test_score))
输出:
Training set score:0.95
Test set score:0.61



# 岭回归
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import mglearn
# Ridge regression with its default alpha on the extended Boston data.
x, y = mglearn.datasets.load_extended_boston()
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)
from sklearn.linear_model import Ridge

ridge = Ridge()
ridge.fit(x_train, y_train)

train_score = ridge.score(x_train, y_train)
test_score = ridge.score(x_test, y_test)
print("Train set score:{:.2f}".format(train_score))
print("Test set score:{:.2f}".format(test_score))




# 岭回归
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import mglearn
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge

# Compare Ridge coefficients for alpha = 1 (default), 10 and 0.1 against
# plain linear regression on the extended Boston data.
x, y = mglearn.datasets.load_extended_boston()
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

lr = LinearRegression().fit(x_train, y_train)

ridge = Ridge().fit(x_train, y_train)
print("Train set score:{:.2f}".format(ridge.score(x_train, y_train)))
print("Test set score:{:.2f}".format(ridge.score(x_test, y_test)))

ridge10 = Ridge(alpha=10).fit(x_train, y_train)  # the default is alpha=1.0
print("=" * 10)
print("TRaining set score:{:.2f}".format(ridge10.score(x_train, y_train)))
print("Test set score:{:.2f}".format(ridge10.score(x_test, y_test)))

ridge01 = Ridge(alpha=0.1).fit(x_train, y_train)  # the default is alpha=1.0
print("=" * 10)
print("TRaining set score:{:.2f}".format(ridge01.score(x_train, y_train)))
print("Test set score:{:.2f}".format(ridge01.score(x_test, y_test)))

plt.plot(ridge.coef_, "s", label="Ridge alpha =1")
plt.plot(ridge10.coef_, "s", label="Ridge alpha =10")
# ridge01 was fitted with alpha=0.1; the label used to read "alpha =01".
plt.plot(ridge01.coef_, "s", label="Ridge alpha =0.1")

plt.plot(lr.coef_, "o", label="LinearRegression")

plt.xlabel("Coefficient index")
plt.ylabel("Coefficient magnitude")

# Zero reference line across all coefficient indices.
plt.hlines(0, 0, len(lr.coef_))
plt.ylim(-25, 25)

plt.legend()
plt.show()


绘制学习曲线:
# Draw the learning curves via mglearn's helper (no plt.show() follows here).
mglearn.plots.plot_ridge_n_samples()



# lasso回归
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
import mglearn
import matplotlib.pyplot as plt
import numpy as np

# Lasso with its default settings on the extended Boston data.
x, y = mglearn.datasets.load_extended_boston()
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

lasso = Lasso()
lasso.fit(x_train, y_train)

train_score = lasso.score(x_train, y_train)
test_score = lasso.score(x_test, y_test)
# Features whose coefficient is not exactly zero.
n_used = np.sum(lasso.coef_ != 0)
print("Training set score: {:.2f}".format(train_score))
print("Test set score: {:.2f}".format(test_score))
print("Number of features used:", n_used)

输出:
Training set score: 0.29
Test set score: 0.21
Number of features used: 4

# Lasso with alpha=0.01 and a raised iteration cap.
lasso001 = Lasso(alpha=0.01, max_iter=100000)
lasso001.fit(x_train, y_train)
train_score = lasso001.score(x_train, y_train)
test_score = lasso001.score(x_test, y_test)
n_used = np.sum(lasso001.coef_ != 0)
print("Training set score: {:.2f}".format(train_score))
print("Test set score: {:.2f}".format(test_score))
print("Number of features used:", n_used)
输出:
Training set score: 0.90
Test set score: 0.77
Number of features used: 33


# Lasso with alpha=0.0001 and a raised iteration cap.
lasso00001 = Lasso(alpha=0.0001, max_iter=100000)
lasso00001.fit(x_train, y_train)
train_score = lasso00001.score(x_train, y_train)
test_score = lasso00001.score(x_test, y_test)
n_used = np.sum(lasso00001.coef_ != 0)
print("Training set score: {:.2f}".format(train_score))
print("Test set score: {:.2f}".format(test_score))
print("Number of features used:", n_used)
输出:
Training set score: 0.95
Test set score: 0.64
Number of features used: 96

from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
import mglearn
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import Ridge


# Fit Lasso for three alphas, report each fit, and plot the coefficients
# next to a Ridge(alpha=0.1) fit.
x, y = mglearn.datasets.load_extended_boston()
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)


def _report_lasso(model):
    """Print train/test scores and the count of non-zero coefficients of *model*."""
    print("Training set score: {:.2f}".format(model.score(x_train, y_train)))
    print("Test set score: {:.2f}".format(model.score(x_test, y_test)))
    print("Number of features used:", np.sum(model.coef_ != 0))


lasso = Lasso().fit(x_train, y_train)
_report_lasso(lasso)

lasso001 = Lasso(alpha=0.01, max_iter=100000).fit(x_train, y_train)
_report_lasso(lasso001)

lasso00001 = Lasso(alpha=0.0001, max_iter=100000).fit(x_train, y_train)
_report_lasso(lasso00001)

plt.plot(lasso.coef_, 's', label="Lasso alpha=1")
plt.plot(lasso001.coef_, '^', label="Lasso alpha=0.01")
plt.plot(lasso00001.coef_, 'v', label="Lasso alpha=0.0001")

ridge01 = Ridge(alpha=0.1)  # the default would be alpha=1.0
ridge01.fit(x_train, y_train)
plt.plot(ridge01.coef_, 'o', label="Ridge alpha=0.1")

plt.legend(ncol=2, loc=(0, 1.05))
plt.ylim(-25, 25)
plt.xlabel("Coefficient index")
plt.ylabel("Coefficient magnitude")
plt.show()
输出:

from sklearn.linear_model import LogisticRegression

from sklearn.svm import LinearSVC

import mglearn

import matplotlib.pyplot as plt


# Decision boundaries of LinearSVC and LogisticRegression on the forge data.
x, y = mglearn.datasets.make_forge()

fig, axes = plt.subplots(1, 2, figsize=(10, 3))

models = [LinearSVC(), LogisticRegression()]
for model, ax in zip(models, axes):
    clf = model.fit(x, y)
    mglearn.plots.plot_2d_separator(
        clf, x, fill=False, eps=0.5, ax=ax, alpha=.7)
    mglearn.discrete_scatter(x[:, 0], x[:, 1], y, ax=ax)

    # Title each panel with the estimator's class name.
    ax.set_title(type(clf).__name__)
    ax.set_xlabel("Feature 0")
    ax.set_ylabel("Feature 1")

axes[0].legend()

mglearn.plots.plot_linear_svc_regularization()

plt.show()

输出:

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt


# Logistic regression on the breast cancer data for C = 1 (default),
# C = 100 and C = 0.01, followed by a plot of the learned coefficients.
cancer = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)
logreg = LogisticRegression().fit(x_train, y_train)
print("Training set score: {:.3f}".format(logreg.score(x_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(x_test, y_test)))


logreg100 = LogisticRegression(C=100).fit(x_train, y_train)
print("Training set score: {:.3f}".format(logreg100.score(x_train, y_train)))
print("Test set score: {:.3f}".format(logreg100.score(x_test, y_test)))


logreg001 = LogisticRegression(C=0.01).fit(x_train, y_train)
print("Training set score: {:.3f}".format(logreg001.score(x_train, y_train)))
print("Test set score: {:.3f}".format(logreg001.score(x_test, y_test)))


plt.plot(logreg.coef_.T, 'o', label="C=1")
plt.plot(logreg100.coef_.T, '^', label="C=100")
# logreg001 was fitted with C=0.01; the label used to claim C=0.001.
plt.plot(logreg001.coef_.T, 'v', label="C=0.01")
plt.xticks(range(cancer.data.shape[1]), cancer.feature_names, rotation=90)
# Remember the x-limits so the zero line spans exactly the current view
# and the view is restored afterwards.
xlims = plt.xlim()
plt.hlines(0, xlims[0], xlims[1])
plt.xlim(xlims)
plt.ylim(-5, 5)
plt.xlabel("Feature")
plt.ylabel("Coefficient magnitude")
plt.legend()
plt.show()

输出:
Training set score: 0.946
Test set score: 0.958


import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# L1-penalised logistic regression on the breast cancer data for three C values.
cancer = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)

c_values = [0.001, 1, 100]
markers = ['o', '^', 'v']
for C, marker in zip(c_values, markers):
    lr_l1 = LogisticRegression(C=C, solver='liblinear', penalty="l1")
    lr_l1.fit(x_train, y_train)
    train_acc = lr_l1.score(x_train, y_train)
    test_acc = lr_l1.score(x_test, y_test)
    print("Training accuracy of l1 logreg with C={:.3f}: {:.2f}".format(
          C, train_acc))
    print("Test accuracy of l1 logreg with C={:.3f}: {:.2f}".format(
          C, test_acc))
    plt.plot(lr_l1.coef_.T, marker, label="C={:.3f}".format(C))

plt.xticks(range(cancer.data.shape[1]), cancer.feature_names, rotation=90)
# Draw the zero line across the current x-range, then restore that range.
xlims = plt.xlim()
x_low, x_high = xlims[0], xlims[1]
plt.hlines(0, x_low, x_high)
plt.xlim(xlims)
plt.xlabel("Feature")
plt.ylabel("Coefficient magnitude")

plt.ylim(-5, 5)
plt.legend(loc=3)
plt.show()
输出:
Training accuracy of l1 logreg with C=0.001: 0.91
Test accuracy of l1 logreg with C=0.001: 0.92
Training accuracy of l1 logreg with C=1.000: 0.96
Test accuracy of l1 logreg with C=1.000: 0.96
Training accuracy of l1 logreg with C=100.000: 0.99
Test accuracy of l1 logreg with C=100.000: 0.98

#多分类线性模型
import matplotlib.pyplot as plt
import mglearn

from sklearn.datasets import make_blobs

# Scatter plot of a blob data set; the legend names three classes.
X, y = make_blobs(random_state=42)
feature_0 = X[:, 0]
feature_1 = X[:, 1]
mglearn.discrete_scatter(feature_0, feature_1, y)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
class_names = ["Class 0", "Class 1", "Class 2"]
plt.legend(class_names)
plt.show()

#多分类线性模型
import matplotlib.pyplot as plt
import mglearn
from sklearn.svm import LinearSVC
from sklearn.datasets import make_blobs
import numpy as np
# LinearSVC on the three-class blob data: draw one line per class,
# built from the matching row of coef_ and entry of intercept_.
X, y = make_blobs(random_state=42)


linear_svm = LinearSVC().fit(X, y)
print("Coefficient shape: ", linear_svm.coef_.shape)
print("Intercept shape: ", linear_svm.intercept_.shape)

mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
line = np.linspace(-15, 15)
for coef, intercept, color in zip(linear_svm.coef_, linear_svm.intercept_,
                                  mglearn.cm3.colors):
    # Solve coef[0] * x0 + coef[1] * x1 + intercept = 0 for x1.
    plt.plot(line, -(line * coef[0] + intercept) / coef[1], c=color)
plt.ylim(-10, 15)
plt.xlim(-10, 8)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.legend(['Class 0', 'Class 1', 'Class 2', 'Line class 0', 'Line class 1',
            'Line class 2'], loc=(1.01, 0.3))
# Show the figure here, as the other snippets do; without this the figure is
# never displayed and later pyplot calls keep drawing onto it.
plt.show()
输出:
Coefficient shape:  (3, 2)
Intercept shape:  (3,)

#多分类线性模型
import matplotlib.pyplot as plt
import mglearn
from sklearn.svm import LinearSVC
from sklearn.datasets import make_blobs
import numpy as np
# LinearSVC on the blob data: shaded class regions plus one line per class.
X, y = make_blobs(random_state=42)

linear_svm = LinearSVC()
linear_svm.fit(X, y)
print("Coefficient shape: ", linear_svm.coef_.shape)
print("Intercept shape: ", linear_svm.intercept_.shape)

mglearn.plots.plot_2d_classification(linear_svm, X, fill=True, alpha=.7)
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)

line = np.linspace(-15, 15)
per_class = zip(linear_svm.coef_, linear_svm.intercept_, mglearn.cm3.colors)
for coef, intercept, color in per_class:
    # Solve coef[0] * x0 + coef[1] * x1 + intercept = 0 for x1.
    boundary = -(line * coef[0] + intercept) / coef[1]
    plt.plot(line, boundary, c=color)

legend_entries = ['Class 0', 'Class 1', 'Class 2', 'Line class 0',
                  'Line class 1', 'Line class 2']
plt.legend(legend_entries, loc=(1.01, 0.3))
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.show()

# 朴素贝叶斯分类器

# GaussianNB主要用于高维数据,MultinomialNB和BernoulliNB用于稀疏计数数据。

import numpy as np

# Count, per class, how often each binary feature equals 1.
X = np.array([[0, 1, 0, 1],
              [1, 0, 1, 1],
              [0, 0, 0, 1],
              [1, 0, 1, 0]])
y = np.array([0, 1, 0, 1])

counts = {}
for label in np.unique(y):
    # Handle one class at a time: select its rows, then sum each column.
    mask = y == label
    rows = X[mask]
    per_feature = rows.sum(axis=0)
    print("y == label:", mask)
    print("X[y == label]\n", rows)
    print("sum:", per_feature)
    counts[label] = per_feature

print("Feature counts:\n", counts)

输出:

y == label: [ True False  True False]

X[y == label]

 [[0 1 0 1]

 [0 0 0 1]]

sum: [0 1 0 2]

y == label: [False  True False  True]

X[y == label]

 [[1 0 1 1]

 [1 0 1 0]]

sum: [2 0 2 1]

Feature counts:

 {0: array([0, 1, 0, 2]), 1: array([2, 0, 2, 1])}



#决策树

# 决策树

from sklearn.tree import DecisionTreeClassifier

from sklearn.datasets import load_breast_cancer

from sklearn.model_selection import train_test_split


# Decision tree without a depth limit on the breast cancer data.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)

tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)

train_acc = tree.score(X_train, y_train)
test_acc = tree.score(X_test, y_test)
print("Accuracy on training set: {:.3f}".format(train_acc))
print("Accuracy on test set: {:.3f}".format(test_acc))

输出:

Accuracy on training set: 1.000

Accuracy on test set: 0.937



#深度设置为4

# Same data as the preceding snippet, with the tree depth capped at 4.
tree = DecisionTreeClassifier(max_depth=4, random_state=0)
tree.fit(X_train, y_train)

train_acc = tree.score(X_train, y_train)
test_acc = tree.score(X_test, y_test)
print("Accuracy on training set: {:.3f}".format(train_acc))
print("Accuracy on test set: {:.3f}".format(test_acc))

输出:

Accuracy on training set: 0.988

Accuracy on test set: 0.951


##可视化决策树

from sklearn.tree import export_graphviz

# Export the fitted tree to Graphviz DOT format, print the DOT text, and
# render it to an image file.
export_graphviz(tree, out_file="tree.dot", class_names=["malignant", "benign"],
                feature_names=cancer.feature_names, impurity=False, filled=True)

import graphviz

with open("tree.dot") as f:
    dot_graph = f.read()

source = graphviz.Source(dot_graph)
# Printing a Source emits only the DOT text. Outside a notebook the picture
# has to be rendered explicitly, which needs the Graphviz executables.
print(source)
try:
    png_path = source.render("tree", format="png", cleanup=True)
    print("Rendered decision tree to", png_path)
except graphviz.ExecutableNotFound:
    # Keep the snippet usable when only the Python package is installed.
    print("Graphviz executables not found on PATH; tree image was not rendered")

决策树可视化输出没有做出来!~



#树的特征重要性
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Feature importances of a depth-4 decision tree on the breast cancer data.
cancer = load_breast_cancer()

X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)

tree = DecisionTreeClassifier(max_depth=4, random_state=0)
tree.fit(X_train, y_train)

importances = tree.feature_importances_
print("Feature importances:")
print(importances)
输出:
Feature importances:
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.01019737 0.04839825
 0.         0.         0.0024156  0.         0.         0.
 0.         0.         0.72682851 0.0458159  0.         0.
 0.0141577  0.         0.018188   0.1221132  0.01188548 0.        ]



#特征重要性输出
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

# Refit the depth-4 tree whose importances are printed here.
cancer = load_breast_cancer()

X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)

tree = DecisionTreeClassifier(max_depth=4, random_state=0)
tree.fit(X_train, y_train)

importances = tree.feature_importances_
print("Feature importances:")
print(importances)


def plot_feature_importances_cancer(model):
    """Draw a horizontal bar chart of ``model.feature_importances_``.

    Reads the module-level ``cancer`` data set for the number of features
    and their names, and draws through ``plt``. It does not call
    ``plt.show()``.
    """
    feature_count = cancer.data.shape[1]
    positions = np.arange(feature_count)

    plt.barh(positions, model.feature_importances_, align='center')
    # Label each bar with the matching feature name.
    plt.yticks(positions, cancer.feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    plt.ylim(-1, feature_count)

# Plot the importances of the tree fitted above and show the figure.
plot_feature_importances_cancer(tree)
plt.show()

# NOTE: this rebinds `tree` (until now the fitted classifier) to whatever the
# mglearn helper returns; that value is printed below.
tree = mglearn.plots.plot_tree_not_monotone()

print(tree)

输出:

Feature importances: [0. 1.]

digraph Tree {

node [shape=box, style="filled", color="black"] ;

0 [label="X[1] <= -5.814\nsamples = 100\nvalue = [50, 50]", fillcolor="#ffffff"] ;

1 [label="samples = 25\nvalue = [25, 0]", fillcolor="#e58139"] ;

0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;

2 [label="X[1] <= 5.348\nsamples = 75\nvalue = [25, 50]", fillcolor="#9ccef2"] ;

0 -> 2 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;

3 [label="samples = 50\nvalue = [0, 50]", fillcolor="#399de5"] ;

2 -> 3 ;

4 [label="samples = 25\nvalue = [25, 0]", fillcolor="#e58139"] ;

2 -> 4 ;

}



#
import pandas as pd
import os
import mglearn
import matplotlib.pyplot as plt

# Load the RAM price table shipped with mglearn and plot price against date.
csv_path = os.path.join(mglearn.datasets.DATA_PATH, "ram_price.csv")
ram_prices = pd.read_csv(csv_path)

# Logarithmic y-axis.
plt.semilogy(ram_prices.date, ram_prices.price)
plt.xlabel("Year")
plt.ylabel("Price in $/Mbyte")
plt.show()

import os

import pandas as pd

import mglearn

import matplotlib.pyplot as plt

import numpy as np

from sklearn.linear_model import LinearRegression

from sklearn.tree import DecisionTreeRegressor


# Forecast RAM prices from the date alone with a decision tree regressor and
# with linear regression, training only on rows dated before 2000.
ram_prices = pd.read_csv(os.path.join(mglearn.datasets.DATA_PATH, "ram_price.csv"))
print(ram_prices)

# Use historical data to forecast prices after the year 2000.
data_train = ram_prices[ram_prices.date < 2000]
data_test = ram_prices[ram_prices.date >= 2000]

# Predict prices based on date. Convert the Series to a NumPy array before
# adding the column axis: pandas 2.x no longer accepts multi-dimensional
# indexing such as series[:, np.newaxis].
X_train = data_train.date.to_numpy()[:, np.newaxis]
# A log-transform gives a simpler relationship between date and target.
y_train = np.log(data_train.price)

tree = DecisionTreeRegressor(max_depth=3).fit(X_train, y_train)
linear_reg = LinearRegression().fit(X_train, y_train)

# Predict on all data, training and test dates alike.
X_all = ram_prices.date.to_numpy()[:, np.newaxis]

pred_tree = tree.predict(X_all)
pred_lr = linear_reg.predict(X_all)

# Undo the log-transform.
price_tree = np.exp(pred_tree)
price_lr = np.exp(pred_lr)

plt.semilogy(data_train.date, data_train.price, label="Training data")
plt.semilogy(data_test.date, data_test.price, label="Test data")
plt.semilogy(ram_prices.date, price_tree, label="Tree prediction")
plt.semilogy(ram_prices.date, price_lr, label="Linear prediction")
plt.legend()

plt.show()

numpy.newaxis使用

import numpy as np

# np.newaxis inserts a length-1 axis at the position where it appears in
# the index expression.
x = np.arange(3)

x_views = [
    ("x[np.newaxis]\n", x[np.newaxis]),
    ("x[np.newaxis,np.newaxis]\n", x[np.newaxis, np.newaxis]),
    ("x[:, np.newaxis]\n", x[:, np.newaxis]),
    ("x[:, np.newaxis, np.newaxis]\n", x[:, np.newaxis, np.newaxis]),
]
for caption, view in x_views:
    print(caption, view)

print("*" * 10)

y = np.arange(12).reshape(4, 3)
print("y:\n", y)

y_views = [
    ("y[:, 1]\n", y[:, 1]),
    ("y[:, 1, np.newaxis]\n", y[:, 1, np.newaxis]),
    ("y[1, :, np.newaxis]\n", y[1, :, np.newaxis]),
]
for caption, view in y_views:
    print(caption, view)


输出:
x[np.newaxis]
 [[0 1 2]]
x[np.newaxis,np.newaxis]
 [[[0 1 2]]]
x[:, np.newaxis]
 [[0]
 [1]
 [2]]
x[:, np.newaxis, np.newaxis]
 [[[0]]

 [[1]]

 [[2]]]
**********
y:
 [[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]
y[:, 1]
 [ 1  4  7 10]
y[:, 1, np.newaxis]
 [[ 1]
 [ 4]
 [ 7]
 [10]]
y[1, :, np.newaxis]
 [[3]
 [4]
 [5]]

enumerate函数:(输出索引值)
# enumerate pairs every item with its index; start= sets the first index.
seasons = ['Spring', 'Summer', 'Fall', 'Winter']
zero_based = list(enumerate(seasons))
one_based = list(enumerate(seasons, start=1))  # indices begin at 1
print(zero_based)
print(one_based)
输出:
[(0, 'Spring'), (1, 'Summer'), (2, 'Fall'), (3, 'Winter')]
[(1, 'Spring'), (2, 'Summer'), (3, 'Fall'), (4, 'Winter')]


zip的用法:
# zip combines the i-th items of its inputs into tuples.
a1=[1,2,3]
a2=[4,5,6]
a3=[7,8,9]
a4=["a","b","c","d"]  # defined but not passed to zip below
zip1=zip(a1,a2,a3)
print(zip1)  # prints the zip object itself, not its contents
for i in zip1:
    print(i)

输出:
<zip object at 0x000001D1CA798600>
(1, 4, 7)
(2, 5, 8)
(3, 6, 9)

# ravel()用法  将数组维度拉成一维数组
import numpy as np #导入numpy模块
 
grids = np.mgrid[1:4:1, 2:3:1]  # build the arrays a and b with mgrid
a, b = grids
print("a:", a, "\n", "b:", b)  # show a and b

Aftera = np.ravel(a)  # flatten a into a one-dimensional array
print("Aftera:", Aftera)  # show the flattened a
输出:
a: [[1]
 [2]
 [3]] 
 b: [[2]
 [2]
 [2]]
Aftera: [1 2 3]



#随机森林
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import mglearn

# Random forest of five trees on the two-moons data: one panel per tree,
# plus a panel for the forest as a whole.
X, y = make_moons(n_samples=100, noise=0.25, random_state=3)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=42)
# Five trees.
forest = RandomForestClassifier(n_estimators=5, random_state=2)
forest.fit(X_train, y_train)

fig, axes = plt.subplots(2, 3, figsize=(20, 10))
for i, (ax, tree) in enumerate(zip(axes.ravel(), forest.estimators_)):
    ax.set_title("Tree {}".format(i))
    mglearn.plots.plot_tree_partition(X_train, y_train, tree, ax=ax)
    # Report every (index, axes, estimator) triple. Printing after the loop
    # would show only the last one, while the recorded output lists all five.
    print(i, ax, tree)

# The last panel shows the decision boundary of the whole forest.
mglearn.plots.plot_2d_separator(forest,
                                X_train,
                                fill=True,
                                ax=axes[-1, -1],
                                alpha=.4)
axes[-1, -1].set_title("Random Forest")
# No ax is passed here; presumably this draws on pyplot's current axes.
# TODO confirm that this is the last panel.
mglearn.discrete_scatter(X_train[:, 0], X_train[:, 1], y_train)
plt.show()
输出:
0 AxesSubplot(0.125,0.53;0.227941x0.35) DecisionTreeClassifier(max_features='auto', random_state=1872583848)
1 AxesSubplot(0.398529,0.53;0.227941x0.35) DecisionTreeClassifier(max_features='auto', random_state=794921487)
2 AxesSubplot(0.672059,0.53;0.227941x0.35) DecisionTreeClassifier(max_features='auto', random_state=111352301)
3 AxesSubplot(0.125,0.11;0.227941x0.35) DecisionTreeClassifier(max_features='auto', random_state=1853453896)
4 AxesSubplot(0.398529,0.11;0.227941x0.35) DecisionTreeClassifier(max_features='auto', random_state=213298710)


enumerate( )函数说明

1.enumerate()是python的内置函数

2.enumerate在字典上是枚举、列举的意思

# enumerate on a list, with the default start index of 0.
list1 = ['Lady', 'Man', 'Girl', 'Boy']
print(list(enumerate(list1)))

# Same list, counting from 1.
list1 = ['Lady', 'Man', 'Girl', 'Boy']
print(list(enumerate(list1, 1)))

# String: enumerates the characters.
a = 'abcde'
print(list(enumerate(a)))

# Tuple.
tup = ('A', 'B', 'C', 'D')
print(list(enumerate(tup, 1)))

# Dictionary: iterates over the keys, not the values. The variable used to be
# called `dict`, which shadowed the builtin for the rest of the module.
letter_values = {'A': 2, 'B': 4, 'C': 8}
print(list(enumerate(letter_values, 1)))

# Plain for loop with a hand-maintained counter, kept for contrast with the
# enumerate version below.
i = 0
seq = ['one', 'two', 'three']
for element in seq:
    print(i, seq[i])
    i += 1
print("*" * 20, "\n")

# The same loop written with enumerate().
seq = ['one', 'two', 'three']
for i, element in enumerate(seq):
    print(i, element)
输出:
[(0, 'Lady'), (1, 'Man'), (2, 'Girl'), (3, 'Boy')]
[(1, 'Lady'), (2, 'Man'), (3, 'Girl'), (4, 'Boy')]
[(0, 'a'), (1, 'b'), (2, 'c'), (3, 'd'), (4, 'e')]
[(1, 'A'), (2, 'B'), (3, 'C'), (4, 'D')]
[(1, 'A'), (2, 'B'), (3, 'C')]
0 one
1 two
2 three
******************** 

0 one
1 two
2 three



sklearn中的make_blobs的用法
# Example call: three centres, with one cluster_std entry per centre (0.8, 2 and 5).
data, label = make_blobs(n_features=2, n_samples=100, centers=3, random_state=3, cluster_std=[0.8, 2, 5])
  • n_features表示每一个样本有多少特征值(默认2个)
  • n_samples表示样本的个数,也即样本总数(默认100个)
  • centers是聚类中心点的个数,可以理解为label的种类数(默认3个)
  • random_state是随机种子,可以固定生成的数据 (default=None)
  • cluster_std设置每个类别的标准差(默认1.0)



#梯度提升回归树
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting on the breast cancer data: default settings first,
# then a learning rate of 0.01.
cancer = load_breast_cancer()

X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)

gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train, y_train)
train_acc = gbrt.score(X_train, y_train)
test_acc = gbrt.score(X_test, y_test)
print("Accuracy on training set: {:.3f}".format(train_acc))
print("Accuracy on test set: {:.3f}".format(test_acc))

gbrt = GradientBoostingClassifier(random_state=0, learning_rate=0.01)
gbrt.fit(X_train, y_train)
train_acc = gbrt.score(X_train, y_train)
test_acc = gbrt.score(X_test, y_test)
print("Accuracy on training set: {:.3f}".format(train_acc))
print("Accuracy on test set: {:.3f}".format(test_acc))




#核支持向量机
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
import mglearn

# Four blobs folded into two classes by taking the label modulo 2.
X, y = make_blobs(centers=4, random_state=8)
y %= 2

print(X.shape, y.shape)

feature_0, feature_1 = X[:, 0], X[:, 1]
mglearn.discrete_scatter(feature_0, feature_1, y)

plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.show()


from sklearn.svm import LinearSVC
from sklearn.datasets import make_blobs
import mglearn
import matplotlib.pyplot as plt

# A linear SVM on the two-class, four-blob data, with its decision boundary.
X, y = make_blobs(centers=4, random_state=8)
y = y % 2

linear_svm = LinearSVC().fit(X, y)

mglearn.plots.plot_2d_separator(linear_svm, X)
# Scatter the points once. The snippet used to make the same call a second
# time before fitting, drawing every point twice.
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.show()

np.hstack将参数元组的元素数组按水平方向进行叠加
import numpy as np
 
# Join two 2-D arrays side by side (column-wise).
arr1 = np.array([[1, 3], [2, 4]])
arr2 = np.array([[1, 4], [2, 6]])
res = np.concatenate([arr1, arr2], axis=1)

for array in (arr1, arr2, res):
    print(array)
输出:
[[1 3]
 [2 4]]
[[1 4]
 [2 6]]
[[1 3 1 4]
 [2 4 2 6]]








评论 (0 个评论)

facelist doodle 涂鸦板

您需要登录后才可以评论 登录 | 立即注册

Copyright ©2011-2014 bbs.06climate.com All Rights Reserved.  Powered by Discuz! (京ICP-10201084)

本站信息均由会员发表,不代表气象家园立场,禁止在本站发表与国家法律相抵触言论

返回顶部