0%

网格搜索与交叉验证

Hey

Machine Learning notes

网格搜索与交叉验证(网格搜索必须要在main下定义)

k折交叉验证调节超参数

适用于样本数量较小的数据样本(样本数以万计以内)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer, accuracy_score
import pandas as pd

# Load the breast-cancer dataset and hold out 20% of it as a test set.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer['data'], cancer['target'], test_size=0.2, random_state=0)

# Baseline: a single decision tree with default hyper-parameters.
tree_clf = DecisionTreeClassifier(random_state=0)
tree_clf.fit(X_train, y_train)
print(tree_clf.score(X_test, y_test))

# Hyper-parameter grid to explore (5 depths x 3 leaf sizes = 15 candidates).
parameters = {'max_depth': range(1, 6), 'min_samples_leaf': [1, 2, 3]}
# Build a scorer from plain classification accuracy.
scoring_fnc = make_scorer(accuracy_score)
# 10-fold cross-validation splitter used by the search.
kfold = KFold(n_splits=10)

# Grid search: exhaustively evaluate every candidate with cross-validation.
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(tree_clf, parameters, scoring_fnc, cv=kfold)
grid = grid.fit(X_train, y_train)
best_model = grid.best_estimator_

print("best score:", grid.best_score_)
print('best parameters: ', grid.best_params_)
print(best_model.score(X_test, y_test))

# Show the full cross-validation results, one column per candidate.
print(pd.DataFrame(grid.cv_results_).T)

注意

k折交叉验证法是将训练集分为k组,每次用其中1组作为验证集,剩下的k-1组作为训练集,从而可以训练并评估k个模型

交叉验证用于调节超参数,能最大限度地利用训练集来寻找最优的超参数

过拟合:训练集准确率高,测试集准确率较低

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# Cross-validation with sklearn for a single hyper-parameter,
# using cross_val_score().
#
# Signature (shown for reference only -- calling it with these bare names
# would raise NameError):
#   cross_val_score(estimator, X, y=None, cv=None)
#     estimator -- the model to evaluate
#     X         -- training-set features
#     y         -- training-set target labels
#     cv        -- number of cross-validation folds
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np  # needed for np.mean below; missing in the original snippet

k_range = [2, 4, 5, 10]  # candidate values for k (n_neighbors)
cv_scores = []           # mean accuracy obtained with each candidate k

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    # ***** the line below performs the cross-validation *****
    # 3-fold CV: returns one accuracy score per fold (3 values).
    # NOTE(review): X_train_scaled / y_train are assumed to be defined
    # earlier in the notebook -- confirm before running.
    scores = cross_val_score(knn, X_train_scaled, y_train, cv=3)

    cv_score = np.mean(scores)  # average the fold accuracies for this k
    print('k={},验证集上的准确率={:.3f}'.format(k, cv_score))
    cv_scores.append(cv_score)

# Validation curve: score a model over a range of one hyper-parameter.
#
# Signature (shown for reference only):
#   sklearn.model_selection.validation_curve(estimator, X, y, param_name,
#                                            param_range, cv=None, scoring=None)
#     estimator   -- the model to evaluate
#     X           -- training-set features
#     y           -- training-set target values
#     param_name  -- name of the hyper-parameter being varied
#     param_range -- the values the hyper-parameter takes
#     cv          -- number of cross-validation folds
#     scoring     -- model-evaluation metric to use
from sklearn.model_selection import validation_curve
from sklearn.svm import SVC  # missing in the original snippet

# Scores of a linear SVM for every C in c_range, on both the training folds
# and the validation folds (5-fold CV, accuracy metric).
# NOTE(review): X_train_scaled / y_train / c_range are assumed to be defined
# earlier in the notebook -- confirm before running.
train_scores, test_scores = validation_curve(
    SVC(kernel='linear'), X_train_scaled, y_train,
    param_name='C', param_range=c_range, cv=5, scoring='accuracy')

随机搜索

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor  # missing in the original snippet
from scipy.stats import randint
import numpy as np  # needed for np.sqrt below; missing in the original snippet

# Randomized search: sample hyper-parameters from distributions instead of
# enumerating a fixed grid.
param_distribs = {
    'n_estimators': randint(low=1, high=200),
    'max_features': randint(low=1, high=8),
}

forest_reg = RandomForestRegressor(random_state=42)
# 10 random draws, each evaluated with 5-fold CV on negated MSE.
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5,
                                scoring='neg_mean_squared_error',
                                random_state=42)
# NOTE(review): housing_prepared / housing_labels come from earlier in the
# notebook (the housing example) -- confirm before running.
rnd_search.fit(housing_prepared, housing_labels)

# Report the RMSE (sqrt of the negated MSE) for every sampled combination.
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)