# -*- coding: utf-8 -*-
"""
Created on Sun Jul 30 14:59:06 2017
@author: zdx
"""
'''
Install notes: how does my source-built xgboost differ from the pip-installed one?
1. build from source: cd into the xgboost directory and run `python setup.py install`
2. earlier I downloaded a small (tens of KB) wheel from the web:
   xgboost-0.6-cp36-cp36m-win_amd64.whl
My current version is xgboost 0.6, not 2.0; the version difference matters a lot.

Data: the Mushroom dataset from the UCI machine learning repository
(the demo data shipped with the XGBoost package).
Task: decide whether a mushroom is poisonous from its 22 attributes.
Total number of samples: 8124.
In the demo the 22 raw features have been preprocessed into 126 binary features.
'''
import xgboost as xgb
from sklearn.metrics import accuracy_score
# Read data from the demo directory under the XGBoost install path
my_workpath = 'C:/Users/zdx/xgboost/demo/data/'
dtrain = xgb.DMatrix(my_workpath + 'agaricus.txt.train')
dtest = xgb.DMatrix(my_workpath + 'agaricus.txt.test')
print(dtrain.num_col())   # number of features
print(dtrain.num_row())   # number of training samples
print(dtest.num_row())    # number of test samples
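# Aside (a minimal sketch, not part of the original demo): besides libsvm
# files, DMatrix can also wrap in-memory data such as numpy arrays or scipy
# sparse matrices. The _demo names below are illustrative only.
import numpy as np
_X_demo = np.random.rand(5, 126)                # 5 fake samples, 126 features
_y_demo = np.random.randint(2, size=5)          # fake binary labels
_dm_demo = xgb.DMatrix(_X_demo, label=_y_demo)  # same DMatrix type as above
print(_dm_demo.num_row(), _dm_demo.num_col())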
# Set the training parameters
param = {'max_depth':2, 'eta':1, 'silent':0, 'objective':'binary:logistic' }
'''
max_depth: maximum depth of a tree. Default 6, range [1, inf).
eta: shrinkage step used in updates, to prevent overfitting; eta shrinks the
  feature weights to make the boosting process more conservative.
  Default 0.3, range [0, 1].
silent: 0 prints runtime messages, 1 runs silently without printing them.
  Default 0.
objective: the learning task and the corresponding objective;
  'binary:logistic' means binary logistic regression with probability output.

Model training
'''
# Number of boosting rounds
num_round = 2
import time
starttime = time.perf_counter()  # time.clock() was removed in Python 3.8
bst = xgb.train(param, dtrain, num_round)
endtime = time.perf_counter()
print(endtime - starttime)
'''
Prediction (evaluated on the training data)
Once trained, the model can be used to make predictions.
XGBoost outputs a probability (the probability that the sample belongs to the
positive class), which is then converted to a 0/1 label.
'''
train_preds = bst.predict(dtrain)
train_predictions = [round(value) for value in train_preds]
y_train = dtrain.get_label()
train_accuracy = accuracy_score(y_train, train_predictions)
print("Train Accuracy: %.2f%%" % (train_accuracy * 100.0))
'''
Extra: model visualization
Visualize a single tree in the model with the XGBoost APIs
plot_tree() / to_graphviz().
'''
from xgboost import plot_tree
from matplotlib import pyplot
plot_tree(bst, num_trees=0, rankdir='TB')  # plot the first tree, top to bottom
pyplot.show()
# The second argument num_trees is the index of the tree to plot (0-based).
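# The docstring above also mentions to_graphviz(); a minimal sketch of its
# use (assumes the graphviz Python package is installed):
from xgboost import to_graphviz
graph = to_graphviz(bst, num_trees=0)  # graphviz source for the first tree
# graph.render('tree0')  # hypothetical output name; would write tree0.pdf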
"""
---------------------------------------------------------------
0---------------------------------------------------------------
与scikit-learn结合
XGBoost提供一个wrapper
和scikit-learn框架中其他分类器或回归器一样
"""
from xgboost import XGBClassifier
from sklearn.datasets import load_svmlight_file
from sklearn.metrics import accuracy_score
from matplotlib import pyplot
# Read data
# libsvm format (sparse features): each line is one sample, starting with the
# sample label, followed by feature index, colon ':', then the feature value.
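# An illustrative, made-up line in this format (a positive sample whose
# binary features 3, 10 and 21 are active):
#     1 3:1 10:1 21:1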
X_train, y_train = load_svmlight_file(my_workpath + 'agaricus.txt.train')
X_test, y_test = load_svmlight_file(my_workpath + 'agaricus.txt.test')
print(X_train.shape)
print(X_test.shape)
# Number of boosting rounds
num_round = 2
bst = xgb.XGBClassifier(max_depth=2, learning_rate=0.1, n_estimators=num_round,
                        silent=True, objective='binary:logistic')
# Setup Booster-style parameters for xgboost. (Note: this dict is defined here
# but never passed anywhere below; the sklearn wrapper above is what gets used.)
param = {}
param['booster'] = 'gbtree'
param['objective'] = 'binary:logistic'
param['eval_metric'] = 'error'
param['eta'] = 0.3
param['gamma'] = 0
param['max_depth'] = 6
param['min_child_weight'] = 1
param['max_delta_step'] = 0
param['subsample'] = 1
param['colsample_bytree'] = 1
param['silent'] = 1
param['seed'] = 0
param['base_score'] = 0.5
"""
测试结果-----------------#校验集 ----------->>>>>>>>>>>>>>>>>>>>>>>>
"""
# Fit the classifier for the chosen number of boosting rounds
bst.fit(X_train, y_train)
train_preds = bst.predict(X_train)
train_predictions = [round(value) for value in train_preds]
train_accuracy = accuracy_score(y_train, train_predictions)
print("Train Accuracy: %.2f%%" % (train_accuracy * 100.0))
# make prediction
preds = bst.predict(X_test)
predictions = [round(value) for value in preds]
test_accuracy = accuracy_score(y_test, predictions)
print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
'''
XGBoost quick start with scikit-learn: train/validation split
'''
# In practice the test labels are unknown, so how should the model be
# evaluated? Hold out part of the training data as a validation set.
from sklearn.model_selection import train_test_split
# Split the data into train and validation sets: 1/3 of the training data is
# held out for validation
seed = 7          # fixed random seed so every run produces the same split
test_size = 0.33  # fraction of the training data held out for validation
my_workpath = 'C:/Users/zdx/xgboost/demo/data/'
X_train, y_train = load_svmlight_file(my_workpath + 'agaricus.txt.train')
X_test, y_test = load_svmlight_file(my_workpath + 'agaricus.txt.test')
print(X_train.shape)
X_train_part, X_validate, y_train_part, y_validate = train_test_split(
    X_train, y_train, test_size=test_size, random_state=seed)
print(X_train_part.shape)
# Number of boosting rounds
num_round = 2
bst = XGBClassifier(max_depth=2, learning_rate=1, n_estimators=num_round,
                    silent=True, objective='binary:logistic')
bst.fit(X_train_part, y_train_part)
validate_preds = bst.predict(X_validate)
validate_predictions = [round(value) for value in validate_preds]
validate_accuracy = accuracy_score(y_validate, validate_predictions)
print("Validation Accuracy: %.2f%%" % (validate_accuracy * 100.0))
"""
----------------------->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
学习曲线:模型预测性能随某个变化的学习参数
(如训练样本数目、迭代次数)变化的情况
– 例:XGBoost的迭代次数(树的数目)
#设置boosting 迭代计算次数
"""
num_round = 100
bst = xgb.XGBClassifier(max_depth=2, learning_rate=0.1, n_estimators=num_round,
                        silent=True, objective='binary:logistic')
# Evaluation sets monitored during training: (train part, validation part)
eval_set = [(X_train_part, y_train_part), (X_validate, y_validate)]
bst.fit(X_train_part, y_train_part, eval_metric=["error", "logloss"],
        eval_set=eval_set, verbose=True)
"""
--------查看模型在训练集上的分类性能
XGBoost 预测的输出是概率,这里的分类是一个而分类问题,
"""
"""
模型每次校验集上的性能存在模型中,可用来进一步分析 model.evals result() 返回一个字典
:评估数据寄和分数
显示学习曲线
"""
# Retrieve performance metrics
results = bst.evals_result()
#print(results)
epochs = len(results['validation_0']['error'])
x_axis = range(epochs)
# Plot log loss: validation_0 is the training part, validation_1 the validation part
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['logloss'], label='Train')
ax.plot(x_axis, results['validation_1']['logloss'], label='Validation')
ax.legend()
pyplot.ylabel("Log Loss")
pyplot.title("XGBoost Log Loss")
pyplot.show()
# Plot classification error
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['error'], label='Train')
ax.plot(x_axis, results['validation_1']['error'], label='Validation')
ax.legend()
pyplot.ylabel("Classification Error")
pyplot.title("XGBoost Classification Error")
pyplot.show()
# make prediction
preds = bst.predict(X_test)
predictions = [round(value) for value in preds]
test_accuracy = accuracy_score(y_test, predictions)
print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
"""
___________new way of sutitle_______________________>>>>>>>>>>>>>>>>>>>>>>
Early stop: 一种防止训练复杂模型过拟合的方法
– 监控模型在校验集上的性能:如果在经过固定次数的迭代,校验集上的性能
不再提高时,结束训练过程
– 当在测试集上的训练下降而在训练集上的性能还提高时,发生了过拟合
•使用准则 val_metric="error" 查看错误率
"""
# Run the early-stopping example from the xgboost package demos
import xgboost as xgb
from xgboost import XGBClassifier
# Module for loading LibSVM-format data
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from matplotlib import pyplot
# Stop if performance does not improve over the next 10 rounds
seed = 7
test_size = 0.33
X_train_part, X_validate, y_train_part, y_validate = train_test_split(
    X_train, y_train, test_size=test_size, random_state=seed)
print(X_train_part.shape)
print(X_validate.shape)
"""训练参数设置"
max_depth:树的最大深度; eta 为了防止过拟合,更新过程中用到的收缩步长
objective 定义学习任务以及相应的学习目标
binary:logistic 表示而分类的逻辑回归问题
"""
param = {'max_depth':2, 'eta':1, 'silent':0, 'objective':'binary:logistic' }
num_round = 100 # 设置boosting迭代计算次数
param = {'max_depth':2,'eta':1,'slient':0,'objective':'binary:logistic'}
bst = XGBClassifier(max_depth=2, learning_rate=0.1, n_estimators=num_round,
                    silent=True, objective='binary:logistic')
eval_set = [(X_validate, y_validate)]
bst.fit(X_train_part, y_train_part, early_stopping_rounds=10,
        eval_metric="error", eval_set=eval_set, verbose=True)
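# After fitting with early stopping, the classic xgboost sklearn API exposes
# the best round found (a minimal sketch; attributes assume that API):
print(bst.best_score)      # best validation error reached
print(bst.best_iteration)  # round at which it was reached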
# Retrieve performance metrics and plot the learning curve
results = bst.evals_result()
#print(results)
epochs = len(results['validation_0']['error'])
x_axis = range(epochs)
# Plot classification error on the validation set
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['error'], label='Validation')
ax.legend()
pyplot.ylabel("Classification Error")
pyplot.xlabel("Round")
pyplot.title("XGBoost early stopping")
pyplot.show()
# Evaluate on the test set: make predictions
preds = bst.predict(X_test)
predictions = [round(value) for value in preds]
test_accuracy = accuracy_score(y_test, predictions)
print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
"""
--------------------------->>>>>>>>>>>>>>>>>>----------->>>>>>>>>>>>>>
k-折交叉验证:将训练数据等分成k份(k通常的取值为3、 5或10)
– 重复k次
• 每次留出一份做校验,其余k-1份做训练
– k次校验集上的平均性能视为模型在测试集上性能的估计
• 该估计比train_test_split得到的估计方差更小
如果每类样本不均衡或类别数较多,采用StratifiedKFold, 将数据集中每一类样本
的数据等分
"""
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score  # evaluate a single model with given parameters
# Number of boosting rounds
num_round = 2
bst = XGBClassifier(max_depth=2, learning_rate=0.1, n_estimators=num_round,
                    silent=True, objective='binary:logistic')
# Cross-validation: stratification protects against class imbalance
kfold = StratifiedKFold(n_splits=10, random_state=7)
results = cross_val_score(bst, X_train, y_train, cv=kfold)
print("CV Accuracy: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))
"""
_____----------------->>>>>>>>>>>>>-----------------------------
---------------------参 数 调 整 -----------选择
参数调优GridSearchCV :我们可以根据交叉验证评估的结
果,选择最佳参数的模型
– 输入待调节参数的范围(grid),对一组参数对应的模型进行评估,
并给出最佳模型及其参数
-----------------网格搜索 --------
"""
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed in newer sklearn
# Search range for the number of boosting rounds
param_test = {
    # number of weak learners (trees) and its search range
    'n_estimators': list(range(1, 51, 1))
}
# Arguments: the model, the parameter grid to evaluate, the scoring metric
# (accuracy), and the number of cross-validation folds
clf = GridSearchCV(estimator=bst, param_grid=param_test, scoring='accuracy', cv=5)
clf.fit(X_train, y_train)
# Best parameters and best cross-validated score. (The old grid_scores_
# attribute listed one entry per candidate, 50 here; in the modern
# sklearn.model_selection API the per-candidate details are in clf.cv_results_.)
print(clf.best_params_, clf.best_score_)
"""
测试 make prediction
"""
preds = clf.predict(X_test)
predictions = [round(value) for value in preds]
test_accuracy = accuracy_score(y_test, predictions)
print("Test Accuracy of gridsearchcv: %.2f%%" % (test_accuracy * 100.0))