数据挖掘竞赛预测模型——五折交叉验证

数据挖掘竞赛预测模型——五折交叉验证 2023-07-12 1140

使用catboost进行五折交叉验证

import numpy as np
import pandas as pd
import catboost as cbt
from sklearn.metric import f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold


train_data = pd.read_csv(.....)
train_label = pd.read_csv(.....)
test_data = pd.read_csv(.....)


cat_list = [] #catboost中需要处理的离散特征属性列
oof = np.zeros(train_data.shape[0])  #训练集长度
prediction = np.zeros(test_data.shape[0])     #测试集长度
seeds = [2017,2018,2019,2020]     #随机种子
num_model_seed = 1


train_x, test_x, train_y, test_y = train_test_split(train_data, train_label, test_size=0.2, random_state = 2019)     #拆分训练集
for model_seed in range(num_model_seed):     #选用几个随机种子
    oof_cat = np.zeros(train_data.shape[0])
    prediction_cat = np.zeros(test_data.shape[0])
    skf = StratifiedKFold(n_splits=5, random_state=seeds[model_seed], shuffle=True) #五折交叉验证，shuffle表示是否打乱数据，若为True再设定随机种子random_state
    for index, (train_index, test_index) in enumerate(skf.split(train_data, train_label)): #将数据五折分割
        train_x, test_x, train_y, test_y = train_data.iloc[train_index], test_data.iloc[test_index], train_label.iloc[train_index], train_label.iloc[test_index]
        cbt_model = cbt.CatBoostClassifier(iterations=5000, learning_rate=0.1, max_depth=7, verbose=100, early_stopping_rounds=500,task_type=GPU, eval_metric=F1,cat_features=cat_list)     #设置模型参数，verbose表示每100个训练输出打印一次
        cbt_model.fit(train_x, train_y, eval_set=(test_x, test_y)) #训练五折分割后的训练集
        gc.collect() #垃圾清理，内存清理
        oof_cat[test_index] += cbt_model.predict_proba(test_x)[:,1] #
        prediction_cat += cbt_model.predict_proba(test_data)[:,1]/5
    print(F1, f1_score(train_label, np.round(oof_cat)))
    oof += oof_cat / num_model_seed     #五折训练集取均值
    prediction += prediction_cat / num_model_seed #测试集取均值
print(score, f1_score(train_label, np.round(oof)))

##结果写入csv文件
submit = test_data[id]
submit[label] = (prediction>=0.499).astype(int)
print(submit[label].value_counts())
submit.to_csv(submission.csv,index=False)

免费搭建微信查券返利机器人来轻松赚佣金

文章来自:IT技术分享网
分享地址:http://www.5ityx.cn/cate100/336906.html

上一篇： .gitignore 文件不生效问题 & 解决方法

下一篇： powermockito测试私有方法_PowerMock单元测试踩坑与总结

数据挖掘竞赛预测模型——五折交叉验证

数据挖掘竞赛预测模型——五折交叉验证 相关内容

聚合标签

数据挖掘竞赛预测模型——五折交叉验证相关内容