当前位置: 首页 > news >正文

6 回归集成:xgb、lgb、cat

 以下代码拷贝自 Kaggle(Rohlik Orders Forecasting Challenge 比赛),主要演示以下几点:

  1. 如何使用三个树模型模块化训练;
  2. 文本特征如何做,如何挖掘;
  3. 时间特征的处理;
  4. 模型权重集成;
import pandas as pd
import math
import numpy as np
import joblib
import optuna
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
# NOTE(review): the star imports are kept (in their original order) because
# later code relies on names they provide, e.g. LabelEncoder, GroupKFold and
# mean_absolute_percentage_error.
from sklearn.preprocessing import *
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
import datetime
import gc
from sklearn.base import clone
import warnings

# Show every column when a DataFrame is printed, and silence library warnings.
pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

# Competition inputs: the submission template, the test set and the train set.
d_s = pd.read_csv('/kaggle/input/rohlik-orders-forecasting-challenge/solution_example.csv')
te_d = pd.read_csv('/kaggle/input/rohlik-orders-forecasting-challenge/test.csv')
tr_d = pd.read_csv('/kaggle/input/rohlik-orders-forecasting-challenge/train.csv')

# The 'id' column is not used as a model feature.
tr_d.drop('id', axis=1, inplace=True)
te_d.drop('id', axis=1, inplace=True)

# Replace missing holiday names with the literal string 'None'. The result is
# assigned back to the column because calling fillna(inplace=True) on a
# selected column does not update the parent frame once pandas copy-on-write
# is active.
tr_d['holiday_name'] = tr_d['holiday_name'].fillna('None')
# Replace missing holiday names with the literal string 'None'. The result is
# assigned back to the column because calling fillna(inplace=True) on a
# selected column does not update the parent frame once pandas copy-on-write
# is active.
te_d['holiday_name'] = te_d['holiday_name'].fillna('None')


def Process_Date(Df):
    """Add calendar-derived feature columns to ``Df`` and return it.

    The frame is modified in place (new columns are assigned on ``Df``) and
    the same object is returned.

    Args:
        Df: DataFrame with a ``date`` column parseable by ``pd.to_datetime``
            and with ``holiday`` and ``shops_closed`` columns that can be
            summed.

    Returns:
        ``Df`` with ``date`` converted to datetime and these columns added:
        ``year``, ``day``, ``month``, ``month_name``, ``day_of_week``,
        ``week``, ``year_sin``, ``year_cos``, ``month_sin``, ``month_cos``,
        ``day_sin``, ``day_cos``, ``group``, ``total_holidays_month``,
        ``total_shops_closed_week``, ``group_sin``, ``group_cos``.
    """
    Df['date'] = pd.to_datetime(Df['date'])

    # Plain calendar components.
    Df['year'] = Df['date'].dt.year
    Df['day'] = Df['date'].dt.day
    Df['month'] = Df['date'].dt.month
    Df['month_name'] = Df['date'].dt.month_name()
    Df['day_of_week'] = Df['date'].dt.day_name()
    Df['week'] = Df['date'].dt.isocalendar().week

    # NOTE(review): 'year' is an integer, so 2*pi*year is a whole number of
    # turns: year_sin is ~0 and year_cos is ~1 for every row. The columns are
    # kept because later code selects them by name.
    Df['year_sin'] = np.sin(2 * np.pi * Df['year'])
    Df['year_cos'] = np.cos(2 * np.pi * Df['year'])

    # Cyclical encodings with a period of 12 months and 31 days.
    Df['month_sin'] = np.sin(2 * np.pi * Df['month'] / 12)
    Df['month_cos'] = np.cos(2 * np.pi * Df['month'] / 12)
    Df['day_sin'] = np.sin(2 * np.pi * Df['day'] / 31)
    Df['day_cos'] = np.cos(2 * np.pi * Df['day'] / 31)

    # Coarse time bucket: 48 per year, 4 per month, plus the 7-day slice of
    # the month.
    Df['group'] = (Df['year'] - 2020) * 48 + Df['month'] * 4 + Df['day'] // 7

    # Aggregates broadcast back onto each row of the same (year, month) or
    # (year, week) bucket.
    Df['total_holidays_month'] = Df.groupby(['year', 'month'])['holiday'].transform('sum')
    Df['total_shops_closed_week'] = Df.groupby(['year', 'week'])['shops_closed'].transform('sum')

    # NOTE(review): the period is the maximum 'group' of *this* frame, so two
    # frames with different date ranges encode the same date differently.
    Df['group_sin'] = np.sin(2 * np.pi * Df['group'] / Df['group'].max())
    Df['group_cos'] = np.cos(2 * np.pi * Df['group'] / Df['group'].max())
    return Df


tr_d = Process_Date(tr_d)
te_d = Process_Date(te_d)

# Keep only the modelling columns of the training frame, with the target
# ('orders') last. The explicit copy makes the column assignments below act
# on an independent frame rather than on a selection of the original.
tr_d = tr_d[[
    'warehouse', 'date', 'holiday_name', 'holiday', 'shops_closed',
    'winter_school_holidays', 'school_holidays', 'year', 'day', 'month',
    'month_name', 'day_of_week', 'week', 'year_sin', 'year_cos',
    'month_sin', 'month_cos', 'day_sin', 'day_cos', 'group',
    'total_holidays_month', 'total_shops_closed_week',
    'group_sin', 'group_cos', 'orders',
]].copy()

# One encoder per categorical column; each is fitted on the training frame
# and then reused for the test frame so both share the same integer codes.
le_month = LabelEncoder()
le_week = LabelEncoder()
le_war = LabelEncoder()

tr_d['month_name'] = le_month.fit_transform(tr_d['month_name'])
tr_d['day_of_week'] = le_week.fit_transform(tr_d['day_of_week'])
tr_d['warehouse'] = le_war.fit_transform(tr_d['warehouse'])

te_d['month_name'] = le_month.transform(te_d['month_name'])
te_d['day_of_week'] = le_week.transform(te_d['day_of_week'])
te_d['warehouse'] = le_war.transform(te_d['warehouse'])


def apply_tfidf_svd(df, text_column, max_features=1000, n_components=10,
                    vectorizer=None, svd=None, fit=True):
    """Append TF-IDF + truncated-SVD features of a text column to ``df``.

    Args:
        df: Input DataFrame; it is not modified.
        text_column: Name of the column holding the text.
        max_features: ``max_features`` for the TF-IDF vectorizer created when
            ``vectorizer`` is not supplied.
        n_components: Number of SVD components used when ``svd`` is not
            supplied.
        vectorizer: Optional ``TfidfVectorizer`` to use instead of creating a
            new one (``max_features`` is then ignored).
        svd: Optional ``TruncatedSVD`` to use instead of creating a new one
            (``n_components`` is then ignored).
        fit: When True (default) the vectorizer and SVD are fitted on ``df``
            before transforming. When False, the supplied, already fitted
            ``vectorizer`` and ``svd`` are only applied, so a second frame is
            projected into the same feature space as the first.

    Returns:
        A new DataFrame with the index reset and one extra column per SVD
        component, named ``<text_column>_tfidf_<i>``.

    Raises:
        ValueError: If ``fit`` is False and ``vectorizer`` or ``svd`` is
            missing.
    """
    if not fit and (vectorizer is None or svd is None):
        raise ValueError("fit=False requires fitted 'vectorizer' and 'svd' objects")

    if vectorizer is None:
        vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
    if svd is None:
        svd = TruncatedSVD(n_components)

    texts = df[text_column]
    if fit:
        reduced = svd.fit_transform(vectorizer.fit_transform(texts))
    else:
        reduced = svd.transform(vectorizer.transform(texts))

    columns = [f"{text_column}_tfidf_{i}" for i in range(reduced.shape[1])]
    tfidf_df = pd.DataFrame(reduced, columns=columns)

    # Reset the index so the positional concat lines up row by row.
    df = df.reset_index(drop=True)
    return pd.concat([df, tfidf_df], axis="columns")


# Fit the text pipeline on the training frame only and reuse it for the test
# frame. Fitting a separate vectorizer/SVD per frame would give the two
# frames unrelated 'holiday_name_tfidf_*' columns.
holiday_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
holiday_svd = TruncatedSVD(10, random_state=0)  # seeded for reproducibility
tr_d = apply_tfidf_svd(tr_d, 'holiday_name',
                       vectorizer=holiday_vectorizer, svd=holiday_svd)
te_d = apply_tfidf_svd(te_d, 'holiday_name',
                       vectorizer=holiday_vectorizer, svd=holiday_svd, fit=False)

# The raw date and text columns are not fed to the models.
tr_d.drop(['date', 'holiday_name'], axis=1, inplace=True)
te_d.drop(['date', 'holiday_name'], axis=1, inplace=True)

print(f"Shape Of Train Data is {tr_d.shape}")
print(f"Shape Of Test Data is {te_d.shape}")

# (notebook cell boundary; the original `%%time` cell magic is not valid
# Python outside the first line of a Jupyter cell)
X = tr_d.drop('orders', axis=1)
y = tr_d['orders']


def cross_validate(model, n_splits=15):
    """Run grouped K-fold CV for ``model`` and return averaged test predictions.

    Uses the module-level ``X`` (features), ``y`` (target) and ``te_d`` (test
    features). Folds are built with ``GroupKFold`` on ``X['group']``, so rows
    sharing a ``group`` value never appear in both the training and the
    validation part of a fold. The mean and standard deviation of the
    per-fold MAPE are printed.

    Args:
        model: Unfitted estimator; it is cloned for every fold, so the object
            passed in is never fitted. Its ``fit`` must accept ``eval_set``.
        n_splits: Number of folds.

    Returns:
        NumPy array of length ``len(te_d)`` holding the mean of the fold
        models' predictions on ``te_d``.
    """
    scores = []
    test_preds = np.zeros(len(te_d))
    groups = X['group']
    kfold = GroupKFold(n_splits=n_splits)

    for fold, (train_index, valid_index) in enumerate(kfold.split(X, y, groups=groups)):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_val, y_val = X.iloc[valid_index], y.iloc[valid_index]

        m = clone(model)
        # NOTE(review): some estimators may use eval_set to pick the best
        # iteration, which would make the fold score optimistic -- confirm
        # per library.
        m.fit(X_train, y_train, eval_set=[(X_val, y_val)])

        y_pred = m.predict(X_val)
        scores.append(mean_absolute_percentage_error(y_val, y_pred))

        # Each fold contributes an equal share of the final test prediction.
        test_preds += m.predict(te_d) / n_splits
        gc.collect()

    print(f" MAPE mean: {np.array(scores).mean():.7f} (+- {np.array(scores).std():.7f})")
    return test_preds


# (notebook cell boundary; originally `%%time`)
SEED = 2375
cat = CatBoostRegressor(verbose=0, learning_rate=0.01, iterations=2000, random_state=SEED)
cat_test_preds = cross_validate(cat)

SEED = 1023
xgb = XGBRegressor(n_estimators=1000, learning_rate=0.05, verbosity=0, random_state=SEED)
xgb_test_preds = cross_validate(xgb)

# (notebook cell boundary; the original `%%time` cell magic is not valid
# Python outside the first line of a Jupyter cell)
lgb = LGBMRegressor(verbose=-1, random_state=SEED)
lgb_test_preds = cross_validate(lgb)

# (notebook cell boundary; originally `%%time`)
# Fixed blending weights for the three models; they sum to 1.
weights = {
    'cat_test_preds': 0.45,
    'lgb_test_preds': 0.45,
    'xgb_test_preds': 0.1,
}

cat_test_preds_weighted = cat_test_preds * weights['cat_test_preds']
lgb_test_preds_weighted = lgb_test_preds * weights['lgb_test_preds']
xgb_test_preds_weighted = xgb_test_preds * weights['xgb_test_preds']

ensemble_preds = cat_test_preds_weighted + lgb_test_preds_weighted + xgb_test_preds_weighted

# Write the blended predictions into the submission template. The template's
# 'id' column is left as loaded (the original self-assignment
# `d_s['id'] = d_s['id']` was a no-op and has been removed).
d_s['orders'] = ensemble_preds
d_s.to_csv('Submission.csv', index=False)
print(d_s.head())

相关文章:

  • 北京网站建设多少钱?
  • 辽宁网页制作哪家好_网站建设
  • 高端品牌网站建设_汉中网站制作
  • Air780E/Air780EP/Air780EQ/Air201模块遇到死机问题如何分析
  • 最新!CSSCI(2023-2024)期刊目录公布!
  • AndroidStudio与手机进行无线调试
  • SRv6 BE 配置过程(VRF ping通场景)
  • 图——图的应用02最短路径(Dijkstra算法与Floyd算法详解),拓扑排序及关键路径
  • CSS3 教程
  • Flutter 中的基本数据类型:num、int 和 double
  • 极狐GitLab Git LFS(大文件存储)如何管理?
  • JSP静态包含与动态包含的区别
  • 基于 Go1.19 的站点模板爬虫:构建与实战
  • IDEA的常见代码模板的使用
  • 数据仓库的一致性维度
  • 如何在 Mac 上下载安装植物大战僵尸杂交版? 最新版本 2.2 详细安装运行教程问题详解
  • AWS服务器购买:如何选择合适的AWS云服务器
  • 大语言模型-检索测评指标
  • 《微软的软件测试之道》成书始末、出版宣告、补充致谢名单及相关信息
  • Java面向对象及其三大特征
  • JS进阶 - JS 、JS-Web-API与DOM、BOM
  • mac修复ab及siege安装
  • Mysql优化
  • rabbitmq延迟消息示例
  • TypeScript迭代器
  • yii2权限控制rbac之rule详细讲解
  • 第13期 DApp 榜单 :来,吃我这波安利
  • 二维平面内的碰撞检测【一】
  • 函数式编程与面向对象编程[4]:Scala的类型关联Type Alias
  • 机器人定位导航技术 激光SLAM与视觉SLAM谁更胜一筹?
  • 如何使用Mybatis第三方插件--PageHelper实现分页操作
  • 小李飞刀:SQL题目刷起来!
  • ionic入门之数据绑定显示-1
  • ​LeetCode解法汇总2583. 二叉树中的第 K 大层和
  • ​Redis 实现计数器和限速器的
  • #调用传感器数据_Flink使用函数之监控传感器温度上升提醒
  • (152)时序收敛--->(02)时序收敛二
  • (SERIES12)DM性能优化
  • (web自动化测试+python)1
  • (二)Kafka离线安装 - Zookeeper下载及安装
  • (二)springcloud实战之config配置中心
  • (分类)KNN算法- 参数调优
  • (附源码)springboot 校园学生兼职系统 毕业设计 742122
  • (附源码)springboot车辆管理系统 毕业设计 031034
  • (黑马C++)L06 重载与继承
  • (亲测有效)推荐2024最新的免费漫画软件app,无广告,聚合全网资源!
  • (原創) 如何動態建立二維陣列(多維陣列)? (.NET) (C#)
  • .gitignore文件---让git自动忽略指定文件
  • .NET CORE 2.0发布后没有 VIEWS视图页面文件
  • .net framework 4.8 开发windows系统服务
  • .Net mvc总结
  • @ 代码随想录算法训练营第8周(C语言)|Day57(动态规划)
  • @NoArgsConstructor和@AllArgsConstructor,@Builder
  • @select 怎么写存储过程_你知道select语句和update语句分别是怎么执行的吗?
  • [20171101]rman to destination.txt
  • [Android Studio 权威教程]断点调试和高级调试
  • [asp.net core]project.json(2)
  • [C#]winform基于深度学习算法MVANet部署高精度二分类图像分割onnx模型高精度图像二值化