当前位置: 首页 > news >正文

竞赛实战--天池金融风控分类问题

背景

1、金融风控分类问题是一个比较适合用来练手的机器学习竞赛题目
2、如何进行数据处理

代码

数据分析部分

#!/usr/bin/env python
# coding: utf-8
# NOTE: in the scraped original, `import os` was fused into the coding comment
# above (so `os` was never imported) and the warnings call was fused onto the
# `kstest` import (a syntax error). Both are restored as separate statements.
import os
import gc
import numpy as np
import pandas as pd
import warnings
import lightgbm as lgb
import catboost as cbt
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kstest

# Suppress all Python warnings for the rest of the script.
warnings.filterwarnings("ignore")
# Do not truncate the column list when a DataFrame is printed.
pd.set_option('display.max_columns', None)
# plt.ion()

# ## Load data
# Input CSVs are read from a `data` directory located next to this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_PATH = os.path.join(BASE_DIR, 'data')

train_data_file = os.path.join(DATA_PATH, "train.csv")
train_data = pd.read_csv(train_data_file)

test_data_file = os.path.join(DATA_PATH, "testA.csv")
test_data = pd.read_csv(test_data_file)

# Split the label column off the training frame.
target = train_data['isDefault']
train_data = train_data.drop(['isDefault'], axis=1)

# Stack train on top of test so the same preprocessing is applied to both.
# The original row indices are kept (no ignore_index), train rows first.
data = pd.concat([train_data, test_data])

# Column groups, derived from the training frame only:
#   objectList    - columns whose dtype is object
#   classList     - non-object columns with at most 10 distinct values
#                   (NaN counts as a value, since `unique()` includes it)
#   numericalList - every remaining non-object column
objectList = [i for i in train_data.columns if train_data[i].dtype == 'O']
classList = [i for i in train_data.select_dtypes(exclude=['object']).columns if len(train_data[i].unique()) <= 10]
numericalList = [i for i in train_data.select_dtypes(exclude=['object']).columns if i not in classList]

对不同类型变量进行分类分组处理

# ## Variable grouping and missing-value handling
# Per-column count of missing values, restricted to columns that have any.
info = pd.DataFrame(data.isnull().sum())
info = info[info[0] != 0]
miss_fea = info.index

# Columns with missing values, split by the column groups built at load time.
miss_objectList = [i for i in miss_fea if i in objectList]
miss_classList = [i for i in miss_fea if i in classList]
miss_numericalList = [i for i in miss_fea if i in numericalList]

# Fill missing values
data['employmentLength'] = data['employmentLength'].fillna(0)
data['n11'] = data['n11'].fillna(0)
data['n12'] = data['n12'].fillna(0)
data['employmentTitle'] = data['employmentTitle'].fillna(data['employmentTitle'].mode()[0])
data['postCode'] = data['postCode'].fillna(data['postCode'].mode()[0])
# Fix: the original filled `dti` with the mean of `postCode` (copy-paste slip);
# use the column's own mean, as the neighbouring columns do.
data['dti'] = data['dti'].fillna(data['dti'].mean())
data['pubRecBankruptcies'] = data['pubRecBankruptcies'].fillna(data['pubRecBankruptcies'].mean())
data['revolUtil'] = data['revolUtil'].fillna(data['revolUtil'].mean())
data['title'] = data['title'].fillna(data['title'].mode()[0])

# Remaining numerical columns whose name starts with "n" are filled with their mode.
NoNameList = [i for i in miss_numericalList if i.startswith("n")]
for i in NoNameList:
    data[i] = data[i].fillna(data[i].mode()[0])

# ## Object-typed variable handling
# Normalise the textual employment length so the leading token is an integer.
# Assign the result back instead of using a chained `inplace=True`, which is
# not guaranteed to modify `data` under pandas copy-on-write.
data['employmentLength'] = data['employmentLength'].replace(
    {'10+ years': '10 years', '< 1 year': '0 years', '0': '0 years'}
)
# Keep only the leading number ("3 years" -> 3); the 0 used as fill value maps to 0.
data['employmentLength'] = data['employmentLength'].apply(lambda s: int(str(s).split()[0]) if pd.notnull(s) else s)

# The last four characters of the string are parsed as an integer
# (presumably the year of the earliest credit line - verify against the data).
data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))
data = data.drop(['issueDate'], axis=1)

# Integer-encode the grade columns; the encoder is re-fitted for each column.
le = LabelEncoder()
data['grade'] = le.fit_transform(data['grade'])
data['subGrade'] = le.fit_transform(data['subGrade'])

# Drop columns that are not needed
dropList = ['id', 'ficoRangeHigh', 'applicationType', 'policyCode', 'n3', 'n11', 'n12', 'n13']
data = data.drop(dropList, axis=1)

# Split the stacked frame back into train and test. The boundary is the number
# of training labels rather than a hard-coded 800000, and both parts are
# copied so that adding a column does not write into a slice of `data`.
n_train = len(target)
train_data = data.iloc[:n_train].copy()
# Re-attach the target to train_data (positional: row order is unchanged).
train_data['isDefault'] = target.values
test_data = data.iloc[n_train:].copy()
print("Divide data.")
# # ## 异常值处理
# percentile = pd.DataFrame()
# numList = [i for i in train_data.columns if i not in classList]# # 正态分布检测
# for i in numList:
#     print(kstest(data[i], 'norm', (data[i].mean(), data[i].std())))# # 异常值处理
# stdsc = StandardScaler()
# for i in numList:
#     new_i = "zheng_" + i
#     train_data[new_i] = stdsc.fit_transform(train_data[i].values.reshape(-1, 1))
#     data_std = np.std(train_data[new_i])
#     data_mean = np.mean(train_data[new_i])
#     outliers_cut_off = data_std * 3
#     lower_rule = data_mean - outliers_cut_off
#     upper_rule = data_mean + outliers_cut_off
#     train_data = train_data[(train_data[new_i] < upper_rule) & (train_data[new_i] > lower_rule)]
# train_data = train_data.iloc[:, :38]

保存数据:当数据体量较大时,保存中间结果有助于后续处理。

# Write the processed train/test frames to a `feature` directory next to the
# script so later stages can reload them.
FEATURE_PATH = os.path.join(BASE_DIR, 'feature')
# Create the output directory if needed; to_csv does not create parent folders.
os.makedirs(FEATURE_PATH, exist_ok=True)
feature_train_data = os.path.join(FEATURE_PATH, 'train_data.csv')
feature_test_data = os.path.join(FEATURE_PATH, 'test_data.csv')
# index=False: the row index is not written to the CSV files.
train_data.to_csv(feature_train_data, index=False)
test_data.to_csv(feature_test_data, index=False)

模型搭建部分

# Define the model training function
def train_model(x_train, y_train, test_data, params, n_splits=5):
    """Train one LightGBM model per stratified fold and collect predictions.

    Args:
        x_train: Training features; indexed with ``.iloc``, so a pandas
            DataFrame is expected.
        y_train: Training labels; indexed with ``.iloc`` (pandas Series).
        test_data: Features to predict with every fold's model.
        params: Parameter dict passed unchanged to ``lgb.train``.
        n_splits: Number of stratified folds.

    Returns:
        A tuple ``(oof, predictions)``: ``oof`` is a 1-D array of length
        ``len(x_train)`` holding each row's prediction from the fold in which
        it was held out; ``predictions`` has shape
        ``(len(test_data), n_splits)``, one column per fold model.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2019)
    oof = np.zeros(len(x_train))
    predictions = np.zeros((len(test_data), n_splits))

    for fold_, (train_idx, valid_idx) in enumerate(skf.split(x_train, y_train)):
        print(f"\nFold {fold_ + 1}")
        x_tr, x_val = x_train.iloc[train_idx], x_train.iloc[valid_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

        train_set = lgb.Dataset(x_tr, label=y_tr)
        val_set = lgb.Dataset(x_val, label=y_val)

        # `verbose_eval` / `early_stopping_rounds` were removed from
        # lgb.train in LightGBM 4.0; the callbacks below are the equivalent:
        # stop after 50 rounds without improvement, log every 250 rounds.
        clf = lgb.train(
            params,
            train_set,
            num_boost_round=5000,
            valid_sets=[val_set],
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(period=250),
            ],
        )

        # Predict with the best iteration found by early stopping.
        oof[valid_idx] = clf.predict(x_val, num_iteration=clf.best_iteration)
        predictions[:, fold_] = clf.predict(test_data, num_iteration=clf.best_iteration)

    print("\n\nCV AUC: {:<0.4f}".format(roc_auc_score(y_train, oof)))
    return oof, predictions


# Train the model and generate predictions
# NOTE(review): x_train_gbdt, y_train_gbdt, x_test_bgdt and default_params are
# not defined in the visible part of this script; `x_test_bgdt` may be a typo
# for `x_test_gbdt` - confirm against the full source.
oof, predictions = train_model(x_train_gbdt, y_train_gbdt, x_test_bgdt, default_params)

参考资料

相关文章:

  • 北京网站建设多少钱?
  • 辽宁网页制作哪家好_网站建设
  • 高端品牌网站建设_汉中网站制作
  • 启动Spring Boot报错
  • 英飞凌WiFi驱动WHD
  • 使用变长的参数列
  • 国家超算互联网入选国家数据局“全国一体化算力网应用优秀案例”
  • 豆包MarsCode编程助手:让编程更简单
  • DPDK:RTE_PMD_REGISTER_PCI 的原型
  • 【iOS】暑期学习总结
  • Windows使用ffmpeg获取麦克风数据
  • 秋招智能体,Offer没难题
  • Netlify 为静态站点部署 Waline 评论系统
  • 智能提醒助理系列-协作工具,一站式软件研发管理平台
  • STM32F103ZETx_FLASH.ld 解析
  • 库(Library)
  • Kafka 常用的传输和序列化数据方式
  • 51单片机——实时时钟
  • [原]深入对比数据科学工具箱:Python和R 非结构化数据的结构化
  • 【挥舞JS】JS实现继承,封装一个extends方法
  • 002-读书笔记-JavaScript高级程序设计 在HTML中使用JavaScript
  • Java新版本的开发已正式进入轨道,版本号18.3
  • java正则表式的使用
  • JWT究竟是什么呢?
  • Laravel 中的一个后期静态绑定
  • python学习笔记-类对象的信息
  • spark本地环境的搭建到运行第一个spark程序
  • Zepto.js源码学习之二
  • 简单实现一个textarea自适应高度
  • 老板让我十分钟上手nx-admin
  • 码农张的Bug人生 - 初来乍到
  • 面试题:给你个id,去拿到name,多叉树遍历
  • 前端技术周刊 2019-02-11 Serverless
  • 设计模式走一遍---观察者模式
  • ​LeetCode解法汇总518. 零钱兑换 II
  • ​ssh免密码登录设置及问题总结
  • ​云纳万物 · 数皆有言|2021 七牛云战略发布会启幕,邀您赴约
  • #Datawhale AI夏令营第4期#AIGC方向 文生图 Task2
  • #我与Java虚拟机的故事#连载15:完整阅读的第一本技术书籍
  • (2024.6.23)最新版MAVEN的安装和配置教程(超详细)
  • (done) NLP “bag-of-words“ 方法 (带有二元分类和多元分类两个例子)词袋模型、BoW
  • (html5)在移动端input输入搜索项后 输入法下面为什么不想百度那样出现前往? 而我的出现的是换行...
  • (Mirage系列之二)VMware Horizon Mirage的经典用户用例及真实案例分析
  • (附源码)ssm基于jsp的在线点餐系统 毕业设计 111016
  • (论文阅读40-45)图像描述1
  • (三)c52学习之旅-点亮LED灯
  • (转载)CentOS查看系统信息|CentOS查看命令
  • *上位机的定义
  • .L0CK3D来袭:如何保护您的数据免受致命攻击
  • .NET Core使用NPOI导出复杂,美观的Excel详解
  • .net core使用RPC方式进行高效的HTTP服务访问
  • .NET Framework .NET Core与 .NET 的区别
  • .net websocket 获取http登录的用户_如何解密浏览器的登录密码?获取浏览器内用户信息?...
  • .NET 程序如何获取图片的宽高(框架自带多种方法的不同性能)
  • .NET 发展历程
  • .NetCore实践篇:分布式监控Zipkin持久化之殇
  • .net中生成excel后调整宽度
  • [ 隧道技术 ] 反弹shell的集中常见方式(四)python反弹shell