当前位置: 首页 > news >正文

动手学深度学习(Pytorch版)代码实践 -深度学习基础-13Kaggle竞赛:2020加州房价预测

13Kaggle竞赛:2020加州房价预测

# 导入所需的库
import numpy as np
import pandas as pd
import torch
import hashlib
import os
import tarfile
import zipfile
import requests
from torch import nn
from d2l import torch as d2l# 读取训练和测试数据
train_data = pd.read_csv('../data/california-house-prices/train.csv')
test_data = pd.read_csv('../data/california-house-prices/test.csv')# 打印数据形状
# print(train_data.shape)
# print(test_data.shape)
# (47439, 41)
# (31626, 40)# 打印前4行的部分列
# print(train_data.iloc[0:4, [0, 1, 2, 3, 4, 5, 6, -3, -2, -1]])# 合并训练和测试数据,用于特征工程
all_features = pd.concat((train_data.iloc[:, train_data.columns != 'Sold Price'], test_data.iloc[:, 1:]))
# all_features.info()
# print(all_features.shape)
# (79065, 40)# 去除ID列
all_features = all_features.iloc[:, 1:]# 将字符型日期列转化为日期型
all_features['Listed On'] = pd.to_datetime(all_features['Listed On'], format="%Y-%m-%d")
all_features['Last Sold On'] = pd.to_datetime(all_features['Last Sold On'], format="%Y-%m-%d")# 标准化数值特征
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))
all_features[numeric_features] = all_features[numeric_features].fillna(0)# 打印每个字符型特征的唯一值数量
# for in_object in all_features.dtypes[all_features.dtypes == 'object'].index:
#     print(in_object.ljust(20), len(all_features[in_object].unique()))
"""
in_object.ljust(20):将列名左对齐,并填充空格使其长度至少为20个字符,这样打印时更整齐。
len(all_features[in_object].unique()):计算该列中唯一值的数量。
便于后续的独热编码,防止内存爆炸
"""# 选择需要的特征
features = list(numeric_features)
features.extend(['Type'])
all_features = all_features[features[:]]# 独热编码
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)
# print(all_features.shape)
# (79065, 195)# 查看全部特征的数据类型
# print(all_features.dtypes.unique())# 从pandas格式中提取NumPy格式,并将其转换为张量表示用于训练
n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32)train_labels = torch.tensor(train_data['Sold Price'].values.reshape(-1, 1),dtype=torch.float32
)# 是否使用GPU训练
if not torch.cuda.is_available():print('CUDA is not available. Training on CPU ...')
else:print('CUDA is available. Training on GPU ...')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")# 将特征和标签移到设备上
train_features = train_features.to(device)
test_features = test_features.to(device)
train_labels = train_labels.to(device)# 定义均方误差损失函数
loss = nn.MSELoss()# 输入特征的数量
in_features = train_features.shape[1]
# print(in_features)
# 195# 定义神经网络模型
dropout1, dropout2, dropout3 = 0.2, 0.3, 0.5def get_net():net = nn.Sequential(nn.Flatten(),nn.Linear(in_features, 128), nn.ReLU(),# nn.Dropout(dropout1),nn.Linear(128, 64), nn.ReLU(),# nn.Dropout(dropout2),nn.Linear(64, 32), nn.ReLU(),# nn.Dropout(dropout3),nn.Linear(32, 1))return net.to(device)  # 使用GPU# 计算对数均方根误差
def log_rmse(net, features, labels):"""使用 torch.clamp 函数将预测值的下限限制在 1,确保所有预测值至少为 1。这是为了避免在取对数时出现负值或零值,因为对数在这些点上未定义或会导致数值问题。"""clipped_preds = torch.clamp(net(features), 1, float('inf'))rmse = torch.sqrt(loss(torch.log(clipped_preds), torch.log(labels)))# 将 PyTorch 张量转换为 Python 标量return rmse.item()# 训练模型函数
def train(net, train_features, train_labels, test_features, test_labels,num_epochs, learning_rate, weight_decay, batch_size):train_ls, tets_ls = [], []  # 用于存储每个epoch的训练和测试损失train_iter = d2l.load_array((train_features, train_labels), batch_size)  # 创建训练数据迭代器optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate,weight_decay=weight_decay)  # 定义Adam优化器# weight_decay: 权重衰减,用于L2正则化。for epoch in range(num_epochs):for X, y in train_iter:X, y = X.to(device), y.to(device)  # 确保批次数据在GPU上optimizer.zero_grad()  # 梯度清零l = loss(net(X), y)  # 计算损失l.backward()  # 反向传播optimizer.step()  # 更新模型参数# 计算并记录训练集上的对数均方根误差。train_ls.append(log_rmse(net, train_features, train_labels))if test_labels is not None:# 计算并记录测试集上的对数均方根误差tets_ls.append(log_rmse(net, test_features, test_labels))return train_ls, tets_ls# K折交叉验证
# 它选择第i个切片作为验证数据,其余部分作为训练数据
def get_k_fold_data(k, i, X, y):assert k > 1fold_size = X.shape[0] // kX_train, y_train = None, Nonefor j in range(k):idx = slice(j * fold_size, (j + 1) * fold_size)X_part, y_part = X[idx, :], y[idx]if j == i:X_valid, y_valid = X_part, y_partelif X_train is None:X_train, y_train = X_part, y_partelse:X_train = torch.cat([X_train, X_part], 0)y_train = torch.cat([y_train, y_part], 0)return X_train.to(device), y_train.to(device), X_valid.to(device), y_valid.to(device)  # 确保在GPU上# 在K折交叉验证中训练K次后,返回训练和验证误差的平均值。
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,batch_size):train_l_sum, valid_l_sum = 0, 0for i in range(k):data = get_k_fold_data(k, i, X_train, y_train)net = get_net()train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,weight_decay, batch_size)train_l_sum += train_ls[-1]# 将 train_ls 列表中的最新值(即当前 epoch 的训练损失)累加到 train_l_sum 变量中。valid_l_sum += valid_ls[-1]if i == 0:d2l.plot(list(range(1, num_epochs + 1)), [train_ls, valid_ls],xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],legend=['train', 'valid'], yscale='log')print(f'折{i + 1},训练log rmse{float(train_ls[-1]):f}, 'f'验证log rmse{float(valid_ls[-1]):f}')return train_l_sum / k, valid_l_sum / k# 定义训练参数
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 0.01, 0, 256# 进行K折交叉验证
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr,weight_decay, batch_size)
print(f'{k}-折验证: 平均训练log rmse: {float(train_l):f}, 'f'平均验证log rmse: {float(valid_l):f}')d2l.plt.show() # 提交Kaggle预测
def train_and_pred(train_features, test_features, train_labels, test_data,num_epochs, lr, weight_decay, batch_size):net = get_net()train_ls, _ = train(net, train_features, train_labels, None, None,num_epochs, lr, weight_decay, batch_size)d2l.plot(np.arange(1, num_epochs + 1), [train_ls], xlabel='epoch',ylabel='log rmse', xlim=[1, num_epochs], yscale='log')print(f'训练log rmse:{float(train_ls[-1]):f}')# 将网络应用于测试集,并将结果从GPU转移到CPU再转换为NumPy数组preds = net(test_features).detach().cpu().numpy()# 将其重新格式化以导出到Kaggletest_data['Sold Price'] = pd.Series(preds.reshape(1, -1)[0])submission = pd.concat([test_data['Id'], test_data['Sold Price']], axis=1)submission.to_csv('../data/california-house-prices/submission.csv', index=False)# 训练模型并进行预测
train_and_pred(train_features, test_features, train_labels, test_data,num_epochs, lr, weight_decay, batch_size)d2l.plt.show()

运行结果:

1,训练log rmse0.356748, 验证log rmse0.3316662,训练log rmse0.337252, 验证log rmse0.3418753,训练log rmse0.317294, 验证log rmse0.3245164,训练log rmse0.337175, 验证log rmse0.3606255,训练log rmse0.356537, 验证log rmse0.379667
5-折验证: 平均训练log rmse: 0.341001, 平均验证log rmse: 0.347670
训练log rmse:0.307162

在这里插入图片描述
在这里插入图片描述

竞赛得分:

在这里插入图片描述

相关文章:

  • jnp.linalg.norm
  • 1. C++面向过程
  • 强化安全新篇章:韶关石油化工可燃气体报警器年检解析
  • 收费4980的AI批量混剪,素材技术方法工具配套,详细拆解!
  • Mongodb UPDATE使用$sort将数组重新排序
  • 【嵌入式开发】UART
  • grpc代理服务的实现(一)
  • 硬引用、软引用、弱引用、虚引用和原子引用
  • 架构风格-系统架构师(十五
  • Pipeline知识小记
  • 复分析——第6章—— Γ 函数和 ζ 函数(E.M. Stein R. Shakarchi)
  • PCL 点云RANSAC+SVD提取平面
  • 一文了解HarmonyOSNEXT发布重点内容
  • ubuntu22.04安装onlyoffice社区版
  • React-配置json-server
  • [LeetCode] Wiggle Sort
  • 2017前端实习生面试总结
  • HTML5新特性总结
  • iOS 颜色设置看我就够了
  • learning koa2.x
  • linux安装openssl、swoole等扩展的具体步骤
  • MySQL常见的两种存储引擎:MyISAM与InnoDB的爱恨情仇
  • Protobuf3语言指南
  • 电商搜索引擎的架构设计和性能优化
  • 看域名解析域名安全对SEO的影响
  • 聊聊springcloud的EurekaClientAutoConfiguration
  • 与 ConTeXt MkIV 官方文档的接驳
  • raise 与 raise ... from 的区别
  • 机器人开始自主学习,是人类福祉,还是定时炸弹? ...
  • 树莓派用上kodexplorer也能玩成私有网盘
  • ​DB-Engines 12月数据库排名: PostgreSQL有望获得「2020年度数据库」荣誉?
  • ​Python 3 新特性:类型注解
  • ​埃文科技受邀出席2024 “数据要素×”生态大会​
  • (差分)胡桃爱原石
  • (附源码)spring boot校园拼车微信小程序 毕业设计 091617
  • (黑马出品_高级篇_01)SpringCloud+RabbitMQ+Docker+Redis+搜索+分布式
  • (强烈推荐)移动端音视频从零到上手(上)
  • (算法)大数的进制转换
  • (淘宝无限适配)手机端rem布局详解(转载非原创)
  • (转)Java socket中关闭IO流后,发生什么事?(以关闭输出流为例) .
  • (转)jdk与jre的区别
  • (转)scrum常见工具列表
  • (转)关于多人操作数据的处理策略
  • .htaccess配置重写url引擎
  • .NET Core 实现 Redis 批量查询指定格式的Key
  • .net遍历html中全部的中文,ASP.NET中遍历页面的所有button控件
  • .NET导入Excel数据
  • .NET建议使用的大小写命名原则
  • .NET中使用Protobuffer 实现序列化和反序列化
  • @vueup/vue-quill使用quill-better-table报moduleClass is not a constructor
  • [ 常用工具篇 ] POC-bomber 漏洞检测工具安装及使用详解
  • [52PJ] Java面向对象笔记(转自52 1510988116)
  • [Angular 基础] - 表单:响应式表单
  • [Angular] 笔记 9:list/detail 页面以及@Output
  • [ASP.NET 控件实作 Day7] 设定工具箱的控件图标