当前位置: 首页 > news >正文

Quantile RNN

文章目录

  • 数据预处理
  • pipeline
  • evaluation
  • model
  • example

在这里插入图片描述

数据预处理

import numpy as np
from toolz.curried import *


@curry
def clean_nan(dataset, how='any'):
    """
    Drops the rows of a DataFrame that contain missing values.

    Parameters
    ----------
    dataset : pandas.DataFrame
        The frame to clean.

    how : str
        Forwarded to `DataFrame.dropna`: 'any' drops a row as soon as one
        value is missing, 'all' only when every value is missing.

    Returns
    ----------
    cleaned : pandas.DataFrame
        A new frame without the dropped rows; the original index labels
        of the surviving rows are kept.
    """
    cleaned = dataset.dropna(axis=0, how=how)
    return cleaned


@curry
def lagger(dataset, n_lags, price_columns):
    """
    Adds shifted copies of the price columns to a DataFrame.

    For every lag in 1..n_lags and every price column `col`, a column named
    `col + str(lag)` is created holding `col` shifted by `-lag` rows, i.e.
    the value found `lag` rows further down. The last `lag` rows of such a
    column are therefore NaN.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the price columns and any other columns to keep.

    n_lags : int
        Number of shifted copies to create per price column.

    price_columns : list of str
        Names of the columns to shift.

    Returns
    ----------
    result : pandas.DataFrame
        The original columns plus the shifted ones, with the columns ordered
        by `sorted`, i.e. lexicographically (so "x10" comes before "x2").
    """
    result = dataset[price_columns]

    for lag in range(1, n_lags + 1):
        shifted = {}
        for col in price_columns:
            # Raw values are used so the assignment is positional rather
            # than index-aligned.
            shifted[col + str(lag)] = dataset[[col]].shift(-lag).values
        result = result.assign(**shifted)

    # Bring back every column that is not a price column.
    carried_over = dataset.drop(price_columns, axis=1).columns
    result = result.assign(**{col: dataset[col] for col in carried_over})

    return result[sorted(result.columns)]


@curry
def diff_log_pricer(dataset, price_columns, date_column):
    """
    Replaces price columns by 100 times the first difference of their logs.

    Parameters
    ----------
    dataset : pandas.DataFrame
        A Pandas' DataFrame with a date column and one or many price columns.
        The price columns must be of numerical type and not contain nones.

    price_columns : list of str
        A list with the names of the price columns.

    date_column : str
        The name of the date column, used to sort the rows before
        differencing.

    Returns
    ----------
    new_df : pandas.DataFrame
        A frame sorted by `date_column`, with a fresh 0..n-1 index, where each
        price column holds `100 * (log(p_t) - log(p_{t-1}))`.
        The first row contains NaNs in the price columns because it has no
        previous row to difference against.
    """

    def order_by_date(frame):
        return frame.sort_values(by=date_column)

    def take_logs(frame):
        logged = {name: np.log(frame[name]) for name in price_columns}
        return frame.assign(**logged)

    def scaled_first_difference(frame):
        deltas = {name: 100 * (frame[name] - frame[name].shift(1))
                  for name in price_columns}
        return frame.assign(**deltas).reset_index(drop=True)

    # Steps run left to right: sort, then log, then difference.
    return pipe(dataset, order_by_date, take_logs, scaled_first_difference)


@curry
def time_split_dataset(df, train_start_date, train_end_date, holdout_end_date, date_col):
    """
    Splits temporal data into a training and testing datasets such that
    all training data comes before the testing set.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame with a date column.

    train_start_date : str
        A date string representing the starting time of the training data.
        It should be comparable with the values of the date column in `df`.
        Inclusive in the train set.

    train_end_date : str
        A date string representing the ending time of the training data.
        This is also used as the start of the holdout period.
        It should be comparable with the values of the date column in `df`.
        Inclusive in the train set. Exclusive in the test set.

    holdout_end_date : str
        A date string representing the ending time of the holdout data.
        It should be comparable with the values of the date column in `df`.
        Inclusive in the test set.

    date_col : str
        The name of the date column of `df`.


    Returns
    ----------
    train_set : pandas.DataFrame
        Rows with train_start_date <= date <= train_end_date.

    test_set : pandas.DataFrame
        Rows with train_end_date < date <= holdout_end_date.

    Both frames keep the original index labels and are independent copies:
    modifying them does not modify `df`.
    """
    dates = df[date_col]

    in_train = (dates >= train_start_date) & (dates <= train_end_date)
    in_holdout = (dates > train_end_date) & (dates <= holdout_end_date)

    # Filter first and copy only the selected rows, instead of deep-copying
    # the whole frame once per split before discarding most of it.
    train_set = df.loc[in_train].copy()
    test_set = df.loc[in_holdout].copy()

    return train_set, test_set

pipeline

@curry
def pipeline(dataset, learners):
    """
    Applies a sequence of single-argument functions to a dataset, in order.

    Parameters
    ----------
    dataset : object
        The value fed to the first learner.

    learners : iterable of callables
        Functions applied left to right; each receives the output of the
        previous one.

    Returns
    ----------
    result : object
        The output of the last learner, or `dataset` itself when `learners`
        is empty.
    """
    # `pipe` threads the value through the functions left to right, which is
    # the same order as composing the reversed list. Unlike reducing with
    # `comp`, it also works for an empty list and for iterables that do not
    # support `reversed`.
    return pipe(dataset, *learners)

evaluation

@curry
def quantile_loss_evaluator(df, predict_col, target_col, tau):
    """
    Computes the mean quantile (pinball) loss of a prediction column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the predictions and the observed values.

    predict_col : str
        Name of the column with the predictions.

    target_col : str
        Name of the column with the observed values.

    tau : float
        The quantile being evaluated.

    Returns
    ----------
    loss : float
        Mean over rows of `tau * (y - y_hat)` where `y >= y_hat` and
        `(tau - 1) * (y - y_hat)` where `y < y_hat`.
    """
    observed = df[[target_col]].to_numpy()
    predicted = df[[predict_col]].to_numpy()

    residual = observed - predicted
    # Over-predictions are weighted by (tau - 1), everything else by tau.
    weight = np.where(observed < predicted, tau - 1, tau)

    return np.mean(weight * residual)

@curry
def proportion_of_hits_evaluator(df, predict_col, target_col):
    """
    Computes the fraction of rows whose prediction is strictly above the
    observed value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the predictions and the observed values.

    predict_col : str
        Name of the column with the predictions.

    target_col : str
        Name of the column with the observed values.

    Returns
    ----------
    proportion : float
        Share of rows where `prediction > target`, between 0 and 1.
    """
    observed = df[[target_col]].to_numpy()
    predicted = df[[predict_col]].to_numpy()

    hits = predicted > observed
    return hits.mean()

model

# coding=utf-8

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.optimizers import Adam


def qrnn_learner(dataset, price_cols, target_col, prediction_col="prediction",
                 tau=0.05, neurons=20, lr=1e-4, batch_size=512, epochs=5):
    """
    Fits a single-layer LSTM that predicts the `tau` quantile of a target
    column and returns a prediction function together with the scored
    training data.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Training frame. Every column whose name contains one of `price_cols`
        (except `target_col`) is used as a feature.

    price_cols : list of str
        Base names of the price columns; used both to select the feature
        columns and as the number of variables per timestep.

    target_col : str
        Name of the column to predict.

    prediction_col : str
        Name of the column added with the model output.

    tau : float
        Quantile targeted by the loss.

    neurons : int
        Number of units of the LSTM layer.

    lr : float
        Learning rate of the Adam optimizer.

    batch_size : int
        Batch size used when fitting.

    epochs : int
        Number of training epochs.

    Returns
    ----------
    p : function
        Takes a DataFrame with the same feature columns and returns it with
        `prediction_col` added.

    scored : pandas.DataFrame
        `dataset` with `prediction_col` added.
    """
    def to_3D(dataset):
        # Feature columns are taken in the order they appear in `dataset`.
        all_p_columns = [col for col in dataset.columns
                         if col != target_col
                         and any(p_col in col for p_col in price_cols)]

        def p(new_data):
            # Each row is cut into consecutive groups of len(price_cols)
            # values, one group per timestep.
            # NOTE(review): this assumes the column order matches the
            # intended time order and variable grouping - confirm upstream.
            return new_data[all_p_columns].values.reshape(-1,
                                                          len(all_p_columns) // len(price_cols),
                                                          len(price_cols))

        return p, p(dataset)

    def quantile_loss(y_true, y_pred):
        # Pinball loss: weight is (tau - 1) where the prediction is above
        # the truth and tau elsewhere.
        ro = tau - tf.cast(tf.greater(y_pred, y_true), tf.float32)
        return tf.reduce_mean(ro * (y_true - y_pred))

    _3Dnator, x_train = to_3D(dataset)
    y_train = dataset[[target_col]].values
    n_samples, timesteps, n_vars = x_train.shape

    # build model
    model = Sequential()
    model.add(LSTM(neurons, input_shape=(timesteps, n_vars)))
    model.add(Dense(1, activation=None))
    # `learning_rate` is the supported argument name; the `lr` alias is
    # deprecated and rejected by recent Keras releases.
    opt = Adam(learning_rate=lr)
    model.compile(loss=quantile_loss, optimizer=opt)

    # train model
    model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=0)

    def p(new_dataset):
        x_new = _3Dnator(new_dataset)
        return new_dataset.assign(**{prediction_col: model.predict(x_new)})

    return p, p(dataset)

example

# `pd.read_csv` is used below, so pandas has to be imported here; no other
# import in this file provides `pd`.
import pandas as pd
from matplotlib import pyplot as plt
plt.style.use("ggplot")

# Quantile to estimate, number of shifted copies per price column, and the
# columns treated as prices.
tau = 0.95
N_LAGS=14
PRICE_COLS = ["Adjusted Close"]

data  = pd.read_csv('SP500.csv', usecols=['Adjusted Close', 'Date'])
data.head()

# Partially applied preprocessing steps; each one only waits for the dataset.
differ_learner = diff_log_pricer(price_columns=PRICE_COLS, date_column="Date")
lagger_learner = lagger(n_lags=N_LAGS, price_columns=PRICE_COLS)
na_clearn_learner = clean_nan(how="any")

# Steps run in list order: log-difference, add shifted columns, drop NaN rows.
pipe_learner = pipeline(learners=[differ_learner, lagger_learner, na_clearn_learner])
processed_data = pipe_learner(data)

# Train on dates up to and including 2010-01-01, hold out up to 2016-01-01.
splitter = time_split_dataset(train_start_date="1960-01-01", train_end_date="2010-01-01", holdout_end_date="2016-01-01", date_col="Date")
train, test = splitter(processed_data)
train.head()

在这里插入图片描述

# Fit the quantile model on the training split; `var_estimator` scores new
# frames and `train_pred` is the training frame with a "prediction" column.
var_estimator, train_pred = qrnn_learner(train, price_cols=PRICE_COLS, target_col="Adjusted Close14", epochs=10, lr=1e-3, tau= tau)
test_pred = var_estimator(test)

# Evaluators with the column names (and quantile) fixed; each only waits for
# a scored DataFrame.
quantile_eval_fn = quantile_loss_evaluator(predict_col="prediction",
                                           target_col="Adjusted Close14",
                                           tau= tau)

hits_eval_fn = proportion_of_hits_evaluator(predict_col="prediction",
                                                target_col="Adjusted Close14")
# Both evaluators are applied to the train and the test predictions, in that
# order, to fill the two placeholders of each message.
print("Quantile loss on train %f and test %f sets" % tuple(map(quantile_eval_fn, [train_pred, test_pred])))
print("Number of hits on train %f and test %f sets" % tuple(map(hits_eval_fn, [train_pred, test_pred])))
# The bare string below records the output of one run of the two prints above.
"""
Quantile loss on train 0.094278 and test 0.091880 sets
Number of hits on train 0.941602 and test 0.940397 sets
"""

# Plot the predicted quantile and the realised target over the holdout dates.
plt.figure(figsize=(20, 6))
plt.plot(test_pred["Date"], test_pred["prediction"])
plt.plot(test_pred["Date"], test_pred["Adjusted Close14"])

在这里插入图片描述

相关文章:

  • 正式开始homeR的计划
  • 非线性状态空间模型与非线性自回归模型的联系
  • matplotlib 颜色名称表
  • 分布式缓存BeIT Memcached简介
  • 【pytorch】时间序列预测 —— 同时预测多个分位点
  • 关于Oracle 顽固的KILLED 状态的SESSION的处理
  • 科技论文的写作逻辑
  • C#中字符串操作函数
  • BibTex 的使用
  • C# 运算符
  • Takens 定理
  • Visual C# 2005中编写Socket网络程序
  • win10 手动设置 DNS 地址
  • C# 语言规范
  • 傅里叶变换求解 KdV 方程
  • JS中 map, filter, some, every, forEach, for in, for of 用法总结
  • 自己简单写的 事件订阅机制
  • 【翻译】Mashape是如何管理15000个API和微服务的(三)
  • 【跃迁之路】【444天】程序员高效学习方法论探索系列(实验阶段201-2018.04.25)...
  • AWS实战 - 利用IAM对S3做访问控制
  • CSS 提示工具(Tooltip)
  • HTTP中的ETag在移动客户端的应用
  • Java 多线程编程之:notify 和 wait 用法
  • 数据科学 第 3 章 11 字符串处理
  • 跳前端坑前,先看看这个!!
  • ​如何防止网络攻击?
  • $.each()与$(selector).each()
  • ()、[]、{}、(())、[[]]等各种括号的使用
  • (04)odoo视图操作
  • (webRTC、RecordRTC):navigator.mediaDevices undefined
  • (附源码)ssm教材管理系统 毕业设计 011229
  • (一)WLAN定义和基本架构转
  • (已更新)关于Visual Studio 2019安装时VS installer无法下载文件,进度条为0,显示网络有问题的解决办法
  • (转)Sql Server 保留几位小数的两种做法
  • ***汇编语言 实验16 编写包含多个功能子程序的中断例程
  • .Net Attribute详解(上)-Attribute本质以及一个简单示例
  • .NET MAUI学习笔记——2.构建第一个程序_初级篇
  • .NET/ASP.NETMVC 大型站点架构设计—迁移Model元数据设置项(自定义元数据提供程序)...
  • .NET开发不可不知、不可不用的辅助类(一)
  • @ConfigurationProperties注解对数据的自动封装
  • @transactional 方法执行完再commit_当@Transactional遇到@CacheEvict,你的代码是不是有bug!...
  • @zabbix数据库历史与趋势数据占用优化(mysql存储查询)
  • @开发者,一文搞懂什么是 C# 计时器!
  • [ 云计算 | AWS ] AI 编程助手新势力 Amazon CodeWhisperer:优势功能及实用技巧
  • [acwing周赛复盘] 第 94 场周赛20230311
  • [AIGC] 如何建立和优化你的工作流?
  • [BZOJ1877][SDOI2009]晨跑[最大流+费用流]
  • [BZOJ4554][TJOI2016HEOI2016]游戏(匈牙利)
  • [CUDA 学习笔记] CUDA kernel 的 grid_size 和 block_size 选择
  • [leetcode 数位计算]2520. 统计能整除数字的位数
  • [LeetCode]Balanced Binary Tree
  • [Real world Haskell] 中文翻译:第二章 类型与函数
  • [uni-app] uni.showToast 一闪而过问题/设定时间无效/1秒即逝
  • [UVA 11825] Hackers' Crackdown
  • [WCF安全系列]谈谈WCF的客户端认证[用户名/密码认证]