Source code for deepdow.data.augment

"""Collection of callable functions that augment deepdow tensors."""

import numpy as np
import torch


def prepare_standard_scaler(X, overlap=False, indices=None):
    """Compute the per-channel mean and standard deviation of the features.

    Parameters
    ----------
    X : np.ndarray
        Full features array of shape
        `(n_samples, n_channels, lookback, n_assets)`.

    overlap : bool
        If True, every timestep of each selected sample enters the
        statistics. If False, only the most recent timestep (the last entry
        of the lookback axis) is used, so that the same value is not counted
        multiple times.

    indices : list or None
        Indices along the first (`X.shape[0]`) dimension to consider. If
        None, all samples are used.

    Returns
    -------
    means : np.ndarray
        Mean of each channel. Shape `(n_channels,)`.

    stds : np.ndarray
        Standard deviation of each channel. Shape `(n_channels,)`.

    """
    if indices is None:
        indices = list(range(len(X)))

    if overlap:
        selected = X[indices, ...]
    else:
        # `-1:` (rather than `-1`) keeps the lookback axis, so the reduction
        # axes below are the same in both branches.
        selected = X[indices, :, -1:, :]

    # Reduce over everything except the channel axis.
    reduce_axes = (0, 2, 3)

    return selected.mean(axis=reduce_axes), selected.std(axis=reduce_axes)
def prepare_robust_scaler(
    X, overlap=False, indices=None, percentile_range=(25, 75)
):
    """Compute the per-channel median and percentile range of the features.

    Parameters
    ----------
    X : np.ndarray
        Full features array of shape
        `(n_samples, n_channels, lookback, n_assets)`.

    overlap : bool
        If True, every timestep of each selected sample enters the
        statistics. If False, only the most recent timestep (the last entry
        of the lookback axis) is used, so that the same value is not counted
        multiple times.

    indices : list or None
        Indices along the first (`X.shape[0]`) dimension to consider. If
        None, all samples are used.

    percentile_range : tuple
        The left and right percentile to consider. Both need to lie in
        [0, 100] and the left one must be strictly smaller than the right.

    Returns
    -------
    medians : np.ndarray
        Median of each channel. Shape `(n_channels,)`.

    ranges : np.ndarray
        Difference between the right and the left percentile for each
        channel. Shape `(n_channels,)`.

    Raises
    ------
    ValueError
        If `percentile_range` is not an increasing pair inside [0, 100].

    """
    range_is_valid = 0 <= percentile_range[0] < percentile_range[1] <= 100
    if not range_is_valid:
        raise ValueError(
            "The percentile range needs to be in [0, 100] and left < right"
        )

    if indices is None:
        indices = list(range(len(X)))

    if overlap:
        selected = X[indices, ...]
    else:
        # `-1:` (rather than `-1`) keeps the lookback axis, so the reduction
        # axes below are the same in both branches.
        selected = X[indices, :, -1:, :]

    # Reduce over everything except the channel axis.
    reduce_axes = (0, 2, 3)

    medians = np.median(selected, axis=reduce_axes)

    # One row per requested percentile: shape (2, n_channels).
    bounds = np.percentile(selected, percentile_range, axis=reduce_axes)
    ranges = bounds[1] - bounds[0]

    return medians, ranges
class Compose:
    """Chain several transforms into one (inspired by torchvision).

    Parameters
    ----------
    transforms : list
        List of callables that represent transforms to be composed. Each one
        is called with `(X_sample, y_sample, timestamps_sample, asset_names)`
        and has to return the same four items.

    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        """Apply all the transforms one after another.

        Parameters
        ----------
        X_sample : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)`.

        y_sample : torch.Tensor
            Target vector of shape `(n_channels, horizon, n_assets)`.

        timestamps_sample : datetime
            Time stamp of the sample.

        asset_names
            Asset names corresponding to the last channel of `X_sample` and
            `y_sample`.

        Returns
        -------
        X_sample_new : torch.Tensor
            Transformed version of `X_sample`.

        y_sample_new : torch.Tensor
            Transformed version of `y_sample`.

        timestamps_sample_new : datetime
            Transformed version of `timestamps_sample`.

        asset_names_new
            Transformed version of `asset_names`.

        """
        state = (X_sample, y_sample, timestamps_sample, asset_names)

        for transform in self.transforms:
            # The output of one transform is the input of the next one. The
            # explicit unpacking enforces that exactly four items come back.
            X_new, y_new, timestamps_new, names_new = transform(*state)
            state = (X_new, y_new, timestamps_new, names_new)

        return state
class Dropout:
    """Set random elements of the input to zero with probability p.

    Parameters
    ----------
    p : float
        Probability of setting an element to zero.

    training : bool
        If False, then dropout is disabled no matter what the `p` is. If
        True, then dropout is enabled and the elements that are not zeroed
        are rescaled by `1 / (1 - p)`, which is the behaviour of
        `torch.nn.functional.dropout`.

    """

    def __init__(self, p=0.2, training=True):
        self.p = p
        self.training = training

    def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        """Perform transform.

        Parameters
        ----------
        X_sample : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)`.

        y_sample : torch.Tensor
            Target vector of shape `(n_channels, horizon, n_assets)`.

        timestamps_sample : datetime
            Time stamp of the sample.

        asset_names
            Asset names corresponding to the last channel of `X_sample` and
            `y_sample`.

        Returns
        -------
        X_sample_new : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)` with
            some elements being set to zero.

        y_sample : torch.Tensor
            Same as input.

        timestamps_sample : datetime
            Same as input.

        asset_names
            Same as input.

        """
        # Only the features are perturbed; the target and the metadata are
        # passed through untouched.
        X_sample_new = torch.nn.functional.dropout(
            X_sample, p=self.p, training=self.training
        )

        return X_sample_new, y_sample, timestamps_sample, asset_names
class Multiply:
    """Transform multiplying the feature tensor X with a constant.

    Parameters
    ----------
    c : int or float
        Constant the features are multiplied with.

    """

    def __init__(self, c=100):
        self.c = c

    def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        """Perform transform.

        Parameters
        ----------
        X_sample : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)`.

        y_sample : torch.Tensor
            Target vector of shape `(n_channels, horizon, n_assets)`.

        timestamps_sample : datetime
            Time stamp of the sample.

        asset_names
            Asset names corresponding to the last channel of `X_sample` and
            `y_sample`.

        Returns
        -------
        X_sample_new : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)`
            multiplied by a constant `self.c`.

        y_sample : torch.Tensor
            Same as input.

        timestamps_sample : datetime
            Same as input.

        asset_names
            Same as input.

        """
        # Only the features are rescaled; everything else passes through.
        X_sample_new = self.c * X_sample

        return X_sample_new, y_sample, timestamps_sample, asset_names
class Noise:
    """Add noise to each of the channels.

    Random (Gaussian) noise is added to the original features X. One can
    control the standard deviation of the noise via the `frac` parameter.
    Mathematically, `std(X_noise) = std(X) * frac` for each channel.

    Parameters
    ----------
    frac : float
        Ratio between the standard deviation of the noise and the standard
        deviation of the corresponding channel of the features.

    """

    def __init__(self, frac=0.2):
        self.frac = frac

    def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        """Perform transform.

        Parameters
        ----------
        X_sample : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)`.

        y_sample : torch.Tensor
            Target vector of shape `(n_channels, horizon, n_assets)`.

        timestamps_sample : datetime
            Time stamp of the sample.

        asset_names
            Asset names corresponding to the last channel of `X_sample` and
            `y_sample`.

        Returns
        -------
        X_sample_new : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)` with
            some added noise.

        y_sample : torch.Tensor
            Same as input.

        timestamps_sample : datetime
            Same as input.

        asset_names
            Same as input.

        """
        # One standard deviation per channel; `keepdim=True` leaves singleton
        # axes behind so that it broadcasts against the full tensor.
        channel_std = X_sample.std([1, 2], keepdim=True)

        # Standard normal draws rescaled to `frac` times the channel spread.
        noise = self.frac * channel_std * torch.randn_like(X_sample)
        X_sample_new = noise + X_sample

        return X_sample_new, y_sample, timestamps_sample, asset_names
class Scale:
    """Scale input features.

    The input features are per channel centered to zero and scaled to one.
    We use the same terminology as scikit-learn. However, the equivalent in
    torchvision is `Normalize`.

    Parameters
    ----------
    center : np.ndarray
        1D array of shape `(n_channels,)` representing the center of the
        features (mean or median). Needs to be precomputed in advance. Plain
        sequences (list, tuple) are accepted as well.

    scale : np.ndarray
        1D array of shape `(n_channels,)` representing the scale of the
        features (standard deviation or quantile range). Needs to be
        precomputed in advance. Plain sequences (list, tuple) are accepted
        as well.

    Raises
    ------
    ValueError
        If `center` and `scale` differ in length or if any entry of `scale`
        is not positive.

    See Also
    --------
    prepare_robust_scaler
    prepare_standard_scaler

    """

    def __init__(self, center, scale):
        if len(center) != len(scale):
            raise ValueError(
                "The center and scale need to have the same size."
            )

        # Validate on an array view: comparing a plain list or tuple with 0
        # would raise a TypeError instead of performing the check.
        if np.any(np.asarray(scale) <= 0):
            raise ValueError("The scale parameters need to be positive.")

        self.center = center
        self.scale = scale
        self.n_channels = len(self.center)

    def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        """Perform transform.

        Parameters
        ----------
        X_sample : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)`.

        y_sample : torch.Tensor
            Target vector of shape `(n_channels, horizon, n_assets)`.

        timestamps_sample : datetime
            Time stamp of the sample.

        asset_names
            Asset names corresponding to the last channel of `X_sample` and
            `y_sample`.

        Returns
        -------
        X_sample_new : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)` scaled
            appropriately. It is a new tensor, `X_sample` is not modified.

        y_sample : torch.Tensor
            Same as input.

        timestamps_sample : datetime
            Same as input.

        asset_names
            Same as input.

        Raises
        ------
        ValueError
            If the number of channels of `X_sample` differs from the number
            of channels the scaler was set up with.

        """
        n_channels = X_sample.shape[0]
        if n_channels != self.n_channels:
            raise ValueError(
                "Expected {} channels in X, got {}".format(
                    self.n_channels, n_channels
                )
            )

        # Work on a copy so that the in-place operations below never touch
        # the caller's tensor.
        X_sample_new = X_sample.clone()
        dtype, device = X_sample_new.dtype, X_sample_new.device

        # Reshape the statistics to `(n_channels, 1, 1)` so that they
        # broadcast over the lookback and asset axes.
        center = torch.as_tensor(self.center, dtype=dtype, device=device)[
            :, None, None
        ]
        scale = torch.as_tensor(self.scale, dtype=dtype, device=device)[
            :, None, None
        ]

        X_sample_new.sub_(center).div_(scale)

        return X_sample_new, y_sample, timestamps_sample, asset_names