Source code for deepdow.data.augment

"""Collection of callable functions that augment deepdow tensors."""

import numpy as np
import torch


[docs]def prepare_standard_scaler(X, overlap=False, indices=None):
    """Compute mean and standard deviation for each channel.

    Parameters
    ----------
    X : np.ndarray
        Full features array of shape `(n_samples, n_channels, lookback, n_assets)`.

    overlap : bool
        If False, then only using the most recent timestep. This will guarantee that not counting
        the same thing multiple times.

    indices : list or None
        List of indices to consider from the `X.shape[0]` dimension. If None
        then considering all the samples.

    Returns
    -------
    means : np.ndarray
        Mean of each channel. Shape `(n_channels,)`.

    stds : np.ndarray
        Standard deviation of each channel. Shape `(n_channels,)`.

    """
    indices = indices if indices is not None else list(range(len(X)))
    considered_values = X[indices, ...] if overlap else X[indices, :, -1:, :]

    means = considered_values.mean(axis=(0, 2, 3))
    stds = considered_values.std(axis=(0, 2, 3))

    return means, stds


[docs]def prepare_robust_scaler(
    X, overlap=False, indices=None, percentile_range=(25, 75)
):
    """Compute median and percentile range for each channel.

    Parameters
    ----------
    X : np.ndarray
        Full features array of shape `(n_samples, n_channels, lookback, n_assets)`.

    overlap : bool
        If False, then only using the most recent timestep. This will guarantee that not counting
        the same thing multiple times.

    indices : list or None
        List of indices to consider from the `X.shape[0]` dimension. If None
        then considering all the samples.

    percentile_range : tuple
        The left and right percentile to consider. Needs to be in [0, 100].

    Returns
    -------
    medians : np.ndarray
        Median of each channel. Shape `(n_channels,)`.

    ranges : np.ndarray
        Interquantile range for each channel. Shape `(n_channels,)`.

    """
    if not 0 <= percentile_range[0] < percentile_range[1] <= 100:
        raise ValueError(
            "The percentile range needs to be in [0, 100] and left < right"
        )

    indices = indices if indices is not None else list(range(len(X)))
    considered_values = X[indices, ...] if overlap else X[indices, :, -1:, :]

    medians = np.median(considered_values, axis=(0, 2, 3))
    percentiles = np.percentile(
        considered_values, percentile_range, axis=(0, 2, 3)
    )  # (2, n_channels)

    ranges = percentiles[1] - percentiles[0]

    return medians, ranges


[docs]class Compose:
    """Meta transform inspired by torchvision.

    Parameters
    ----------
    transforms : list
        List of callables that represent transforms to be composed.

    """

    def __init__(self, transforms):
        self.transforms = transforms

[docs]    def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        """Transform.

        Parameters
        ----------
        X_sample : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)`.

        y_sample : torch.Tesnor
            Target vector of shape `(n_channels, horizon, n_assets)`.

        timestamps_sample : datetime
            Time stamp of the sample.

        asset_names
            Asset names corresponding to the last channel of `X_sample` and `y_sample`.

        Returns
        -------
        X_sample_new : torch.Tensor
            Transformed version of `X_sample`.

        y_sample_new : torch.Tesnor
            Transformed version of `y_sample`.

        timestamps_sample_new : datetime
            Transformed version of `timestamps_sample`.

        asset_names_new
            Transformed version of `asset_names`.
        """
        for t in self.transforms:
            X_sample, y_sample, timestamps_sample, asset_names = t(
                X_sample, y_sample, timestamps_sample, asset_names
            )

        return X_sample, y_sample, timestamps_sample, asset_names


[docs]class Dropout:
    """Set random elements of the input to zero with probability p.

    Parameters
    ----------
    p : float
        Probability of setting an element to zero.

    training : bool
        If False, then dropout disabled no matter what the `p` is. Note that if True then
        dropout enabled and at the same time all the elements are scaled by `1/p`.
    """

    def __init__(self, p=0.2, training=True):
        self.p = p
        self.training = training

[docs]    def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        """Perform transform.

        Parameters
        ----------
        X_sample : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)`.

        y_sample : torch.Tesnor
            Target vector of shape `(n_channels, horizon, n_assets)`.

        timestamps_sample : datetime
            Time stamp of the sample.

        asset_names
            Asset names corresponding to the last channel of `X_sample` and `y_sample`.

        Returns
        -------
        X_sample_new : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)` with some elements being set to zero.

        y_sample : torch.Tensor
            Same as input.

        timestamps_sample : datetime
            Same as input.

        asset_names
            Same as input.
        """
        X_sample_new = torch.nn.functional.dropout(
            X_sample, p=self.p, training=self.training
        )

        return X_sample_new, y_sample, timestamps_sample, asset_names


[docs]class Multiply:
    """Transform multiplying the feature tensor X with a constant."""

    def __init__(self, c=100):
        self.c = c

[docs]    def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        """Perform transform.

        Parameters
        ----------
        X_sample : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)`.

        y_sample : torch.Tesnor
            Target vector of shape `(n_channels, horizon, n_assets)`.

        timestamps_sample : datetime
            Time stamp of the sample.

        asset_names
            Asset names corresponding to the last channel of `X_sample` and `y_sample`.

        Returns
        -------
        X_sample_new : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)` multiplied by a constant `self.c`.

        y_sample : torch.Tesnor
            Same as input.

        timestamps_sample : datetime
            Same as input.

        asset_names
            Same as input.
        """
        return self.c * X_sample, y_sample, timestamps_sample, asset_names


[docs]class Noise:
    """Add noise to each of the channels.

    Random (Gaussian) noise is added to the original features X. One can control the standard deviation of the noise
    via the `frac` parameter. Mathematically, `std(X_noise) = std(X) * frac` for each channel.


    """

    def __init__(self, frac=0.2):
        self.frac = frac

[docs]    def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        """Perform transform.

        Parameters
        ----------
        X_sample : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)`.

        y_sample : torch.Tensor
            Target vector of shape `(n_channels, horizon, n_assets)`.

        timestamps_sample : datetime
            Time stamp of the sample.

        asset_names
            Asset names corresponding to the last channel of `X_sample` and `y_sample`.

        Returns
        -------
        X_sample_new : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)` with some added noise.

        y_sample : torch.Tesnor
            Same as input.

        timestamps_sample : datetime
            Same as input.

        asset_names
            Same as input.
        """
        X_sample_new = (
            self.frac
            * X_sample.std([1, 2], keepdim=True)
            * torch.randn_like(X_sample)
            + X_sample
        )

        return X_sample_new, y_sample, timestamps_sample, asset_names


[docs]class Scale:
    """Scale input features.

    The input features are per channel centered to zero and scaled to one. We use the same
    terminology as scikit-learn. However, the equivalent in torchvision is `Normalize`.

    Parameters
    ----------
    center : np.ndarray
        1D array of shape `(n_channels,)` representing the center of the features (mean or median).
        Needs to be precomputed in advance.

    scale : np.ndarray
        1D array of shape `(n_channels,)` representing the scale of the features (standard deviation
        or quantile range). Needs to be precomputed in advance.

    See Also
    --------
    prepare_robust_scaler
    prepare_standard_scaler
    """

    def __init__(self, center, scale):
        if len(center) != len(scale):
            raise ValueError(
                "The center and scale need to have the same size."
            )

        if np.any(scale <= 0):
            raise ValueError("The scale parameters need to be positive.")

        self.center = center
        self.scale = scale
        self.n_channels = len(self.center)

[docs]    def __call__(self, X_sample, y_sample, timestamps_sample, asset_names):
        """Perform transform.

        Parameters
        ----------
        X_sample : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)`.

        y_sample : torch.Tensor
            Target vector of shape `(n_channels, horizon, n_assets)`.

        timestamps_sample : datetime
            Time stamp of the sample.

        asset_names
            Asset names corresponding to the last channel of `X_sample` and `y_sample`.

        Returns
        -------
        X_sample_new : torch.Tensor
            Feature vector of shape `(n_channels, lookback, n_assets)` scaled appropriately.

        y_sample : torch.Tesnor
            Same as input.

        timestamps_sample : datetime
            Same as input.

        asset_names
            Same as input.
        """
        n_channels = X_sample.shape[0]
        if n_channels != self.n_channels:
            raise ValueError(
                "Expected {} channels in X, got {}".format(
                    self.n_channels, n_channels
                )
            )

        X_sample_new = X_sample.clone()
        dtype, device = X_sample_new.dtype, X_sample_new.device

        center = torch.as_tensor(self.center, dtype=dtype, device=device)[
            :, None, None
        ]
        scale = torch.as_tensor(self.scale, dtype=dtype, device=device)[
            :, None, None
        ]

        X_sample_new.sub_(center).div_(scale)

        return X_sample_new, y_sample, timestamps_sample, asset_names