Data preprocessing

Functions used to preprocess time series (both X and y).

from tsai.data.external import get_UCR_data

dsid = 'NATOPS'
X, y, splits = get_UCR_data(dsid, return_split=False)
tfms = [None, Categorize()]
dsets = TSDatasets(X, y, tfms=tfms, splits=splits)

source

ToNumpyCategory

 ToNumpyCategory (**kwargs)

Categorize a numpy batch

t = ToNumpyCategory()
y_cat = t(y)
y_cat[:10]

array([3, 2, 2, 3, 2, 4, 0, 5, 2, 1])

test_eq(t.decode(tensor(y_cat)), y)
test_eq(t.decode(np.array(y_cat)), y)

source

OneHot

 OneHot (n_classes=None, **kwargs)

One-hot encode/ decode a batch

oh_encoder = OneHot()
y_cat = ToNumpyCategory()(y)
oht = oh_encoder(y_cat)
oht[:10]

array([[0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])

n_classes = 10
n_samples = 100

t = torch.randint(0, n_classes, (n_samples,))
oh_encoder = OneHot()
oht = oh_encoder(t)
test_eq(oht.shape, (n_samples, n_classes))
test_eq(torch.argmax(oht, dim=-1), t)
test_eq(oh_encoder.decode(oht), t)

n_classes = 10
n_samples = 100

a = np.random.randint(0, n_classes, (n_samples,))
oh_encoder = OneHot()
oha = oh_encoder(a)
test_eq(oha.shape, (n_samples, n_classes))
test_eq(np.argmax(oha, axis=-1), a)
test_eq(oh_encoder.decode(oha), a)

source

TSNan2Value

 TSNan2Value (value=0, median=False, by_sample_and_var=True,
              sel_vars=None)

Replaces any nan values by a predefined value or median

o = TSTensor(torch.randn(16, 10, 100))
o[0,0] = float('nan')
o[o > .9] = float('nan')
o[[0,1,5,8,14,15], :, -20:] = float('nan')
nan_vals1 = torch.isnan(o).sum()
o2 = Pipeline(TSNan2Value(), split_idx=0)(o.clone())
o3 = Pipeline(TSNan2Value(median=True, by_sample_and_var=True), split_idx=0)(o.clone())
o4 = Pipeline(TSNan2Value(median=True, by_sample_and_var=False), split_idx=0)(o.clone())
nan_vals2 = torch.isnan(o2).sum()
nan_vals3 = torch.isnan(o3).sum()
nan_vals4 = torch.isnan(o4).sum()
test_ne(nan_vals1, 0)
test_eq(nan_vals2, 0)
test_eq(nan_vals3, 0)
test_eq(nan_vals4, 0)

o = TSTensor(torch.randn(16, 10, 100))
o[o > .9] = float('nan')
o = TSNan2Value(median=True, sel_vars=[0,1,2,3,4])(o)
test_eq(torch.isnan(o[:, [0,1,2,3,4]]).sum().item(), 0)

source

TSStandardize

 TSStandardize (mean=None, std=None, by_sample=False, by_var=False,
                by_step=False, exc_vars=None, eps=1e-08,
                use_single_batch=True, verbose=False, **kwargs)

Standardizes batch of type TSTensor

Args: - mean: you can pass a precalculated mean value as a torch tensor which is the one that will be used, or leave as None, in which case it will be estimated using a batch. - std: you can pass a precalculated std value as a torch tensor which is the one that will be used, or leave as None, in which case it will be estimated using a batch. If both mean and std values are passed when instantiating TSStandardize, the rest of arguments won’t be used. - by_sample: if True, it will calculate mean and std for each individual sample. Otherwise based on the entire batch. - by_var: * False: mean and std will be the same for all variables. * True: a mean and std will be be different for each variable. * a list of ints: (like [0,1,3]) a different mean and std will be set for each variable on the list. Variables not included in the list won’t be standardized. * a list that contains a list/lists: (like[0, [1,3]]) a different mean and std will be set for each element of the list. If multiple elements are included in a list, the same mean and std will be set for those variable in the sublist/s. (in the example a mean and std is determined for variable 0, and another one for variables 1 & 3 - the same one). Variables not included in the list won’t be standardized. - by_step: if False, it will standardize values for each time step. - exc_vars: list of variables that won’t be standardized. - eps: it avoids dividing by 0 - use_single_batch: if True a single training batch will be used to calculate mean & std. Else the entire training set will be used.

batch_tfms=[TSStandardize(by_sample=True, by_var=False, verbose=True)]
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, bs=128, num_workers=0, batch_tfms=batch_tfms)
xb, yb = next(iter(dls.train))
test_close(xb.mean(), 0, eps=1e-1)
test_close(xb.std(), 1, eps=1e-1)

exc_vars = [0, 2, 6, 8, 12]
batch_tfms=[TSStandardize(by_var=True, exc_vars=exc_vars)]
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, bs=128, num_workers=0, batch_tfms=batch_tfms)
xb, yb = next(iter(dls.train))
test_eq(len(dls.train.after_batch.fs[0].mean.flatten()), 24)
test_eq(len(dls.train.after_batch.fs[0].std.flatten()), 24)
test_eq(dls.train.after_batch.fs[0].mean.flatten()[exc_vars].cpu(), torch.zeros(len(exc_vars)))
test_eq(dls.train.after_batch.fs[0].std.flatten()[exc_vars].cpu(), torch.ones(len(exc_vars)))
print(dls.train.after_batch.fs[0].mean.flatten().data)
print(dls.train.after_batch.fs[0].std.flatten().data)

tensor([ 0.0000, -1.3398,  0.0000,  0.9952, -0.8438, -0.4308,  0.0000, -0.6077,
         0.0000,  0.7781, -0.4869, -0.0969,  0.0000, -1.0620, -0.6171,  0.9253,
        -0.7023, -0.3077, -0.5600, -1.1922, -0.7503,  0.9491, -0.7744, -0.4356])
tensor([1.0000, 0.8743, 1.0000, 0.7510, 1.1557, 0.5370, 1.0000, 0.2666, 1.0000,
        0.2380, 0.4047, 0.3274, 1.0000, 0.6371, 0.2798, 0.5287, 0.8642, 0.4297,
        0.5842, 0.7581, 0.3162, 0.6739, 1.0118, 0.4958])

from tsai.data.validation import TimeSplitter

X_nan = np.random.rand(100, 5, 10)
idxs = random_choice(len(X_nan), int(len(X_nan)*.5), False)
X_nan[idxs, 0] = float('nan')
idxs = random_choice(len(X_nan), int(len(X_nan)*.5), False)
X_nan[idxs, 1, -10:] = float('nan')
batch_tfms = TSStandardize(by_var=True)
dls = get_ts_dls(X_nan, batch_tfms=batch_tfms, splits=TimeSplitter(show_plot=False)(range_of(X_nan)))
test_eq(torch.isnan(dls.after_batch[0].mean).sum(), 0)
test_eq(torch.isnan(dls.after_batch[0].std).sum(), 0)
xb = first(dls.train)[0]
test_ne(torch.isnan(xb).sum(), 0)
test_ne(torch.isnan(xb).sum(), torch.isnan(xb).numel())
batch_tfms = [TSStandardize(by_var=True), Nan2Value()]
dls = get_ts_dls(X_nan, batch_tfms=batch_tfms, splits=TimeSplitter(show_plot=False)(range_of(X_nan)))
xb = first(dls.train)[0]
test_eq(torch.isnan(xb).sum(), 0)

batch_tfms=[TSStandardize(by_sample=True, by_var=False, verbose=False)]
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, bs=128, num_workers=0, after_batch=batch_tfms)
xb, yb = next(iter(dls.train))
test_close(xb.mean(), 0, eps=1e-1)
test_close(xb.std(), 1, eps=1e-1)
xb, yb = next(iter(dls.valid))
test_close(xb.mean(), 0, eps=1e-1)
test_close(xb.std(), 1, eps=1e-1)

tfms = [None, TSClassification()]
batch_tfms = TSStandardize(by_sample=True)
dls = get_ts_dls(X, y, splits=splits, tfms=tfms, batch_tfms=batch_tfms, bs=[64, 128], inplace=True)
xb, yb = dls.train.one_batch()
test_close(xb.mean(), 0, eps=1e-1)
test_close(xb.std(), 1, eps=1e-1)
xb, yb = dls.valid.one_batch()
test_close(xb.mean(), 0, eps=1e-1)
test_close(xb.std(), 1, eps=1e-1)

tfms = [None, TSClassification()]
batch_tfms = TSStandardize(by_sample=True, by_var=False, verbose=False)
dls = get_ts_dls(X, y, splits=splits, tfms=tfms, batch_tfms=batch_tfms, bs=[64, 128], inplace=False)
xb, yb = dls.train.one_batch()
test_close(xb.mean(), 0, eps=1e-1)
test_close(xb.std(), 1, eps=1e-1)
xb, yb = dls.valid.one_batch()
test_close(xb.mean(), 0, eps=1e-1)
test_close(xb.std(), 1, eps=1e-1)

source

TSNormalize

 TSNormalize (min=None, max=None, range=(-1, 1), by_sample=False,
              by_var=False, by_step=False, clip_values=True,
              use_single_batch=True, verbose=False, **kwargs)

Normalizes batch of type TSTensor

mul_max’]

Built-in mutable sequence.

If no argument is given, the constructor creates a new empty list. The argument must be an iterable if specified.

mul_min’]

Built-in mutable sequence.

If no argument is given, the constructor creates a new empty list. The argument must be an iterable if specified.

batch_tfms = [TSNormalize()]
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, bs=128, num_workers=0, after_batch=batch_tfms)
xb, yb = next(iter(dls.train))
assert xb.max() <= 1
assert xb.min() >= -1

batch_tfms=[TSNormalize(by_sample=True, by_var=False, verbose=False)]
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, bs=128, num_workers=0, after_batch=batch_tfms)
xb, yb = next(iter(dls.train))
assert xb.max() <= 1
assert xb.min() >= -1

batch_tfms = [TSNormalize(by_var=[0, [1, 2]], use_single_batch=False, clip_values=False, verbose=False)]
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, bs=128, num_workers=0, after_batch=batch_tfms)
xb, yb = next(iter(dls.train))
assert xb[:, [0, 1, 2]].max() <= 1
assert xb[:, [0, 1, 2]].min() >= -1

source

TSStandardizeTuple

 TSStandardizeTuple (x_mean, x_std, y_mean=None, y_std=None, eps=1e-05)

Standardizes X (and y if provided)

a, b = TSTensor([1., 2, 3]), TSTensor([4., 5, 6])
mean, std = a.mean(), b.std()
tuple_batch_tfm = TSStandardizeTuple(mean, std)
a_tfmd, b_tfmd = tuple_batch_tfm((a, b))
test_ne(a, a_tfmd)
test_ne(b, b_tfmd)

source

TSCatEncode

 TSCatEncode (a, sel_var)

Encodes a variable based on a categorical array

# static input
a = np.random.randint(10, 20, 512)[:, None, None].repeat(10, 1).repeat(28, 2)
b = TSTensor(torch.randint(0, 30, (512,), device='cpu').unsqueeze(-1).unsqueeze(-1).repeat(1, 10, 28))
output = TSCatEncode(a, sel_var=0)(b)
test_eq(0 <= output[:, 0].min() <= len(np.unique(a)), True)
test_eq(0 <= output[:, 0].max() <= len(np.unique(a)), True)
test_eq(output[:, 0], output[:, 0, 0][:, None].repeat(1, 28))
output[:, 0].data

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [4, 4, 4,  ..., 4, 4, 4],
        [4, 4, 4,  ..., 4, 4, 4],
        [0, 0, 0,  ..., 0, 0, 0]])

# non-static input
a = np.random.randint(10, 20, 512)[:, None, None].repeat(10, 1).repeat(28, 2)
b = TSTensor(torch.randint(0, 30, (512, 10, 28), device='cpu'))
output = TSCatEncode(a, sel_var=0)(b)
test_eq(0 <= output[:, 0].min() <= len(np.unique(a)), True)
test_eq(0 <= output[:, 0].max() <= len(np.unique(a)), True)
test_ne(output[:, 0], output[:, 0, 0][:, None].repeat(1, 28))
output[:, 0].data

tensor([[10,  0,  0,  ...,  4,  0,  0],
        [10,  0,  0,  ...,  0,  0,  0],
        [ 0,  2,  0,  ...,  0, 10,  6],
        ...,
        [ 1,  0,  9,  ...,  0,  0,  0],
        [ 0,  0,  5,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  5]])

source

TSDropFeatByKey

 TSDropFeatByKey (key_var, p, sel_vars, sel_steps=None, **kwargs)

Randomly drops selected features at selected steps based with a given probability per feature, step and a key variable

	Type	Default	Details
key_var			int representing the variable that contains the key information
p			array of shape (n_keys, n_features, n_steps) representing the probabilities of dropping a feature at a given step for a given key
sel_vars			int or slice or list of ints or array of ints representing the variables to drop
sel_steps	NoneType	None	int or slice or list of ints or array of ints representing the steps to drop
kwargs

n_devices = 4
key_var = 0

for sel_vars in [1, [1], [1,3,5], slice(3, 5)]:
    for sel_steps in [None, -1, 27, [27], [25, 26], slice(10, 20)]:
        o = TSTensor(torch.rand(512, 10, 28))
        o[:, key_var] = torch.randint(0, n_devices, (512, 28))
        n_vars = 1 if isinstance(sel_vars, Integral) else len(sel_vars) if isinstance(sel_vars, list) else sel_vars.stop - sel_vars.start
        n_steps = o.shape[-1] if sel_steps is None else 1 if isinstance(sel_steps, Integral) else \
            len(sel_steps) if isinstance(sel_steps, list) else sel_steps.stop - sel_steps.start
        p = torch.rand(n_devices, n_vars, n_steps) * .5 + .5
        output = TSDropFeatByKey(key_var, p, sel_vars, sel_steps)(o)
        assert torch.isnan(output).sum((0, 2))[sel_vars].sum() > 0
        assert torch.isnan(output).sum((0, 2))[~np.array(np.arange(o.shape[1])[sel_vars])].sum() == 0

source

TSClipOutliers

 TSClipOutliers (min=None, max=None, by_sample=False, by_var=False,
                 use_single_batch=False, verbose=False, **kwargs)

Clip outliers batch of type TSTensor based on the IQR

batch_tfms=[TSClipOutliers(-1, 1, verbose=True)]
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, bs=128, num_workers=0, after_batch=batch_tfms)
xb, yb = next(iter(dls.train))
assert xb.max() <= 1
assert xb.min() >= -1
test_close(xb.min(), -1, eps=1e-1)
test_close(xb.max(), 1, eps=1e-1)
xb, yb = next(iter(dls.valid))
test_close(xb.min(), -1, eps=1e-1)
test_close(xb.max(), 1, eps=1e-1)

TSClipOutliers min=-1, max=1

source

TSClip

 TSClip (min=-6, max=6, **kwargs)

Clip batch of type TSTensor

t = TSTensor(torch.randn(10, 20, 100)*10)
test_le(TSClip()(t).max().item(), 6)
test_ge(TSClip()(t).min().item(), -6)

source

TSSelfMissingness

 TSSelfMissingness (sel_vars=None, **kwargs)

Applies missingness from samples in a batch to random samples in the batch for selected variables

t = TSTensor(torch.randn(10, 20, 100))
t[t>.8] = np.nan
t2 = TSSelfMissingness()(t.clone())
t3 = TSSelfMissingness(sel_vars=[0,3,5,7])(t.clone())
assert (torch.isnan(t).sum() < torch.isnan(t2).sum()) and (torch.isnan(t2).sum() >  torch.isnan(t3).sum())

source

TSRobustScale

 TSRobustScale (median=None, iqr=None, quantile_range=(25.0, 75.0),
                use_single_batch=True, exc_vars=None, eps=1e-08,
                verbose=False, **kwargs)

This Scaler removes the median and scales the data according to the quantile range (defaults to IQR: Interquartile Range)

batch_tfms = TSRobustScale(verbose=True, use_single_batch=False)
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, batch_tfms=batch_tfms, num_workers=0)
xb, yb = next(iter(dls.train))
xb.min()

TSRobustScale median=torch.Size([1, 24, 1]) iqr=torch.Size([1, 24, 1])

TSTensor([-2.3502116203308105], device=cpu, dtype=torch.float32)

exc_vars = [0, 2, 6, 8, 12]
batch_tfms = TSRobustScale(use_single_batch=False, exc_vars=exc_vars)
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, batch_tfms=batch_tfms, num_workers=0)
xb, yb = next(iter(dls.train))
test_eq(len(dls.train.after_batch.fs[0].median.flatten()), 24)
test_eq(len(dls.train.after_batch.fs[0].iqr.flatten()), 24)
test_eq(dls.train.after_batch.fs[0].median.flatten()[exc_vars].cpu(), torch.zeros(len(exc_vars)))
test_eq(dls.train.after_batch.fs[0].iqr.flatten()[exc_vars].cpu(), torch.ones(len(exc_vars)))
print(dls.train.after_batch.fs[0].median.flatten().data)
print(dls.train.after_batch.fs[0].iqr.flatten().data)

tensor([ 0.0000, -1.7305,  0.0000,  0.7365, -1.2736, -0.5528,  0.0000, -0.7074,
         0.0000,  0.7087, -0.7014, -0.1120,  0.0000, -1.3332, -0.5958,  0.7563,
        -1.0129, -0.3985, -0.5186, -1.5125, -0.7353,  0.7326, -1.1495, -0.5359])
tensor([1.0000, 4.2788, 1.0000, 4.8008, 8.0682, 2.2777, 1.0000, 0.6955, 1.0000,
        1.4875, 2.6386, 1.4756, 1.0000, 2.9811, 1.2507, 3.2291, 5.9906, 1.9098,
        1.3428, 3.6368, 1.3689, 4.4213, 6.9907, 2.1939])

source

TSGaussianStandardize

 TSGaussianStandardize (E_mean:np.ndarray, S_mean:np.ndarray,
                        E_std:np.ndarray, S_std:np.ndarray, eps=1e-08,
                        split_idx=0, **kwargs)

Scales each batch using modeled mean and std based on UNCERTAINTY MODELING FOR OUT-OF-DISTRIBUTION GENERALIZATION https://arxiv.org/abs/2202.03958

	Type	Default	Details
E_mean	np.ndarray		Mean expected value
S_mean	np.ndarray		Uncertainty (standard deviation) of the mean
E_std	np.ndarray		Standard deviation expected value
S_std	np.ndarray		Uncertainty (standard deviation) of the standard deviation
eps	float	1e-08	(epsilon) small amount added to standard deviation to avoid deviding by zero
split_idx	int	0	Flag to indicate to which set is this transofrm applied. 0: training, 1:validation, None:both
kwargs

source

get_random_stats

 get_random_stats (E_mean, S_mean, E_std, S_std)

source

get_stats_with_uncertainty

 get_stats_with_uncertainty (o, sel_vars=None,
                             sel_vars_zero_mean_unit_var=False, bs=64,
                             n_trials=None, axis=(0, 2))

arr = np.random.rand(1000, 2, 50)
E_mean, S_mean, E_std, S_std = get_stats_with_uncertainty(arr, sel_vars=None, bs=64, n_trials=None, axis=(0,2))
new_mean, new_std = get_random_stats(E_mean, S_mean, E_std, S_std)
new_mean2, new_std2 = get_random_stats(E_mean, S_mean, E_std, S_std)
test_ne(new_mean, new_mean2)
test_ne(new_std, new_std2)
test_eq(new_mean.shape, (1, 2, 1))
test_eq(new_std.shape, (1, 2, 1))
new_mean, new_std

100.00% [15/15 00:00<00:00]

(array([[[0.49649504],
         [0.49636062]]]),
 array([[[0.28626438],
         [0.28665599]]]))

TSGaussianStandardize can be used jointly with TSStandardized in the following way:

X, y, splits = get_UCR_data('LSST', split_data=False)
tfms = [None, TSClassification()]
E_mean, S_mean, E_std, S_std = get_stats_with_uncertainty(X, sel_vars=None, bs=64, n_trials=None, axis=(0,2))
batch_tfms = [TSGaussianStandardize(E_mean, S_mean, E_std, S_std, split_idx=0), TSStandardize(E_mean, S_mean, split_idx=1)]
dls = get_ts_dls(X, y, splits=splits, tfms=tfms, batch_tfms=batch_tfms, bs=[32, 64])
learn = ts_learner(dls, InceptionTimePlus, metrics=accuracy, cbs=[ShowGraph()])
learn.fit_one_cycle(1, 1e-2)

In this way the train batches are scaled based on mean and standard deviation distributions while the valid batches are scaled with a fixed mean and standard deviation values.

The intent is to improve out-of-distribution performance. This method is inspired by UNCERTAINTY MODELING FOR OUT-OF-DISTRIBUTION GENERALIZATION https://arxiv.org/abs/2202.03958.

source

TSDiff

 TSDiff (lag=1, pad=True, **kwargs)

Differences batch of type TSTensor

t = TSTensor(torch.arange(24).reshape(2,3,4))
test_eq(TSDiff()(t)[..., 1:].float().mean(), 1)
test_eq(TSDiff(lag=2, pad=False)(t).float().mean(), 2)

source

TSLog

 TSLog (ex=None, **kwargs)

Log transforms batch of type TSTensor + 1. Accepts positive and negative numbers

t = TSTensor(torch.rand(2,3,4)) * 2 - 1 
tfm = TSLog()
enc_t = tfm(t)
test_ne(enc_t, t)
test_close(tfm.decodes(enc_t).data, t.data)

source

TSCyclicalPosition

 TSCyclicalPosition (cyclical_var=None, magnitude=None, drop_var=False,
                     **kwargs)

Concatenates the position along the sequence as 2 additional variables (sine and cosine)

	Type	Default	Details
cyclical_var	NoneType	None	Optional variable to indicate the steps withing the cycle (ie minute of the day)
magnitude	NoneType	None	Added for compatibility. It’s not used.
drop_var	bool	False	Flag to indicate if the cyclical var is removed
kwargs

bs, c_in, seq_len = 1,3,100
t = TSTensor(torch.rand(bs, c_in, seq_len))
enc_t = TSCyclicalPosition()(t)
test_ne(enc_t, t)
assert t.shape[1] == enc_t.shape[1] - 2
plt.plot(enc_t[0, -2:].cpu().numpy().T)
plt.show()

bs, c_in, seq_len = 1,3,100
t1 = torch.rand(bs, c_in, seq_len)
t2 = torch.arange(seq_len)
t2 = torch.cat([t2[35:], t2[:35]]).reshape(1, 1, -1)
t = TSTensor(torch.cat([t1, t2], 1))
mask = torch.rand_like(t) > .8
t[mask] = np.nan
enc_t = TSCyclicalPosition(3)(t)
test_ne(enc_t, t)
assert t.shape[1] == enc_t.shape[1] - 2
plt.plot(enc_t[0, -2:].cpu().numpy().T)
plt.show()

source

TSLinearPosition

 TSLinearPosition (linear_var:int=None, var_range:tuple=None,
                   magnitude=None, drop_var:bool=False,
                   lin_range:tuple=(-1, 1), **kwargs)

Concatenates the position along the sequence as 1 additional variable

	Type	Default	Details
linear_var	int	None	Optional variable to indicate the steps withing the cycle (ie minute of the day)
var_range	tuple	None	Optional range indicating min and max values of the linear variable
magnitude	NoneType	None	Added for compatibility. It’s not used.
drop_var	bool	False	Flag to indicate if the cyclical var is removed
lin_range	tuple	(-1, 1)
kwargs

bs, c_in, seq_len = 1,3,100
t = TSTensor(torch.rand(bs, c_in, seq_len))
enc_t = TSLinearPosition()(t)
test_ne(enc_t, t)
assert t.shape[1] == enc_t.shape[1] - 1
plt.plot(enc_t[0, -1].cpu().numpy().T)
plt.show()

t = torch.arange(100)
t1 = torch.cat([t[30:], t[:30]]).reshape(1, 1, -1)
t2 = torch.cat([t[52:], t[:52]]).reshape(1, 1, -1)
t = torch.cat([t1, t2]).float()
mask = torch.rand_like(t) > .8
t[mask] = np.nan
t = TSTensor(t)
enc_t = TSLinearPosition(linear_var=0, var_range=(0, 100), drop_var=True)(t)
test_ne(enc_t, t)
assert t.shape[1] == enc_t.shape[1]
plt.plot(enc_t[0, -1].cpu().numpy().T)
plt.show()

source

TSMissingness

 TSMissingness (sel_vars=None, feature_idxs=None, magnitude=None,
                **kwargs)

Concatenates data missingness for selected features along the sequence as additional variables

bs, c_in, seq_len = 1,3,100
t = TSTensor(torch.rand(bs, c_in, seq_len))
t[t>.5] = np.nan
enc_t = TSMissingness(sel_vars=[0,2])(t)
test_eq(enc_t.shape[1], 5)
test_eq(enc_t[:, 3:], torch.isnan(t[:, [0,2]]).float())

source

TSPositionGaps

 TSPositionGaps (sel_vars=None, feature_idxs=None, magnitude=None,
                 forward=True, backward=False, nearest=False,
                 normalize=True, **kwargs)

Concatenates gaps for selected features along the sequence as additional variables

bs, c_in, seq_len = 1,3,8
t = TSTensor(torch.rand(bs, c_in, seq_len))
t[t>.5] = np.nan
enc_t = TSPositionGaps(sel_vars=[0,2], forward=True, backward=True, nearest=True, normalize=False)(t)
test_eq(enc_t.shape[1], 9)
enc_t.data

tensor([[[0.2875, 0.0553,    nan,    nan, 0.1478, 0.1234, 0.0835, 0.1465],
         [   nan,    nan, 0.3967,    nan, 0.0654,    nan, 0.2289, 0.1094],
         [0.3820, 0.1613, 0.4825, 0.1379,    nan,    nan, 0.3000, 0.4673],
         [1.0000, 1.0000, 1.0000, 2.0000, 3.0000, 1.0000, 1.0000, 1.0000],
         [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 2.0000, 3.0000, 1.0000],
         [1.0000, 3.0000, 2.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
         [1.0000, 1.0000, 1.0000, 3.0000, 2.0000, 1.0000, 1.0000, 1.0000],
         [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
         [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000]]])

source

TSRollingMean

 TSRollingMean (sel_vars=None, feature_idxs=None, magnitude=None,
                window=2, replace=False, **kwargs)

Calculates the rolling mean for all/ selected features alongside the sequence

It replaces the original values or adds additional variables (default) If nan values are found, they will be filled forward and backward

bs, c_in, seq_len = 1,3,8
t = TSTensor(torch.rand(bs, c_in, seq_len))
t[t > .6] = np.nan
print(t.data)
enc_t = TSRollingMean(sel_vars=[0,2], window=3)(t)
test_eq(enc_t.shape[1], 5)
print(enc_t.data)
enc_t = TSRollingMean(window=3, replace=True)(t)
test_eq(enc_t.shape[1], 3)
print(enc_t.data)

tensor([[[   nan, 0.3836,    nan,    nan, 0.0237, 0.4363,    nan, 0.1834],
         [0.2749, 0.5018,    nan, 0.4008, 0.2797, 0.4010, 0.4323, 0.3692],
         [0.4013,    nan, 0.1272, 0.2202, 0.4324, 0.3293, 0.5350, 0.3919]]])
tensor([[[0.3836, 0.3836, 0.3836, 0.3836, 0.0237, 0.4363, 0.4363, 0.1834],
         [0.2749, 0.5018,    nan, 0.4008, 0.2797, 0.4010, 0.4323, 0.3692],
         [0.4013, 0.4013, 0.1272, 0.2202, 0.4324, 0.3293, 0.5350, 0.3919],
         [0.3836, 0.3836, 0.3836, 0.3836, 0.2636, 0.2812, 0.2988, 0.3520],
         [0.4013, 0.4013, 0.3099, 0.2496, 0.2599, 0.3273, 0.4322, 0.4187]]])
tensor([[[0.3836, 0.3836, 0.3836, 0.3836, 0.2636, 0.2812, 0.2988, 0.3520],
         [0.2749, 0.3883, 0.4261, 0.4681, 0.3941, 0.3605, 0.3710, 0.4008],
         [0.4013, 0.4013, 0.3099, 0.2496, 0.2599, 0.3273, 0.4322, 0.4187]]])

source

TSLogReturn

 TSLogReturn (lag=1, pad=True, **kwargs)

Calculates log-return of batch of type TSTensor. For positive values only

t = TSTensor([1,2,4,8,16,32,64,128,256]).float()
test_eq(TSLogReturn(pad=False)(t).std(), 0)

source

TSAdd

 TSAdd (add, **kwargs)

Add a defined amount to each batch of type TSTensor.

t = TSTensor([1,2,3]).float()
test_eq(TSAdd(1)(t), TSTensor([2,3,4]).float())

source

TSClipByVar

 TSClipByVar (var_min_max, **kwargs)

Clip batch of type TSTensor by variable

Args: var_min_max: list of tuples containing variable index, min value (or None) and max value (or None)

t = TSTensor(torch.rand(16, 3, 10) * tensor([1,10,100]).reshape(1,-1,1))
max_values = t.max(0).values.max(-1).values.data
max_values2 = TSClipByVar([(1,None,5), (2,10,50)])(t).max(0).values.max(-1).values.data
test_le(max_values2[1], 5)
test_ge(max_values2[2], 10)
test_le(max_values2[2], 50)

source

TSDropVars

 TSDropVars (drop_vars, **kwargs)

Drops selected variable from the input

t = TSTensor(torch.arange(24).reshape(2, 3, 4))
enc_t = TSDropVars(2)(t)
test_ne(t, enc_t)
enc_t.data

tensor([[[ 0,  1,  2,  3],
         [ 4,  5,  6,  7]],

        [[12, 13, 14, 15],
         [16, 17, 18, 19]]])

source

TSOneHotEncode

 TSOneHotEncode (sel_var:int, unique_labels:list, add_na:bool=False,
                 drop_var:bool=True, magnitude=None, **kwargs)

Delegates (__call__,decode,setup) to (encodes,decodes,setups) if split_idx matches

	Type	Default	Details
sel_var	int		Variable that is one-hot encoded
unique_labels	list		List containing all labels (excluding nan values)
add_na	bool	False	Flag to indicate if values not included in vocab should be set as 0
drop_var	bool	True	Flag to indicate if the selected var is removed
magnitude	NoneType	None	Added for compatibility. It’s not used.
kwargs

bs = 2
seq_len = 5
t_cont = torch.rand(bs, 1, seq_len)
t_cat = torch.randint(0, 3, t_cont.shape)
t = TSTensor(torch.cat([t_cat, t_cont], 1))
t_cat

tensor([[[0, 2, 1, 0, 2]],

        [[0, 0, 1, 1, 2]]])

tfm = TSOneHotEncode(0, [0, 1, 2])
output = tfm(t)[:, -3:].data
test_eq(t_cat, torch.argmax(tfm(t)[:, -3:], 1)[:, None])
tfm(t)[:, -3:].data

tensor([[[1., 0., 0., 1., 0.],
         [0., 0., 1., 0., 0.],
         [0., 1., 0., 0., 1.]],

        [[1., 1., 0., 0., 0.],
         [0., 0., 1., 1., 0.],
         [0., 0., 0., 0., 1.]]])

bs = 2
seq_len = 5
t_cont = torch.rand(bs, 1, seq_len)
t_cat = torch.tensor([[10.,  5., 11., np.nan, 12.], [ 5., 12., 10., np.nan, 11.]])[:, None]
t = TSTensor(torch.cat([t_cat, t_cont], 1))
t_cat

tensor([[[10.,  5., 11., nan, 12.]],

        [[ 5., 12., 10., nan, 11.]]])

tfm = TSOneHotEncode(0, [10, 11, 12], drop_var=False)
mask = ~torch.isnan(t[:, 0])
test_eq(tfm(t)[:, 0][mask], t[:, 0][mask])
tfm(t)[:, -3:].data

tensor([[[1., 0., 0., 0., 0.],
         [0., 0., 1., 0., 0.],
         [0., 0., 0., 0., 1.]],

        [[0., 0., 1., 0., 0.],
         [0., 0., 0., 0., 1.],
         [0., 1., 0., 0., 0.]]])

t1 = torch.randint(3, 7, (2, 1, 10))
t2 = torch.rand(2, 1, 10)
t = TSTensor(torch.cat([t1, t2], 1))
output = TSOneHotEncode(0, [3, 4, 5], add_na=True, drop_var=True)(t)
test_eq((t1 > 5).float(), output.data[:, [1]])
test_eq((t1 == 3).float(), output.data[:, [2]])
test_eq((t1 == 4).float(), output.data[:, [3]])
test_eq((t1 == 5).float(), output.data[:, [4]])
test_eq(output.shape, (t.shape[0], 5, t.shape[-1]))

source

TSPosition

 TSPosition (steps:list, magnitude=None, **kwargs)

Delegates (__call__,decode,setup) to (encodes,decodes,setups) if split_idx matches

	Type	Default	Details
steps	list		List containing the steps passed as an additional variable. Theu should be normalized.
magnitude	NoneType	None	Added for compatibility. It’s not used.
kwargs

t = TSTensor(torch.rand(2, 1, 10)).float()
a = np.linspace(-1, 1, 10).astype('float64')
TSPosition(a)(t).data.dtype, t.dtype

(torch.float32, torch.float32)

source

PatchEncoder

 PatchEncoder (patch_len:int, patch_stride:int=None,
               pad_at_start:bool=True, value:float=0.0, seq_len:int=None,
               merge_dims:bool=True, reduction:str='none',
               reduction_dim:int=-1, swap_dims:tuple=None)

Creates a sequence of patches from a 3d input tensor.

	Type	Default	Details
patch_len	int		Number of time steps in each patch.
patch_stride	int	None	Stride of the patch.
pad_at_start	bool	True	If True, pad the input tensor at the start to ensure that the input tensor is evenly divisible by the patch length.
value	float	0.0	Value to pad the input tensor with.
seq_len	int	None	Number of time steps in the input tensor. If None, make sure seq_len >= patch_len and a multiple of stride
merge_dims	bool	True	If True, merge channels within the same patch.
reduction	str	none	type of reduction applied. Available: “none”, “mean”, “min”, “max”, “mode”
reduction_dim	int	-1	dimension where the reduction is applied
swap_dims	tuple	None	If True, swap the time and channel dimensions.

seq_len = 17
patch_len = 10
patch_stride = 5

z11 = torch.arange(seq_len).reshape(1, 1, -1)
z12 = torch.arange(seq_len).reshape(1, 1, -1) * 10
z1 = torch.cat((z11, z12), dim=1)
z21 = torch.arange(seq_len).reshape(1, 1, -1)
z22 = torch.arange(seq_len).reshape(1, 1, -1) * 10
z2 = torch.cat((z21, z22), dim=1) + 1
z31 = torch.arange(seq_len).reshape(1, 1, -1)
z32 = torch.arange(seq_len).reshape(1, 1, -1) * 10
z3 = torch.cat((z31, z32), dim=1) + 2
z = torch.cat((z11, z21, z31), dim=0)
z = torch.cat((z1, z2, z3), dim=0)
print(z.shape, "\n")
print(z)

patch_encoder = PatchEncoder(patch_len=patch_len, patch_stride=patch_stride, value=-1, seq_len=seq_len, merge_dims=True)
output = patch_encoder(z)
print(output.shape, "\n")
first_token = output[..., 0]
expected_first_token = torch.tensor([[-1, -1, -1,  0,  1,  2,  3,  4,  5,  6, -1, -1, -1,  0, 10, 20, 30, 40,
         50, 60],
        [-1, -1, -1,  1,  2,  3,  4,  5,  6,  7, -1, -1, -1,  1, 11, 21, 31, 41,
         51, 61],
        [-1, -1, -1,  2,  3,  4,  5,  6,  7,  8, -1, -1, -1,  2, 12, 22, 32, 42,
         52, 62]])
test_eq(first_token, expected_first_token)

torch.Size([3, 2, 17]) 

tensor([[[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
           14,  15,  16],
         [  0,  10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 120, 130,
          140, 150, 160]],

        [[  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
           15,  16,  17],
         [  1,  11,  21,  31,  41,  51,  61,  71,  81,  91, 101, 111, 121, 131,
          141, 151, 161]],

        [[  2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
           16,  17,  18],
         [  2,  12,  22,  32,  42,  52,  62,  72,  82,  92, 102, 112, 122, 132,
          142, 152, 162]]])
torch.Size([3, 20, 3])

source

TSPatchEncoder

 TSPatchEncoder (patch_len:int, patch_stride:int=None,
                 pad_at_start:bool=True, value:float=0.0,
                 seq_len:int=None, merge_dims:bool=True,
                 reduction:str='none', reduction_dim:int=-2,
                 swap_dims:tuple=None)

Tansforms a time series into a sequence of patches along the last dimension

	Type	Default	Details
patch_len	int		Number of time steps in each patch.
patch_stride	int	None	Stride of the patch.
pad_at_start	bool	True	If True, pad the input tensor at the start to ensure that the input tensor is evenly divisible by the patch length.
value	float	0.0	Value to pad the input tensor with.
seq_len	int	None	Number of time steps in the input tensor. If None, make sure seq_len >= patch_len and a multiple of stride
merge_dims	bool	True	If True, merge channels within the same patch.
reduction	str	none	type of reduction applied. Available: “none”, “mean”, “min”, “max”, “mode”
reduction_dim	int	-2	dimension where the y reduction is applied.
swap_dims	tuple	None	If True, swap the time and channel dimensions.

bs = 2
c_in = 1
seq_len = 10
patch_len = 4

t = TSTensor(torch.arange(bs * c_in * seq_len).reshape(bs, c_in, seq_len))
print(t.data)
print(t.shape, "\n")

patch_encoder = TSPatchEncoder(patch_len=patch_len, patch_stride=1, seq_len=seq_len)
output = patch_encoder(t)
test_eq(output.shape, ([bs, patch_len, 7]))
print("first patch:\n", output[..., 0].data, "\n")

patch_encoder = TSPatchEncoder(patch_len=patch_len, patch_stride=None, seq_len=seq_len)
output = patch_encoder(t)
test_eq(output.shape, ([bs, patch_len, 3]))
print("first patch:\n", output[..., 0].data, "\n")

tensor([[[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9]],

        [[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]])
torch.Size([2, 1, 10]) 

first patch:
 tensor([[ 0,  1,  2,  3],
        [10, 11, 12, 13]]) 

first patch:
 tensor([[ 0,  0,  0,  1],
        [ 0,  0, 10, 11]])

source

TSTuplePatchEncoder

 TSTuplePatchEncoder (patch_len:int, patch_stride:int=None,
                      pad_at_start:bool=True, value:float=0.0,
                      seq_len:int=None, merge_dims:bool=True,
                      reduction:str='none', reduction_dim:int=-2,
                      swap_dims:tuple=None)

Tansforms a time series with x and y into sequences of patches along the last dimension

	Type	Default	Details
patch_len	int		Number of time steps in each patch.
patch_stride	int	None	Stride of the patch.
pad_at_start	bool	True	If True, pad the input tensor at the start to ensure that the input tensor is evenly divisible by the patch length.
value	float	0.0	Value to pad the input tensor with.
seq_len	int	None	Number of time steps in the input tensor. If None, make sure seq_len >= patch_len and a multiple of stride
merge_dims	bool	True	If True, merge y channels within the same patch.
reduction	str	none	type of reduction applied to y. Available: “none”, “mean”, “min”, “max”, “mode”
reduction_dim	int	-2	dimension where the y reduction is applied.
swap_dims	tuple	None	If True, swap the time and channel dimensions in y.

# test
bs = 2
c_in = 2
seq_len = 10
patch_len = 4

x = torch.arange(bs * c_in * seq_len).reshape(bs, c_in, seq_len)
y = torch.arange(bs * c_in * seq_len).reshape(bs, c_in, seq_len) * 10
print(x)
print(y)


patch_encoder = TSTuplePatchEncoder(patch_len=patch_len, patch_stride=1, seq_len=seq_len, merge_dims=True)
x_out, y_out = patch_encoder((x, y))
test_eq(x_out.shape, ([bs, c_in * patch_len, 7]))
test_eq(y_out.shape, ([bs, c_in * patch_len, 7]))
print("first x patch:\n", x_out[..., 0].data, "\n")
print("first y patch:\n", y_out[..., 0].data, "\n")

patch_encoder = TSTuplePatchEncoder(patch_len=patch_len, patch_stride=1, seq_len=seq_len, merge_dims=False, reduction="max")
x_out, y_out = patch_encoder((x, y))
test_eq(x_out.shape, ([bs, c_in * patch_len, 7]))
test_eq(y_out.shape, ([bs, c_in, 7]))
print("first x patch:\n", x_out[..., 0].data, "\n")
print("first y patch:\n", y_out[..., 0].data, "\n")

tensor([[[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
         [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]],

        [[20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
         [30, 31, 32, 33, 34, 35, 36, 37, 38, 39]]])
tensor([[[  0,  10,  20,  30,  40,  50,  60,  70,  80,  90],
         [100, 110, 120, 130, 140, 150, 160, 170, 180, 190]],

        [[200, 210, 220, 230, 240, 250, 260, 270, 280, 290],
         [300, 310, 320, 330, 340, 350, 360, 370, 380, 390]]])
first x patch:
 tensor([[ 0,  1,  2,  3, 10, 11, 12, 13],
        [20, 21, 22, 23, 30, 31, 32, 33]]) 

first y patch:
 tensor([[  0,  10,  20,  30, 100, 110, 120, 130],
        [200, 210, 220, 230, 300, 310, 320, 330]]) 

first x patch:
 tensor([[ 0,  1,  2,  3, 10, 11, 12, 13],
        [20, 21, 22, 23, 30, 31, 32, 33]]) 

first y patch:
 tensor([[ 30, 130],
        [230, 330]])

sklearn API transforms

source

object2date

 object2date (x, format=None)

source

TSShrinkDataFrame

 TSShrinkDataFrame (columns=None, skip=None, obj2cat=True, int2uint=False,
                    verbose=True)

A transformer to shrink dataframe or series memory usage

	Type	Default	Details
columns	NoneType	None	List[str], optional. Columns to shrink, all columns by default.
skip	NoneType	None	List[str], optional. Columns to skip, None by default.
obj2cat	bool	True	bool, optional. Convert object columns to category, True by default.
int2uint	bool	False	bool, optional. Convert int columns to uint, False by default.
verbose	bool	True	bool, optional. Print memory usage info. True by default.

df = pd.DataFrame()
df["ints64"] = np.random.randint(0,3,10)
df['floats64'] = np.random.rand(10)
tfm = TSShrinkDataFrame()
df = tfm.fit_transform(df)
test_eq(df["ints64"].dtype, "int8")
test_eq(df["floats64"].dtype, "float32")

Initial memory usage: 288.00 B  
Final memory usage  : 178.00 B   (-38.2%)

# test with date
df = pd.DataFrame()
df["dates"] = pd.date_range('1/1/2011', periods=10, freq='M').astype(str)
df["ints64"] = np.random.randint(0,3,10)
df['floats64'] = np.random.rand(10)
tfm = TSShrinkDataFrame()
df = tfm.fit_transform(df)
test_eq(df["dates"].dtype, "datetime64[ns]")
test_eq(df["ints64"].dtype, "int8")
test_eq(df["floats64"].dtype, "float32")

Initial memory usage: 368.00 B  
Final memory usage  : 258.00 B   (-29.9%)

# test with date and series
df = pd.DataFrame()
df["dates"] = pd.date_range('1/1/2011', periods=10, freq='M').astype(str)
tfm = TSShrinkDataFrame()
df = tfm.fit_transform(df["dates"])
test_eq(df.dtype, "datetime64[ns]")

Initial memory usage: 208.00 B  
Final memory usage  : 208.00 B   (0.0%)

source

TSOneHotEncoder

 TSOneHotEncoder (columns=None, drop=True, add_na=True, dtype=<class
                  'numpy.int8'>)

Encode categorical variables using one-hot encoding

	Type	Default	Details
columns	NoneType	None	(str or List[str], optional): Column name(s) to encode. If None, all columns will be encoded. Defaults to None.
drop	bool	True	(bool, optional): Whether to drop the original columns after encoding. Defaults to True.
add_na	bool	True	(bool, optional): Whether to add a ‘NaN’ category for missing values. Defaults to True.
dtype	type	int8	(type, optional): Data type of the encoded output. Defaults to np.int64.

df = pd.DataFrame()
df["a"] = np.random.randint(0,2,10)
df["b"] = np.random.randint(0,3,10)
unique_cols = len(df["a"].unique()) + len(df["b"].unique())
tfm = TSOneHotEncoder()
tfm.fit(df)
df = tfm.transform(df)
test_eq(df.shape[1], unique_cols)

source

TSCategoricalEncoder

 TSCategoricalEncoder (columns=None, add_na=True, sort=True,
                       categories='auto', inplace=True, prefix=None,
                       suffix=None, drop=False)

A transformer to encode categorical columns

	Type	Default	Details
columns	NoneType	None	List[str], optional. Columns to encode, all columns by default.
add_na	bool	True	bool, optional. Add a NaN category, True by default.
sort	bool	True	bool, optional. Sort categories by frequency, True by default.
categories	str	auto	dict, optional. The custom mapping of categories. ‘auto’ by default.
inplace	bool	True	bool, optional. Modify input DataFrame, True by default.
prefix	NoneType	None	str, optional. Prefix for created column names. None by default.
suffix	NoneType	None	str, optional. Suffix for created column names. None by default.
drop	bool	False	bool, optional. Drop original columns, False by default.

Stateful transforms like TSCategoricalEncoder can easily be serialized.

import joblib

df = pd.DataFrame()
df["a"] = alphabet[np.random.randint(0,2,100)]
df["b"] = ALPHABET[np.random.randint(0,3,100)]
display(df)
a_unique = len(df["a"].unique())
b_unique = len(df["b"].unique())
tfm = TSCategoricalEncoder()
tfm.fit(df, idxs=slice(0, 50))
joblib.dump(tfm, "data/TSCategoricalEncoder.joblib")
tfm = joblib.load("data/TSCategoricalEncoder.joblib")
df.loc[0, "a"] = 'z'
df.loc[1, "a"] = 'h'
df = tfm.transform(df)
display(df)
test_eq(df['a'].max(), a_unique)
test_eq(df['b'].max(), b_unique)
df = tfm.inverse_transform(df)
display(df)

	a	b
0	b	B
1	b	A
2	b	C
3	a	C
4	b	C
...	...	...
95	a	A
96	a	A
97	a	B
98	a	A
99	b	B

100 rows × 2 columns

	a	b
0	0	2
1	0	1
2	2	3
3	1	3
4	2	3
...	...	...
95	1	1
96	1	1
97	1	2
98	1	1
99	2	2

100 rows × 2 columns

	a	b
0	#na#	B
1	#na#	A
2	b	C
3	a	C
4	b	C
...	...	...
95	a	A
96	a	A
97	a	B
98	a	A
99	b	B

100 rows × 2 columns

df = pd.DataFrame()
df["a"] = alphabet[np.random.randint(0,2,100)]
df["a"] = df["a"].astype('category')
df["b"] = ALPHABET[np.random.randint(0,3,100)]
display(df)
a_unique = len(df["a"].unique())
b_unique = len(df["b"].unique())
tfm = TSCategoricalEncoder()
tfm.fit(df)
joblib.dump(tfm, "data/TSCategoricalEncoder.joblib")
tfm = joblib.load("data/TSCategoricalEncoder.joblib")
df["a"] = alphabet[np.random.randint(0,5,100)]
df["a"] = df["a"].astype('category')
df["b"] = ALPHABET[np.random.randint(0,3,100)]
display(df)
df = tfm.transform(df)
display(df)
test_eq(df['a'].max(), a_unique)
test_eq(df['b'].max(), b_unique)
df = tfm.inverse_transform(df)
display(df)

	a	b
0	b	B
1	a	C
2	b	C
3	a	C
4	b	B
...	...	...
95	b	A
96	b	A
97	a	A
98	b	B
99	b	B

100 rows × 2 columns

	a	b
0	d	A
1	a	A
2	c	A
3	a	A
4	a	B
...	...	...
95	c	C
96	d	B
97	c	A
98	b	B
99	e	B

100 rows × 2 columns

	a	b
0	0	1
1	1	1
2	0	1
3	1	1
4	1	2
...	...	...
95	0	3
96	0	2
97	0	1
98	2	2
99	0	2

100 rows × 2 columns

	a	b
0	#na#	A
1	a	A
2	#na#	A
3	a	A
4	a	B
...	...	...
95	#na#	C
96	#na#	B
97	#na#	A
98	b	B
99	#na#	B

100 rows × 2 columns

df = pd.DataFrame()
df["a"] = alphabet[np.random.randint(0,2,100)]
df["a"] = df["a"].astype('category')
s = df['a']
display(s)
tfm = TSCategoricalEncoder()
tfm.fit(s)
joblib.dump(tfm, "data/TSCategoricalEncoder.joblib")
tfm = joblib.load("data/TSCategoricalEncoder.joblib")
s = tfm.transform(s)
display(s)
s = tfm.inverse_transform(s)
display(s)

0     a
1     b
2     a
3     a
4     a
     ..
95    a
96    a
97    a
98    a
99    b
Name: a, Length: 100, dtype: category
Categories (2, object): ['a', 'b']

0     1
1     2
2     1
3     1
4     1
     ..
95    1
96    1
97    1
98    1
99    2
Length: 100, dtype: int8

0     a
1     b
2     a
3     a
4     a
     ..
95    a
96    a
97    a
98    a
99    b
Length: 100, dtype: object

source

TSTargetEncoder

 TSTargetEncoder (target_column, columns=None, inplace=True, prefix=None,
                  suffix=None, drop=True, dtypes=['object', 'category'])

Mixin class for all transformers in scikit-learn.

If :term:get_feature_names_out is defined, then :class:BaseEstimator will automatically wrap transform and fit_transform to follow the set_output API. See the :ref:developer_api_set_output for details.

class:`OneToOneFeatureMixin` and :class:`ClassNamePrefixFeaturesOutMixin` are helpful mixins for defining :term:`get_feature_names_out`.
	Type	Default	Details
target_column			column containing the target
columns	NoneType	None	List[str], optional. Columns to encode, all non-numerical columns by default.
inplace	bool	True	bool, optional. Modify input DataFrame, True by default.
prefix	NoneType	None	str, optional. Prefix for created column names. None by default.
suffix	NoneType	None	str, optional. Suffix for created column names. None by default.
drop	bool	True	bool, optional. Drop original columns, False by default.
dtypes	list	[‘object’, ‘category’]	List[str]. List with dtypes that will be used to identify columns to encode if not explicitly passed.

from sklearn.model_selection import train_test_split

# Create a dataframe with 100 rows
np.random.seed(42)
df = pd.DataFrame({
    'category1': np.random.choice(['cat', 'dog', 'rabbit'], 100),
    'category2': np.random.choice(['large', 'small'], 100),
    'continuous': np.random.rand(100),
    'target': np.random.randint(0, 2, 100)
})

display(df)

# Split the data into train and test sets
train_idx, test_idx = train_test_split(np.arange(100), test_size=0.2, random_state=42)
print(train_idx.shape)

# Initialize the encoder
encoder = TSTargetEncoder(columns=['category1', 'category2'], target_column='target', inplace=False, suffix="te", drop=False)

# Fit the encoder using the training data
encoder.fit(df, idxs=train_idx)

# Transform the whole dataframe
df_encoded = encoder.transform(df)

# Check the results
for c in ["category1", "category2"]:
    for v in df[c].unique():
        assert df.loc[train_idx][df.loc[train_idx, c] == v]["target"].mean() == df_encoded[df_encoded[c] == v][f"{c}_te"].mean()
        
df_encoded

	category1	category2	continuous	target
0	rabbit	small	0.896091	0
1	cat	small	0.318003	1
2	rabbit	small	0.110052	1
3	rabbit	large	0.227935	0
4	cat	large	0.427108	0
...	...	...	...	...
95	cat	small	0.325400	0
96	cat	large	0.746491	0
97	rabbit	small	0.649633	1
98	cat	small	0.849223	0
99	cat	large	0.657613	1

100 rows × 4 columns

(80,)

	category1	category2	continuous	target	category1_te	category2_te
0	rabbit	small	0.896091	0	0.565217	0.500000
1	cat	small	0.318003	1	0.555556	0.500000
2	rabbit	small	0.110052	1	0.565217	0.500000
3	rabbit	large	0.227935	0	0.565217	0.521739
4	cat	large	0.427108	0	0.555556	0.521739
...	...	...	...	...	...	...
95	cat	small	0.325400	0	0.555556	0.500000
96	cat	large	0.746491	0	0.555556	0.521739
97	rabbit	small	0.649633	1	0.565217	0.500000
98	cat	small	0.849223	0	0.555556	0.500000
99	cat	large	0.657613	1	0.555556	0.521739

100 rows × 6 columns

source

TSDateTimeEncoder

 TSDateTimeEncoder (datetime_columns=None, prefix=None, drop=True,
                    time=False, attr=['Year', 'Month', 'Week', 'Day',
                    'Dayofweek', 'Dayofyear', 'Is_month_end',
                    'Is_month_start', 'Is_quarter_end',
                    'Is_quarter_start', 'Is_year_end', 'Is_year_start'])

Base class for all estimators in scikit-learn.

Inheriting from this class provides default implementations of:

setting and getting parameters used by GridSearchCV and friends;
textual and HTML representation displayed in terminals and IDEs;
estimator serialization;
parameters validation;
data validation;
feature names validation.

Read more in the :ref:User Guide <rolling_your_own_estimator>.

import datetime as dt

df = pd.DataFrame()
df.loc[0, "date"] = dt.datetime.now()
df.loc[1, "date"] = dt.datetime.now() + pd.Timedelta(1, unit="D")
tfm = TSDateTimeEncoder()
joblib.dump(tfm, "data/TSDateTimeEncoder.joblib")
tfm = joblib.load("data/TSDateTimeEncoder.joblib")
tfm.fit_transform(df)

	_Year	_Month	_Week	_Day	_Dayofweek	_Dayofyear	_Is_month_end	_Is_month_start	_Is_quarter_end	_Is_quarter_start	_Is_year_end	_Is_year_start
0	2023	6	24	17	5	168	False	False	False	False	False	False
1	2023	6	24	18	6	169	False	False	False	False	False	False

source

TSDropIfTrueCols

 TSDropIfTrueCols (columns=None)

Base class for all estimators in scikit-learn.

Inheriting from this class provides default implementations of:

setting and getting parameters used by GridSearchCV and friends;
textual and HTML representation displayed in terminals and IDEs;
estimator serialization;
parameters validation;
data validation;
feature names validation.

Read more in the :ref:User Guide <rolling_your_own_estimator>.

# test TSDropIfTrueCols
df = pd.DataFrame()
df["a"] = [0, 0, 1, 0, 0]
df["b"] = [0, 0, 0, 0, 0]
df["c"] = [0, 1, 0, 0, 1]

expected_output = pd.DataFrame()
expected_output["b"] = [0, 0, 0, 0]
expected_output["c"] = [0, 1, 0, 1]

tfm = TSDropIfTrueCols("a")
output = tfm.fit_transform(df)
test_eq(output, expected_output),

(None,)

source

TSApplyFunction

 TSApplyFunction (function, groups=None, group_keys=False, axis=1,
                  columns=None, reset_index=False, drop=True)

Base class for all estimators in scikit-learn.

Inheriting from this class provides default implementations of:

setting and getting parameters used by GridSearchCV and friends;
textual and HTML representation displayed in terminals and IDEs;
estimator serialization;
parameters validation;
data validation;
feature names validation.

Read more in the :ref:User Guide <rolling_your_own_estimator>.

df = pd.DataFrame()
df["a"] = [0, 0, 1, 0, 0]
df["b"] = [0, 0, 0, 0, 0]
df["c"] = [0, 1, 0, 0, 1]

df.apply(lambda x: 1, )

a    1
b    1
c    1
dtype: int64

# test ApplyFunction without groups
df = pd.DataFrame()
df["a"] = [0, 0, 1, 0, 0]
df["b"] = [0, 0, 0, 0, 0]
df["c"] = [0, 1, 0, 0, 1]

expected_output = pd.Series([1,1,1])

tfm = TSApplyFunction(lambda x: 1, axis=0, reset_index=True)
output = tfm.fit_transform(df)
test_eq(output, expected_output)

# test ApplyFunction with groups and square function
df = pd.DataFrame()
df["a"] = [0, 1, 2, 3, 4]
df["id"] = [0, 0, 0, 1, 1]

expected_output = pd.Series([5, 25])

tfm = TSApplyFunction(lambda x: (x["a"]**2).sum(), groups="id")

output = tfm.fit_transform(df)
test_eq(output, expected_output)

source

TSMissingnessEncoder

 TSMissingnessEncoder (columns=None)

Base class for all estimators in scikit-learn.

Inheriting from this class provides default implementations of:

setting and getting parameters used by GridSearchCV and friends;
textual and HTML representation displayed in terminals and IDEs;
estimator serialization;
parameters validation;
data validation;
feature names validation.

Read more in the :ref:User Guide <rolling_your_own_estimator>.

data = np.random.rand(10,3)
data[data > .8] = np.nan
df = pd.DataFrame(data, columns=["a", "b", "c"])
tfm = TSMissingnessEncoder()
tfm.fit(df)
joblib.dump(tfm, "data/TSMissingnessEncoder.joblib")
tfm = joblib.load("data/TSMissingnessEncoder.joblib")
df = tfm.transform(df)
df

	a	b	c	a_missing	b_missing	c_missing
0	NaN	NaN	NaN	1	1	1
1	0.511342	0.501516	0.798295	0	0	0
2	0.649964	0.701967	0.795793	0	0	0
3	NaN	0.337995	0.375583	1	0	0
4	0.093982	0.578280	0.035942	0	0	0
5	0.465598	0.542645	0.286541	0	0	0
6	0.590833	0.030500	0.037348	0	0	0
7	NaN	0.360191	0.127061	1	0	0
8	0.522243	0.769994	0.215821	0	0	0
9	0.622890	0.085347	0.051682	0	0	0

source

TSSortByColumns

 TSSortByColumns (columns, ascending=True, inplace=True, kind='stable',
                  na_position='last', ignore_index=False, key=None)

Transforms a dataframe by sorting by columns.

	Type	Default	Details
columns			Columns to sort by
ascending	bool	True	Ascending or descending
inplace	bool	True	Perform operation in place
kind	str	stable	Type of sort to use
na_position	str	last	Where to place NaNs
ignore_index	bool	False	Do not preserve index
key	NoneType	None	Function to apply to values before sorting

# Test
df = pd.DataFrame(np.random.rand(10,3), columns=["a", "b", "c"])
df_ori = df.copy()
tfm = TSSortByColumns(["a", "b"])
df = tfm.fit_transform(df)
test_eq(df_ori.sort_values(["a", "b"]).values, df.values)

source

TSSelectColumns

 TSSelectColumns (columns)

Transform used to select columns

	Details
columns	str or List[str]. Selected columns.

# Test
df = pd.DataFrame(np.random.rand(10,3), columns=["a", "b", "c"])
df_ori = df.copy()
tfm = TSSelectColumns(["a", "b"])
df = tfm.fit_transform(df)
test_eq(df_ori[["a", "b"]].values, df.values)
df = tfm.inverse_transform(df)

source

TSStepsSinceStart

 TSStepsSinceStart (datetime_col, datetime_unit='D', start_datetime=None,
                    drop=False, dtype=None)

Add a column indicating the number of steps since the start in each row

# Test
df = pd.DataFrame(np.random.rand(10,3), columns=["a", "b", "c"])
df["datetime"] = pd.date_range("2020-01-01", periods=10)
display(df)
df_ori = df.copy()
tfm = TSStepsSinceStart("datetime", datetime_unit="D", drop=True, dtype=np.int32)
df = tfm.fit_transform(df)
display(df)
test_eq(df["days_since_start"].values, np.arange(10))
df = tfm.inverse_transform(df)
test_eq(df_ori.values, df.values)

	a	b	c	datetime
0	0.643288	0.458253	0.545617	2020-01-01
1	0.941465	0.386103	0.961191	2020-01-02
2	0.905351	0.195791	0.069361	2020-01-03
3	0.100778	0.018222	0.094443	2020-01-04
4	0.683007	0.071189	0.318976	2020-01-05
5	0.844875	0.023272	0.814468	2020-01-06
6	0.281855	0.118165	0.696737	2020-01-07
7	0.628943	0.877472	0.735071	2020-01-08
8	0.803481	0.282035	0.177440	2020-01-09
9	0.750615	0.806835	0.990505	2020-01-10

	a	b	c	days_since_start
0	0.643288	0.458253	0.545617	0
1	0.941465	0.386103	0.961191	1
2	0.905351	0.195791	0.069361	2
3	0.100778	0.018222	0.094443	3
4	0.683007	0.071189	0.318976	4
5	0.844875	0.023272	0.814468	5
6	0.281855	0.118165	0.696737	6
7	0.628943	0.877472	0.735071	7
8	0.803481	0.282035	0.177440	8
9	0.750615	0.806835	0.990505	9

source

TSStandardScaler

 TSStandardScaler (columns=None, mean=None, std=None, eps=1e-06)

Scale the values of specified columns in the input DataFrame to have a mean of 0 and standard deviation of 1.

	Type	Default	Details
columns	NoneType	None	Column name(s) to be transformed. If None, all columns are transformed. Defaults to None.
mean	NoneType	None	Mean value for each column. If None, the mean value of each column is calculated during the fit method. Defaults to None.
std	NoneType	None	Stdev value for each column. If None, the standard deviation value of each column is calculated during the fit method. Defaults to None.
eps	float	1e-06	A small value to avoid division by zero. Defaults to 1e-6.

# Test
df = pd.DataFrame(np.random.rand(100,3), columns=["a", "b", "c"])
tfm = TSStandardScaler()
df = tfm.fit_transform(df)
test_close(df.mean().values, np.zeros(3), 1e-3)
test_close(df.std().values, np.ones(3), 1e-3)

# Test
df = pd.DataFrame(np.random.rand(1000,3), columns=["a", "b", "c"])
tfm = TSStandardScaler()
df = tfm.fit_transform(df, idxs=slice(0, 800))
test_close(df.mean().values, np.zeros(3), 1e-1)
test_close(df.std().values, np.ones(3), 1e-1)

source

TSRobustScaler

 TSRobustScaler (columns=None, quantile_range=(25.0, 75.0), eps=1e-06)

This Scaler removes the median and scales the data according to the quantile range (defaults to IQR: Interquartile Range)

# test RobustScaler
df = pd.DataFrame(np.random.rand(100,3), columns=["a", "b", "c"])
df["a"] = df["a"] * 100
df["b"] = df["b"] * 10
tfm = TSRobustScaler()
df = tfm.fit_transform(df)
test_close(df.median().values, np.zeros(3), 1e-3)

source

TSAddMissingTimestamps

 TSAddMissingTimestamps (datetime_col=None, use_index=False,
                         unique_id_cols=None, fill_value=nan,
                         range_by_group=True, start_date=None,
                         end_date=None, freq=None)

Mixin class for all transformers in scikit-learn.

:class:OneToOneFeatureMixin and :class:ClassNamePrefixFeaturesOutMixin are helpful mixins for defining :term:get_feature_names_out.

# Test
df = pd.DataFrame(np.random.rand(10,3), columns=["a", "b", "c"])
df["datetime"] = pd.date_range("2020-01-01", periods=10)
df = df.iloc[[0, 2, 3, 5, 6, 8, 9]]
display(df)
tfm = TSAddMissingTimestamps(datetime_col="datetime", freq="D")
df = tfm.fit_transform(df)
display(df)
test_eq(df.shape[0], 10)

	a	b	c	datetime
0	0.211126	0.752468	0.051294	2020-01-01
2	0.394572	0.529941	0.161367	2020-01-03
3	0.571996	0.805432	0.760161	2020-01-04
5	0.361075	0.408456	0.679697	2020-01-06
6	0.056680	0.034673	0.391911	2020-01-07
8	0.259828	0.886086	0.895690	2020-01-09
9	0.297287	0.229994	0.411304	2020-01-10

	datetime	a	b	c
0	2020-01-01	0.211126	0.752468	0.051294
1	2020-01-02	NaN	NaN	NaN
2	2020-01-03	0.394572	0.529941	0.161367
3	2020-01-04	0.571996	0.805432	0.760161
4	2020-01-05	NaN	NaN	NaN
5	2020-01-06	0.361075	0.408456	0.679697
6	2020-01-07	0.056680	0.034673	0.391911
7	2020-01-08	NaN	NaN	NaN
8	2020-01-09	0.259828	0.886086	0.895690
9	2020-01-10	0.297287	0.229994	0.411304

# Test
# Filling dates between min and max dates for each value in groupby column
dates = pd.date_range('2021-05-01', '2021-05-07').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 4))
data[:, 0] = dates
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))
data[:, 3] = np.random.rand(len(dates))
cols = ['date', 'id', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([0,1,3,8,11,13]).reset_index(drop=True)
display(date_df_with_missing_dates)
tfm = TSAddMissingTimestamps(datetime_col="date", unique_id_cols="id", freq="D")
df = tfm.fit_transform(date_df_with_missing_dates.copy())
display(df)

	date	id	feature1	feature2
0	2021-05-03	0	0.826065	0.793818
1	2021-05-05	0	0.824350	0.577807
2	2021-05-06	0	0.396992	0.866102
3	2021-05-07	0	0.156317	0.289440
4	2021-05-01	1	0.737951	0.467681
5	2021-05-03	1	0.671271	0.411190
6	2021-05-04	1	0.270644	0.427486
7	2021-05-06	1	0.992582	0.564232

	date	id	feature1	feature2
0	2021-05-03	0	0.826065	0.793818
1	2021-05-04	0	NaN	NaN
2	2021-05-05	0	0.824350	0.577807
3	2021-05-06	0	0.396992	0.866102
4	2021-05-07	0	0.156317	0.289440
5	2021-05-01	1	0.737951	0.467681
6	2021-05-02	1	NaN	NaN
7	2021-05-03	1	0.671271	0.411190
8	2021-05-04	1	0.270644	0.427486
9	2021-05-05	1	NaN	NaN
10	2021-05-06	1	0.992582	0.564232

# Test
display(date_df_with_missing_dates)
tfm = TSAddMissingTimestamps(datetime_col="date", unique_id_cols="id", freq="D", range_by_group=False)
df = tfm.fit_transform(date_df_with_missing_dates.copy())
display(df)

	date	id	feature1	feature2
0	2021-05-03	0	0.826065	0.793818
1	2021-05-05	0	0.824350	0.577807
2	2021-05-06	0	0.396992	0.866102
3	2021-05-07	0	0.156317	0.289440
4	2021-05-01	1	0.737951	0.467681
5	2021-05-03	1	0.671271	0.411190
6	2021-05-04	1	0.270644	0.427486
7	2021-05-06	1	0.992582	0.564232

	date	id	feature1	feature2
0	2021-05-01	0	NaN	NaN
1	2021-05-02	0	NaN	NaN
2	2021-05-03	0	0.826065	0.793818
3	2021-05-04	0	NaN	NaN
4	2021-05-05	0	0.824350	0.577807
5	2021-05-06	0	0.396992	0.866102
6	2021-05-07	0	0.156317	0.289440
7	2021-05-01	1	0.737951	0.467681
8	2021-05-02	1	NaN	NaN
9	2021-05-03	1	0.671271	0.411190
10	2021-05-04	1	0.270644	0.427486
11	2021-05-05	1	NaN	NaN
12	2021-05-06	1	0.992582	0.564232
13	2021-05-07	1	NaN	NaN

source

TSDropDuplicates

 TSDropDuplicates (datetime_col=None, use_index=False,
                   unique_id_cols=None, keep='last', reset_index=False)

Drop rows with duplicated values in a set of columns, optionally including a datetime column or index

	Type	Default	Details
datetime_col	NoneType	None	(str or List[str], optional): Name(s) of column(s) containing datetime values. If None, the index is used if use_index=True.
use_index	bool	False	(bool, optional): Whether to include the index in the set of columns for checking duplicates. Defaults to False.
unique_id_cols	NoneType	None	(str or List[str], optional): Name(s) of column(s) to be included in the set of columns for checking duplicates. Defaults to None.
keep	str	last	(str, optional): Which duplicated values to keep. Choose from {‘first’, ‘last’, False}. Defaults to ‘last’.
reset_index	bool	False	(bool, optional): Whether to reset the index after dropping duplicates. Ignored if use_index=False. Defaults to False.

# Test
df = pd.DataFrame(np.random.rand(10,3), columns=["a", "b", "c"])
df["datetime"] = pd.date_range("2020-01-01", periods=10)
df['user_id'] = np.sort(np.random.randint(0, 2, 10))
df = df.iloc[[0, 2, 2, 3, 5, 6, 6, 8, 9]]
df.reset_index(drop=True, inplace=True)
display(df)
tfm = TSDropDuplicates(datetime_col="datetime", unique_id_cols="a")
df = tfm.fit_transform(df)
display(df)

	a	b	c	datetime	user_id
0	0.201528	0.934433	0.689088	2020-01-01	0
1	0.016200	0.818380	0.040139	2020-01-03	0
2	0.016200	0.818380	0.040139	2020-01-03	0
3	0.889913	0.991963	0.294067	2020-01-04	0
4	0.865562	0.102843	0.125955	2020-01-06	1
5	0.979152	0.673839	0.846887	2020-01-07	1
6	0.979152	0.673839	0.846887	2020-01-07	1
7	0.603150	0.682532	0.575359	2020-01-09	1
8	0.429062	0.275923	0.768581	2020-01-10	1

	a	b	c	datetime	user_id
0	0.201528	0.934433	0.689088	2020-01-01	0
2	0.016200	0.818380	0.040139	2020-01-03	0
3	0.889913	0.991963	0.294067	2020-01-04	0
4	0.865562	0.102843	0.125955	2020-01-06	1
6	0.979152	0.673839	0.846887	2020-01-07	1
7	0.603150	0.682532	0.575359	2020-01-09	1
8	0.429062	0.275923	0.768581	2020-01-10	1

source

TSFillMissing

 TSFillMissing (columns=None, unique_id_cols=None, method='ffill',
                value=0)

Fill missing values in specified columns using the specified method and/ or value.

	Type	Default	Details
columns	NoneType	None	(str or List[str], optional): Column name(s) to be transformed. If None, all columns are transformed. Defaults to None.
unique_id_cols	NoneType	None	(str or List[str], optional): Col name(s) with unique ids for each row. If None, uses all rows at once. Defaults to None .
method	str	ffill	(str, optional): The method to use for filling missing values, e.g. ‘ffill’, ‘bfill’. If None, `value` is used. Defaults to None.
value	int	0	(scalar or dict or Series, optional): The value to use for filling missing values. If None, `method` is used. Defaults to None.

# Test
df = pd.DataFrame(np.random.rand(20,3), columns=["a", "b", "c"])
df.loc[np.random.rand(20) > .5, 'a'] = np.nan
df["datetime"] = pd.date_range("2020-01-01", periods=20)
df['user_id'] = np.sort(np.random.randint(0, 2, 20))
df = df.iloc[[0, 2, 2, 3, 5, 6, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]
df.reset_index(drop=True, inplace=True)
display(df)
tfm = TSFillMissing(columns="a", method="ffill", value=0)
df = tfm.fit_transform(df)
display(df)
test_eq(df['a'].isna().sum(), 0)

	a	b	c	datetime	user_id
0	NaN	0.059943	0.130974	2020-01-01	0
1	0.734151	0.341319	0.478528	2020-01-03	0
2	0.734151	0.341319	0.478528	2020-01-03	0
3	0.928860	0.331972	0.465337	2020-01-04	0
4	NaN	0.631375	0.426398	2020-01-06	0
5	0.548145	0.174647	0.295932	2020-01-07	0
6	0.548145	0.174647	0.295932	2020-01-07	0
7	NaN	0.576881	0.563920	2020-01-09	0
8	0.500279	0.069394	0.089877	2020-01-10	0
9	0.600912	0.340959	0.917268	2020-01-11	0
10	0.406591	0.143281	0.714719	2020-01-12	0
11	NaN	0.525470	0.697833	2020-01-13	1
12	NaN	0.792191	0.676361	2020-01-14	1
13	NaN	0.945925	0.295824	2020-01-15	1
14	NaN	0.271955	0.217891	2020-01-16	1
15	NaN	0.633712	0.593461	2020-01-17	1
16	0.016243	0.728778	0.323530	2020-01-18	1
17	NaN	0.556578	0.342731	2020-01-19	1
18	0.134576	0.094419	0.831518	2020-01-20	1

	a	b	c	datetime	user_id
0	0.000000	0.059943	0.130974	2020-01-01	0
1	0.734151	0.341319	0.478528	2020-01-03	0
2	0.734151	0.341319	0.478528	2020-01-03	0
3	0.928860	0.331972	0.465337	2020-01-04	0
4	0.928860	0.631375	0.426398	2020-01-06	0
5	0.548145	0.174647	0.295932	2020-01-07	0
6	0.548145	0.174647	0.295932	2020-01-07	0
7	0.548145	0.576881	0.563920	2020-01-09	0
8	0.500279	0.069394	0.089877	2020-01-10	0
9	0.600912	0.340959	0.917268	2020-01-11	0
10	0.406591	0.143281	0.714719	2020-01-12	0
11	0.406591	0.525470	0.697833	2020-01-13	1
12	0.406591	0.792191	0.676361	2020-01-14	1
13	0.406591	0.945925	0.295824	2020-01-15	1
14	0.406591	0.271955	0.217891	2020-01-16	1
15	0.406591	0.633712	0.593461	2020-01-17	1
16	0.016243	0.728778	0.323530	2020-01-18	1
17	0.016243	0.556578	0.342731	2020-01-19	1
18	0.134576	0.094419	0.831518	2020-01-20	1

source

TSMissingnessEncoder

 TSMissingnessEncoder (columns=None)

Base class for all estimators in scikit-learn.

Inheriting from this class provides default implementations of:

setting and getting parameters used by GridSearchCV and friends;
textual and HTML representation displayed in terminals and IDEs;
estimator serialization;
parameters validation;
data validation;
feature names validation.

Read more in the :ref:User Guide <rolling_your_own_estimator>.

# Test
df = pd.DataFrame(np.random.rand(20,3), columns=["a", "b", "c"])
df.loc[np.random.rand(20) > .5, 'a'] = np.nan
df["datetime"] = pd.date_range("2020-01-01", periods=20)
df['user_id'] = np.sort(np.random.randint(0, 2, 20))
df = df.iloc[[0, 2, 2, 3, 5, 6, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]
df.reset_index(drop=True, inplace=True)
display(df)
tfm = TSMissingnessEncoder(columns="a")
df = tfm.fit_transform(df)
display(df)

	a	b	c	datetime	user_id
0	0.873619	0.995569	0.582714	2020-01-01	0
1	0.402704	0.672507	0.682192	2020-01-03	0
2	0.402704	0.672507	0.682192	2020-01-03	0
3	NaN	0.133210	0.632396	2020-01-04	0
4	0.700611	0.753472	0.872859	2020-01-06	0
5	NaN	0.730249	0.619173	2020-01-07	0
6	NaN	0.730249	0.619173	2020-01-07	0
7	NaN	0.617106	0.849959	2020-01-09	0
8	0.196246	0.125550	0.963480	2020-01-10	1
9	0.108045	0.478491	0.585564	2020-01-11	1
10	NaN	0.086032	0.057027	2020-01-12	1
11	0.105483	0.585588	0.544345	2020-01-13	1
12	0.233741	0.637774	0.820068	2020-01-14	1
13	NaN	0.498130	0.689310	2020-01-15	1
14	NaN	0.307771	0.613638	2020-01-16	1
15	0.897935	0.809924	0.583130	2020-01-17	1
16	0.730222	0.364822	0.640966	2020-01-18	1
17	0.466182	0.189936	0.701738	2020-01-19	1
18	NaN	0.358622	0.911339	2020-01-20	1

	a	b	c	datetime	user_id	a_missing
0	0.873619	0.995569	0.582714	2020-01-01	0	0
1	0.402704	0.672507	0.682192	2020-01-03	0	0
2	0.402704	0.672507	0.682192	2020-01-03	0	0
3	NaN	0.133210	0.632396	2020-01-04	0	1
4	0.700611	0.753472	0.872859	2020-01-06	0	0
5	NaN	0.730249	0.619173	2020-01-07	0	1
6	NaN	0.730249	0.619173	2020-01-07	0	1
7	NaN	0.617106	0.849959	2020-01-09	0	1
8	0.196246	0.125550	0.963480	2020-01-10	1	0
9	0.108045	0.478491	0.585564	2020-01-11	1	0
10	NaN	0.086032	0.057027	2020-01-12	1	1
11	0.105483	0.585588	0.544345	2020-01-13	1	0
12	0.233741	0.637774	0.820068	2020-01-14	1	0
13	NaN	0.498130	0.689310	2020-01-15	1	1
14	NaN	0.307771	0.613638	2020-01-16	1	1
15	0.897935	0.809924	0.583130	2020-01-17	1	0
16	0.730222	0.364822	0.640966	2020-01-18	1	0
17	0.466182	0.189936	0.701738	2020-01-19	1	0
18	NaN	0.358622	0.911339	2020-01-20	1	1

With these sklearn preprocessing API transforms it’s possible to build data preprocessing pipelines like this one:

from sklearn.pipeline import Pipeline

cont_cols = ['cont_0', 'cont_1', 'cont_2', 'cont_3', 'cont_4', 'cont_5']
pipe = Pipeline([
    ('shrinker', TSShrinkDataFrame()), 
    ('drop_duplicates', TSDropDuplicates('date', unique_id_cols='user_id')),
    ('add_mts', TSAddMissingTimestamps(datetime_col='date', unique_id_cols='user_id', freq='D', range_by_group=False)),
    ('onehot_encoder', TSOneHotEncoder(['cat_0'])),
    ('cat_encoder', TSCategoricalEncoder(['user_id', 'cat_1'])),
    ('steps_since_start', TSStepsSinceStart('date', datetime_unit='D', start_datetime='2017-01-01'), dtype=np.int32),
    ('missing_encoder', TSMissingnessEncoder(['cont_1'])),
    ('fill_missing', TSFillMissing(cont_cols, unique_id_cols='user_id', value=0)),
    ], 
    verbose=True)
df = pipe.fit_transform(df)

y transforms

source

Preprocessor

 Preprocessor (preprocessor, **kwargs)

Initialize self. See help(type(self)) for accurate signature.

# Standardize
from tsai.data.validation import TimeSplitter

y = random_shuffle(np.random.randn(1000) * 10 + 5)
splits = TimeSplitter()(y)
preprocessor = Preprocessor(StandardScaler)
preprocessor.fit(y[splits[0]])
y_tfm = preprocessor.transform(y)
test_close(preprocessor.inverse_transform(y_tfm), y)
plt.hist(y, 50, label='ori',)
plt.hist(y_tfm, 50, label='tfm')
plt.legend(loc='best')
plt.show()

# RobustScaler
y = random_shuffle(np.random.randn(1000) * 10 + 5)
splits = TimeSplitter()(y)
preprocessor = Preprocessor(RobustScaler)
preprocessor.fit(y[splits[0]])
y_tfm = preprocessor.transform(y)
test_close(preprocessor.inverse_transform(y_tfm), y)
plt.hist(y, 50, label='ori',)
plt.hist(y_tfm, 50, label='tfm')
plt.legend(loc='best')
plt.show()

# Normalize
y = random_shuffle(np.random.rand(1000) * 3 + .5)
splits = TimeSplitter()(y)
preprocessor = Preprocessor(Normalizer)
preprocessor.fit(y[splits[0]])
y_tfm = preprocessor.transform(y)
test_close(preprocessor.inverse_transform(y_tfm), y)
plt.hist(y, 50, label='ori',)
plt.hist(y_tfm, 50, label='tfm')
plt.legend(loc='best')
plt.show()

# BoxCox
y = random_shuffle(np.random.rand(1000) * 10 + 5)
splits = TimeSplitter()(y)
preprocessor = Preprocessor(BoxCox)
preprocessor.fit(y[splits[0]])
y_tfm = preprocessor.transform(y)
test_close(preprocessor.inverse_transform(y_tfm), y)
plt.hist(y, 50, label='ori',)
plt.hist(y_tfm, 50, label='tfm')
plt.legend(loc='best')
plt.show()

# YeoJohnshon
y = random_shuffle(np.random.randn(1000) * 10 + 5)
y = np.random.beta(.5, .5, size=1000)
splits = TimeSplitter()(y)
preprocessor = Preprocessor(YeoJohnshon)
preprocessor.fit(y[splits[0]])
y_tfm = preprocessor.transform(y)
test_close(preprocessor.inverse_transform(y_tfm), y)
plt.hist(y, 50, label='ori',)
plt.hist(y_tfm, 50, label='tfm')
plt.legend(loc='best')
plt.show()

# QuantileTransformer
y = - np.random.beta(1, .5, 10000) * 10
splits = TimeSplitter()(y)
preprocessor = Preprocessor(Quantile)
preprocessor.fit(y[splits[0]])
plt.hist(y, 50, label='ori',)
y_tfm = preprocessor.transform(y)
plt.legend(loc='best')
plt.show()
plt.hist(y_tfm, 50, label='tfm')
plt.legend(loc='best')
plt.show()
test_close(preprocessor.inverse_transform(y_tfm), y, 1e-1)

source

ReLabeler

 ReLabeler (cm)

Changes the labels in a dataset based on a dictionary (class mapping) Args: cm = class mapping dictionary

vals = {0:'a', 1:'b', 2:'c', 3:'d', 4:'e'}
y = np.array([vals[i] for i in np.random.randint(0, 5, 20)])
labeler = ReLabeler(dict(a='x', b='x', c='y', d='z', e='z'))
y_new = labeler(y)
test_eq(y.shape, y_new.shape)
y, y_new

(array(['d', 'd', 'a', 'd', 'b', 'e', 'a', 'd', 'b', 'c', 'b', 'e', 'b',
        'b', 'a', 'e', 'd', 'e', 'c', 'e'], dtype='<U1'),
 array(['z', 'z', 'x', 'z', 'x', 'z', 'x', 'z', 'x', 'y', 'x', 'z', 'x',
        'x', 'x', 'z', 'z', 'z', 'y', 'z'], dtype='<U1'))