from tsai.data.external import get_UCR_data
Data preprocessing
Functions used to preprocess time series (both X and y).
dsid = 'NATOPS'
X, y, splits = get_UCR_data(dsid, return_split=False)
tfms = [None, Categorize()]
dsets = TSDatasets(X, y, tfms=tfms, splits=splits)
ToNumpyCategory
ToNumpyCategory (**kwargs)
Categorize a numpy batch
t = ToNumpyCategory()
y_cat = t(y)
y_cat[:10]
array([3, 2, 2, 3, 2, 4, 0, 5, 2, 1])
test_eq(t.decode(tensor(y_cat)), y)
test_eq(t.decode(np.array(y_cat)), y)
OneHot
OneHot (n_classes=None, **kwargs)
One-hot encode/decode a batch
oh_encoder = OneHot()
y_cat = ToNumpyCategory()(y)
oht = oh_encoder(y_cat)
oht[:10]
array([[0., 0., 0., 1., 0., 0.],
[0., 0., 1., 0., 0., 0.],
[0., 0., 1., 0., 0., 0.],
[0., 0., 0., 1., 0., 0.],
[0., 0., 1., 0., 0., 0.],
[0., 0., 0., 0., 1., 0.],
[1., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 1.],
[0., 0., 1., 0., 0., 0.],
[0., 1., 0., 0., 0., 0.]])
n_classes = 10
n_samples = 100

t = torch.randint(0, n_classes, (n_samples,))
oh_encoder = OneHot()
oht = oh_encoder(t)

test_eq(oht.shape, (n_samples, n_classes))
test_eq(torch.argmax(oht, dim=-1), t)
test_eq(oh_encoder.decode(oht), t)
n_classes = 10
n_samples = 100

a = np.random.randint(0, n_classes, (n_samples,))
oh_encoder = OneHot()
oha = oh_encoder(a)

test_eq(oha.shape, (n_samples, n_classes))
test_eq(np.argmax(oha, axis=-1), a)
test_eq(oh_encoder.decode(oha), a)
TSNan2Value
TSNan2Value (value=0, median=False, by_sample_and_var=True, sel_vars=None)
Replaces any nan values by a predefined value or median
o = TSTensor(torch.randn(16, 10, 100))
o[0,0] = float('nan')
o[o > .9] = float('nan')
o[[0,1,5,8,14,15], :, -20:] = float('nan')
nan_vals1 = torch.isnan(o).sum()
o2 = Pipeline(TSNan2Value(), split_idx=0)(o.clone())
o3 = Pipeline(TSNan2Value(median=True, by_sample_and_var=True), split_idx=0)(o.clone())
o4 = Pipeline(TSNan2Value(median=True, by_sample_and_var=False), split_idx=0)(o.clone())
nan_vals2 = torch.isnan(o2).sum()
nan_vals3 = torch.isnan(o3).sum()
nan_vals4 = torch.isnan(o4).sum()
test_ne(nan_vals1, 0)
test_eq(nan_vals2, 0)
test_eq(nan_vals3, 0)
test_eq(nan_vals4, 0)
o = TSTensor(torch.randn(16, 10, 100))
o[o > .9] = float('nan')
o = TSNan2Value(median=True, sel_vars=[0,1,2,3,4])(o)
test_eq(torch.isnan(o[:, [0,1,2,3,4]]).sum().item(), 0)
TSStandardize
TSStandardize (mean=None, std=None, by_sample=False, by_var=False, by_step=False, exc_vars=None, eps=1e-08, use_single_batch=True, verbose=False, **kwargs)
*Standardizes batch of type TSTensor
Args:
- mean: you can pass a precalculated mean value as a torch tensor which is the one that will be used, or leave as None, in which case it will be estimated using a batch.
- std: you can pass a precalculated std value as a torch tensor which is the one that will be used, or leave as None, in which case it will be estimated using a batch. If both mean and std values are passed when instantiating TSStandardize, the rest of arguments won’t be used.
- by_sample: if True, it will calculate mean and std for each individual sample. Otherwise based on the entire batch.
- by_var:
  * False: mean and std will be the same for all variables.
  * True: a mean and std will be different for each variable.
  * a list of ints: (like [0,1,3]) a different mean and std will be set for each variable on the list. Variables not included in the list won’t be standardized.
  * a list that contains a list/lists: (like [0, [1,3]]) a different mean and std will be set for each element of the list. If multiple elements are included in a list, the same mean and std will be set for those variables in the sublist/s. (in the example a mean and std is determined for variable 0, and another one for variables 1 & 3 - the same one). Variables not included in the list won’t be standardized.
- by_step: if True, it will standardize values for each time step.
- exc_vars: list of variables that won’t be standardized.
- eps: it avoids dividing by 0
- use_single_batch: if True a single training batch will be used to calculate mean & std. Else the entire training set will be used.*
batch_tfms=[TSStandardize(by_sample=True, by_var=False, verbose=True)]
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, bs=128, num_workers=0, batch_tfms=batch_tfms)
xb, yb = next(iter(dls.train))
test_close(xb.mean(), 0, eps=1e-1)
test_close(xb.std(), 1, eps=1e-1)
exc_vars = [0, 2, 6, 8, 12]
batch_tfms=[TSStandardize(by_var=True, exc_vars=exc_vars)]
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, bs=128, num_workers=0, batch_tfms=batch_tfms)
xb, yb = next(iter(dls.train))
test_eq(len(dls.train.after_batch.fs[0].mean.flatten()), 24)
test_eq(len(dls.train.after_batch.fs[0].std.flatten()), 24)
test_eq(dls.train.after_batch.fs[0].mean.flatten()[exc_vars].cpu(), torch.zeros(len(exc_vars)))
test_eq(dls.train.after_batch.fs[0].std.flatten()[exc_vars].cpu(), torch.ones(len(exc_vars)))
print(dls.train.after_batch.fs[0].mean.flatten().data)
print(dls.train.after_batch.fs[0].std.flatten().data)
tensor([ 0.0000, -1.3398, 0.0000, 0.9952, -0.8438, -0.4308, 0.0000, -0.6077,
0.0000, 0.7781, -0.4869, -0.0969, 0.0000, -1.0620, -0.6171, 0.9253,
-0.7023, -0.3077, -0.5600, -1.1922, -0.7503, 0.9491, -0.7744, -0.4356])
tensor([1.0000, 0.8743, 1.0000, 0.7510, 1.1557, 0.5370, 1.0000, 0.2666, 1.0000,
0.2380, 0.4047, 0.3274, 1.0000, 0.6371, 0.2798, 0.5287, 0.8642, 0.4297,
0.5842, 0.7581, 0.3162, 0.6739, 1.0118, 0.4958])
from tsai.data.validation import TimeSplitter
X_nan = np.random.rand(100, 5, 10)
idxs = random_choice(len(X_nan), int(len(X_nan)*.5), False)
X_nan[idxs, 0] = float('nan')
idxs = random_choice(len(X_nan), int(len(X_nan)*.5), False)
X_nan[idxs, 1, -10:] = float('nan')
batch_tfms = TSStandardize(by_var=True)
dls = get_ts_dls(X_nan, batch_tfms=batch_tfms, splits=TimeSplitter(show_plot=False)(range_of(X_nan)))
test_eq(torch.isnan(dls.after_batch[0].mean).sum(), 0)
test_eq(torch.isnan(dls.after_batch[0].std).sum(), 0)
xb = first(dls.train)[0]
test_ne(torch.isnan(xb).sum(), 0)
test_ne(torch.isnan(xb).sum(), torch.isnan(xb).numel())
batch_tfms = [TSStandardize(by_var=True), Nan2Value()]
dls = get_ts_dls(X_nan, batch_tfms=batch_tfms, splits=TimeSplitter(show_plot=False)(range_of(X_nan)))
xb = first(dls.train)[0]
test_eq(torch.isnan(xb).sum(), 0)
batch_tfms=[TSStandardize(by_sample=True, by_var=False, verbose=False)]
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, bs=128, num_workers=0, after_batch=batch_tfms)
xb, yb = next(iter(dls.train))
test_close(xb.mean(), 0, eps=1e-1)
test_close(xb.std(), 1, eps=1e-1)
xb, yb = next(iter(dls.valid))
test_close(xb.mean(), 0, eps=1e-1)
test_close(xb.std(), 1, eps=1e-1)
tfms = [None, TSClassification()]
batch_tfms = TSStandardize(by_sample=True)
dls = get_ts_dls(X, y, splits=splits, tfms=tfms, batch_tfms=batch_tfms, bs=[64, 128], inplace=True)
xb, yb = dls.train.one_batch()
test_close(xb.mean(), 0, eps=1e-1)
test_close(xb.std(), 1, eps=1e-1)
xb, yb = dls.valid.one_batch()
test_close(xb.mean(), 0, eps=1e-1)
test_close(xb.std(), 1, eps=1e-1)
tfms = [None, TSClassification()]
batch_tfms = TSStandardize(by_sample=True, by_var=False, verbose=False)
dls = get_ts_dls(X, y, splits=splits, tfms=tfms, batch_tfms=batch_tfms, bs=[64, 128], inplace=False)
xb, yb = dls.train.one_batch()
test_close(xb.mean(), 0, eps=1e-1)
test_close(xb.std(), 1, eps=1e-1)
xb, yb = dls.valid.one_batch()
test_close(xb.mean(), 0, eps=1e-1)
test_close(xb.std(), 1, eps=1e-1)
TSNormalize
TSNormalize (min=None, max=None, range=(-1, 1), by_sample=False, by_var=False, by_step=False, clip_values=True, use_single_batch=True, verbose=False, **kwargs)
Normalizes batch of type TSTensor
mul_max’]
*Built-in mutable sequence.
If no argument is given, the constructor creates a new empty list. The argument must be an iterable if specified.*
mul_min’]
*Built-in mutable sequence.
If no argument is given, the constructor creates a new empty list. The argument must be an iterable if specified.*
batch_tfms = [TSNormalize()]
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, bs=128, num_workers=0, after_batch=batch_tfms)
xb, yb = next(iter(dls.train))
assert xb.max() <= 1
assert xb.min() >= -1
batch_tfms=[TSNormalize(by_sample=True, by_var=False, verbose=False)]
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, bs=128, num_workers=0, after_batch=batch_tfms)
xb, yb = next(iter(dls.train))
assert xb.max() <= 1
assert xb.min() >= -1
batch_tfms = [TSNormalize(by_var=[0, [1, 2]], use_single_batch=False, clip_values=False, verbose=False)]
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, bs=128, num_workers=0, after_batch=batch_tfms)
xb, yb = next(iter(dls.train))
assert xb[:, [0, 1, 2]].max() <= 1
assert xb[:, [0, 1, 2]].min() >= -1
TSStandardizeTuple
TSStandardizeTuple (x_mean, x_std, y_mean=None, y_std=None, eps=1e-05)
Standardizes X (and y if provided)
a, b = TSTensor([1., 2, 3]), TSTensor([4., 5, 6])
mean, std = a.mean(), b.std()
tuple_batch_tfm = TSStandardizeTuple(mean, std)
a_tfmd, b_tfmd = tuple_batch_tfm((a, b))

test_ne(a, a_tfmd)
test_ne(b, b_tfmd)
TSCatEncode
TSCatEncode (a, sel_var)
Encodes a variable based on a categorical array
# static input
a = np.random.randint(10, 20, 512)[:, None, None].repeat(10, 1).repeat(28, 2)
b = TSTensor(torch.randint(0, 30, (512,), device='cpu').unsqueeze(-1).unsqueeze(-1).repeat(1, 10, 28))
output = TSCatEncode(a, sel_var=0)(b)
test_eq(0 <= output[:, 0].min() <= len(np.unique(a)), True)
test_eq(0 <= output[:, 0].max() <= len(np.unique(a)), True)
test_eq(output[:, 0], output[:, 0, 0][:, None].repeat(1, 28))
output[:, 0].data
tensor([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[4, 4, 4, ..., 4, 4, 4],
[4, 4, 4, ..., 4, 4, 4],
[0, 0, 0, ..., 0, 0, 0]])
# non-static input
a = np.random.randint(10, 20, 512)[:, None, None].repeat(10, 1).repeat(28, 2)
b = TSTensor(torch.randint(0, 30, (512, 10, 28), device='cpu'))
output = TSCatEncode(a, sel_var=0)(b)
test_eq(0 <= output[:, 0].min() <= len(np.unique(a)), True)
test_eq(0 <= output[:, 0].max() <= len(np.unique(a)), True)
test_ne(output[:, 0], output[:, 0, 0][:, None].repeat(1, 28))
output[:, 0].data
tensor([[10, 0, 0, ..., 4, 0, 0],
[10, 0, 0, ..., 0, 0, 0],
[ 0, 2, 0, ..., 0, 10, 6],
...,
[ 1, 0, 9, ..., 0, 0, 0],
[ 0, 0, 5, ..., 0, 0, 0],
[ 0, 0, 0, ..., 0, 0, 5]])
TSDropFeatByKey
TSDropFeatByKey (key_var, p, sel_vars, sel_steps=None, **kwargs)
Randomly drops selected features at selected steps with a given probability per feature, step and key variable
Type | Default | Details | |
---|---|---|---|
key_var | int representing the variable that contains the key information | ||
p | array of shape (n_keys, n_features, n_steps) representing the probabilities of dropping a feature at a given step for a given key | ||
sel_vars | int or slice or list of ints or array of ints representing the variables to drop | ||
sel_steps | NoneType | None | int or slice or list of ints or array of ints representing the steps to drop |
kwargs |
n_devices = 4
key_var = 0

for sel_vars in [1, [1], [1,3,5], slice(3, 5)]:
    for sel_steps in [None, -1, 27, [27], [25, 26], slice(10, 20)]:
        o = TSTensor(torch.rand(512, 10, 28))
        o[:, key_var] = torch.randint(0, n_devices, (512, 28))
        n_vars = 1 if isinstance(sel_vars, Integral) else len(sel_vars) if isinstance(sel_vars, list) else sel_vars.stop - sel_vars.start
        n_steps = o.shape[-1] if sel_steps is None else 1 if isinstance(sel_steps, Integral) else \
            len(sel_steps) if isinstance(sel_steps, list) else sel_steps.stop - sel_steps.start
        p = torch.rand(n_devices, n_vars, n_steps) * .5 + .5
        output = TSDropFeatByKey(key_var, p, sel_vars, sel_steps)(o)
        assert torch.isnan(output).sum((0, 2))[sel_vars].sum() > 0
        assert torch.isnan(output).sum((0, 2))[~np.array(np.arange(o.shape[1])[sel_vars])].sum() == 0
TSClipOutliers
TSClipOutliers (min=None, max=None, by_sample=False, by_var=False, use_single_batch=False, verbose=False, **kwargs)
Clip outliers batch of type `TSTensor` based on the IQR
batch_tfms=[TSClipOutliers(-1, 1, verbose=True)]
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, bs=128, num_workers=0, after_batch=batch_tfms)
xb, yb = next(iter(dls.train))
assert xb.max() <= 1
assert xb.min() >= -1
test_close(xb.min(), -1, eps=1e-1)
test_close(xb.max(), 1, eps=1e-1)
xb, yb = next(iter(dls.valid))
test_close(xb.min(), -1, eps=1e-1)
test_close(xb.max(), 1, eps=1e-1)
TSClipOutliers min=-1, max=1
TSClip
TSClip (min=-6, max=6, **kwargs)
Clip batch of type TSTensor
t = TSTensor(torch.randn(10, 20, 100)*10)
test_le(TSClip()(t).max().item(), 6)
test_ge(TSClip()(t).min().item(), -6)
TSSelfMissingness
TSSelfMissingness (sel_vars=None, **kwargs)
Applies missingness from samples in a batch to random samples in the batch for selected variables
t = TSTensor(torch.randn(10, 20, 100))
t[t>.8] = np.nan
t2 = TSSelfMissingness()(t.clone())
t3 = TSSelfMissingness(sel_vars=[0,3,5,7])(t.clone())
assert (torch.isnan(t).sum() < torch.isnan(t2).sum()) and (torch.isnan(t2).sum() > torch.isnan(t3).sum())
TSRobustScale
TSRobustScale (median=None, iqr=None, quantile_range=(25.0, 75.0), use_single_batch=True, exc_vars=None, eps=1e-08, verbose=False, **kwargs)
This Scaler removes the median and scales the data according to the quantile range (defaults to IQR: Interquartile Range)
batch_tfms = TSRobustScale(verbose=True, use_single_batch=False)
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, batch_tfms=batch_tfms, num_workers=0)
xb, yb = next(iter(dls.train))
xb.min()
TSRobustScale median=torch.Size([1, 24, 1]) iqr=torch.Size([1, 24, 1])
TSTensor([-2.3502116203308105], device=cpu, dtype=torch.float32)
exc_vars = [0, 2, 6, 8, 12]
batch_tfms = TSRobustScale(use_single_batch=False, exc_vars=exc_vars)
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, batch_tfms=batch_tfms, num_workers=0)
xb, yb = next(iter(dls.train))
test_eq(len(dls.train.after_batch.fs[0].median.flatten()), 24)
test_eq(len(dls.train.after_batch.fs[0].iqr.flatten()), 24)
test_eq(dls.train.after_batch.fs[0].median.flatten()[exc_vars].cpu(), torch.zeros(len(exc_vars)))
test_eq(dls.train.after_batch.fs[0].iqr.flatten()[exc_vars].cpu(), torch.ones(len(exc_vars)))
print(dls.train.after_batch.fs[0].median.flatten().data)
print(dls.train.after_batch.fs[0].iqr.flatten().data)
tensor([ 0.0000, -1.7305, 0.0000, 0.7365, -1.2736, -0.5528, 0.0000, -0.7074,
0.0000, 0.7087, -0.7014, -0.1120, 0.0000, -1.3332, -0.5958, 0.7563,
-1.0129, -0.3985, -0.5186, -1.5125, -0.7353, 0.7326, -1.1495, -0.5359])
tensor([1.0000, 4.2788, 1.0000, 4.8008, 8.0682, 2.2777, 1.0000, 0.6955, 1.0000,
1.4875, 2.6386, 1.4756, 1.0000, 2.9811, 1.2507, 3.2291, 5.9906, 1.9098,
1.3428, 3.6368, 1.3689, 4.4213, 6.9907, 2.1939])
TSGaussianStandardize
TSGaussianStandardize (E_mean:np.ndarray, S_mean:np.ndarray, E_std:np.ndarray, S_std:np.ndarray, eps=1e-08, split_idx=0, **kwargs)
Scales each batch using modeled mean and std based on UNCERTAINTY MODELING FOR OUT-OF-DISTRIBUTION GENERALIZATION https://arxiv.org/abs/2202.03958
Type | Default | Details | |
---|---|---|---|
E_mean | ndarray | Mean expected value | |
S_mean | ndarray | Uncertainty (standard deviation) of the mean | |
E_std | ndarray | Standard deviation expected value | |
S_std | ndarray | Uncertainty (standard deviation) of the standard deviation | |
eps | float | 1e-08 | (epsilon) small amount added to standard deviation to avoid dividing by zero |
split_idx | int | 0 | Flag to indicate which set this transform is applied to. 0: training, 1: validation, None: both |
kwargs |
get_random_stats
get_random_stats (E_mean, S_mean, E_std, S_std)
get_stats_with_uncertainty
get_stats_with_uncertainty (o, sel_vars=None, sel_vars_zero_mean_unit_var=False, bs=64, n_trials=None, axis=(0, 2))
arr = np.random.rand(1000, 2, 50)
E_mean, S_mean, E_std, S_std = get_stats_with_uncertainty(arr, sel_vars=None, bs=64, n_trials=None, axis=(0,2))
new_mean, new_std = get_random_stats(E_mean, S_mean, E_std, S_std)
new_mean2, new_std2 = get_random_stats(E_mean, S_mean, E_std, S_std)
test_ne(new_mean, new_mean2)
test_ne(new_std, new_std2)
test_eq(new_mean.shape, (1, 2, 1))
test_eq(new_std.shape, (1, 2, 1))
new_mean, new_std
(array([[[0.49649504],
[0.49636062]]]),
array([[[0.28626438],
[0.28665599]]]))
TSGaussianStandardize can be used jointly with TSStandardize in the following way:
X, y, splits = get_UCR_data('LSST', split_data=False)
tfms = [None, TSClassification()]
E_mean, S_mean, E_std, S_std = get_stats_with_uncertainty(X, sel_vars=None, bs=64, n_trials=None, axis=(0,2))
batch_tfms = [TSGaussianStandardize(E_mean, S_mean, E_std, S_std, split_idx=0), TSStandardize(E_mean, E_std, split_idx=1)]
dls = get_ts_dls(X, y, splits=splits, tfms=tfms, batch_tfms=batch_tfms, bs=[32, 64])
learn = ts_learner(dls, InceptionTimePlus, metrics=accuracy, cbs=[ShowGraph()])
learn.fit_one_cycle(1, 1e-2)
In this way the train batches are scaled based on mean and standard deviation distributions while the valid batches are scaled with a fixed mean and standard deviation values.
The intent is to improve out-of-distribution performance. This method is inspired by UNCERTAINTY MODELING FOR OUT-OF-DISTRIBUTION GENERALIZATION https://arxiv.org/abs/2202.03958.
TSDiff
TSDiff (lag=1, pad=True, **kwargs)
Differences batch of type TSTensor
t = TSTensor(torch.arange(24).reshape(2,3,4))
test_eq(TSDiff()(t)[..., 1:].float().mean(), 1)
test_eq(TSDiff(lag=2, pad=False)(t).float().mean(), 2)
TSLog
TSLog (ex=None, **kwargs)
Log transforms batch of type `TSTensor` + 1. Accepts positive and negative numbers
t = TSTensor(torch.rand(2,3,4)) * 2 - 1
tfm = TSLog()
enc_t = tfm(t)
test_ne(enc_t, t)
test_close(tfm.decodes(enc_t).data, t.data)
TSCyclicalPosition
TSCyclicalPosition (cyclical_var=None, magnitude=None, drop_var=False, **kwargs)
Concatenates the position along the sequence as 2 additional variables (sine and cosine)
Type | Default | Details | |
---|---|---|---|
cyclical_var | NoneType | None | Optional variable to indicate the steps within the cycle (i.e. minute of the day) |
magnitude | NoneType | None | Added for compatibility. It’s not used. |
drop_var | bool | False | Flag to indicate if the cyclical var is removed |
kwargs |
bs, c_in, seq_len = 1,3,100
t = TSTensor(torch.rand(bs, c_in, seq_len))
enc_t = TSCyclicalPosition()(t)
test_ne(enc_t, t)
assert t.shape[1] == enc_t.shape[1] - 2
plt.plot(enc_t[0, -2:].cpu().numpy().T)
plt.show()
bs, c_in, seq_len = 1,3,100
t1 = torch.rand(bs, c_in, seq_len)
t2 = torch.arange(seq_len)
t2 = torch.cat([t2[35:], t2[:35]]).reshape(1, 1, -1)
t = TSTensor(torch.cat([t1, t2], 1))
mask = torch.rand_like(t) > .8
t[mask] = np.nan
enc_t = TSCyclicalPosition(3)(t)
test_ne(enc_t, t)
assert t.shape[1] == enc_t.shape[1] - 2
plt.plot(enc_t[0, -2:].cpu().numpy().T)
plt.show()
TSLinearPosition
TSLinearPosition (linear_var:int=None, var_range:tuple=None, magnitude=None, drop_var:bool=False, lin_range:tuple=(-1, 1), **kwargs)
Concatenates the position along the sequence as 1 additional variable
Type | Default | Details | |
---|---|---|---|
linear_var | int | None | Optional variable to indicate the steps within the cycle (i.e. minute of the day) |
var_range | tuple | None | Optional range indicating min and max values of the linear variable |
magnitude | NoneType | None | Added for compatibility. It’s not used. |
drop_var | bool | False | Flag to indicate if the linear var is removed |
lin_range | tuple | (-1, 1) | |
kwargs |
bs, c_in, seq_len = 1,3,100
t = TSTensor(torch.rand(bs, c_in, seq_len))
enc_t = TSLinearPosition()(t)
test_ne(enc_t, t)
assert t.shape[1] == enc_t.shape[1] - 1
plt.plot(enc_t[0, -1].cpu().numpy().T)
plt.show()
t = torch.arange(100)
t1 = torch.cat([t[30:], t[:30]]).reshape(1, 1, -1)
t2 = torch.cat([t[52:], t[:52]]).reshape(1, 1, -1)
t = torch.cat([t1, t2]).float()
mask = torch.rand_like(t) > .8
t[mask] = np.nan
t = TSTensor(t)
enc_t = TSLinearPosition(linear_var=0, var_range=(0, 100), drop_var=True)(t)
test_ne(enc_t, t)
assert t.shape[1] == enc_t.shape[1]
plt.plot(enc_t[0, -1].cpu().numpy().T)
plt.show()
TSMissingness
TSMissingness (sel_vars=None, feature_idxs=None, magnitude=None, **kwargs)
Concatenates data missingness for selected features along the sequence as additional variables
bs, c_in, seq_len = 1,3,100
t = TSTensor(torch.rand(bs, c_in, seq_len))
t[t>.5] = np.nan
enc_t = TSMissingness(sel_vars=[0,2])(t)
test_eq(enc_t.shape[1], 5)
test_eq(enc_t[:, 3:], torch.isnan(t[:, [0,2]]).float())
TSPositionGaps
TSPositionGaps (sel_vars=None, feature_idxs=None, magnitude=None, forward=True, backward=False, nearest=False, normalize=True, **kwargs)
Concatenates gaps for selected features along the sequence as additional variables
bs, c_in, seq_len = 1,3,8
t = TSTensor(torch.rand(bs, c_in, seq_len))
t[t>.5] = np.nan
enc_t = TSPositionGaps(sel_vars=[0,2], forward=True, backward=True, nearest=True, normalize=False)(t)
test_eq(enc_t.shape[1], 9)
enc_t.data
tensor([[[0.2875, 0.0553, nan, nan, 0.1478, 0.1234, 0.0835, 0.1465],
[ nan, nan, 0.3967, nan, 0.0654, nan, 0.2289, 0.1094],
[0.3820, 0.1613, 0.4825, 0.1379, nan, nan, 0.3000, 0.4673],
[1.0000, 1.0000, 1.0000, 2.0000, 3.0000, 1.0000, 1.0000, 1.0000],
[1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 2.0000, 3.0000, 1.0000],
[1.0000, 3.0000, 2.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
[1.0000, 1.0000, 1.0000, 3.0000, 2.0000, 1.0000, 1.0000, 1.0000],
[1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
[1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000]]])
TSRollingMean
TSRollingMean (sel_vars=None, feature_idxs=None, magnitude=None, window=2, replace=False, **kwargs)
*Calculates the rolling mean for all/selected features along the sequence
It replaces the original values or adds additional variables (default). If nan values are found, they will be filled forward and backward*
bs, c_in, seq_len = 1,3,8
t = TSTensor(torch.rand(bs, c_in, seq_len))
t[t > .6] = np.nan
print(t.data)
enc_t = TSRollingMean(sel_vars=[0,2], window=3)(t)
test_eq(enc_t.shape[1], 5)
print(enc_t.data)
enc_t = TSRollingMean(window=3, replace=True)(t)
test_eq(enc_t.shape[1], 3)
print(enc_t.data)
tensor([[[ nan, 0.3836, nan, nan, 0.0237, 0.4363, nan, 0.1834],
[0.2749, 0.5018, nan, 0.4008, 0.2797, 0.4010, 0.4323, 0.3692],
[0.4013, nan, 0.1272, 0.2202, 0.4324, 0.3293, 0.5350, 0.3919]]])
tensor([[[0.3836, 0.3836, 0.3836, 0.3836, 0.0237, 0.4363, 0.4363, 0.1834],
[0.2749, 0.5018, nan, 0.4008, 0.2797, 0.4010, 0.4323, 0.3692],
[0.4013, 0.4013, 0.1272, 0.2202, 0.4324, 0.3293, 0.5350, 0.3919],
[0.3836, 0.3836, 0.3836, 0.3836, 0.2636, 0.2812, 0.2988, 0.3520],
[0.4013, 0.4013, 0.3099, 0.2496, 0.2599, 0.3273, 0.4322, 0.4187]]])
tensor([[[0.3836, 0.3836, 0.3836, 0.3836, 0.2636, 0.2812, 0.2988, 0.3520],
[0.2749, 0.3883, 0.4261, 0.4681, 0.3941, 0.3605, 0.3710, 0.4008],
[0.4013, 0.4013, 0.3099, 0.2496, 0.2599, 0.3273, 0.4322, 0.4187]]])
TSLogReturn
TSLogReturn (lag=1, pad=True, **kwargs)
Calculates log-return of batch of type `TSTensor`. For positive values only
t = TSTensor([1,2,4,8,16,32,64,128,256]).float()
test_eq(TSLogReturn(pad=False)(t).std(), 0)
TSAdd
TSAdd (add, **kwargs)
Add a defined amount to each batch of type `TSTensor`.
t = TSTensor([1,2,3]).float()
test_eq(TSAdd(1)(t), TSTensor([2,3,4]).float())
TSClipByVar
TSClipByVar (var_min_max, **kwargs)
*Clip batch of type `TSTensor` by variable
Args: var_min_max: list of tuples containing variable index, min value (or None) and max value (or None)*
t = TSTensor(torch.rand(16, 3, 10) * tensor([1,10,100]).reshape(1,-1,1))
max_values = t.max(0).values.max(-1).values.data
max_values2 = TSClipByVar([(1,None,5), (2,10,50)])(t).max(0).values.max(-1).values.data
test_le(max_values2[1], 5)
test_ge(max_values2[2], 10)
test_le(max_values2[2], 50)
TSDropVars
TSDropVars (drop_vars, **kwargs)
Drops selected variable from the input
t = TSTensor(torch.arange(24).reshape(2, 3, 4))
enc_t = TSDropVars(2)(t)
test_ne(t, enc_t)
enc_t.data
tensor([[[ 0, 1, 2, 3],
[ 4, 5, 6, 7]],
[[12, 13, 14, 15],
[16, 17, 18, 19]]])
TSOneHotEncode
TSOneHotEncode (sel_var:int, unique_labels:list, add_na:bool=False, drop_var:bool=True, magnitude=None, **kwargs)
Delegates (`__call__`, `decode`, `setup`) to (`encodes`, `decodes`, `setups`) if `split_idx` matches
Type | Default | Details | |
---|---|---|---|
sel_var | int | Variable that is one-hot encoded | |
unique_labels | list | List containing all labels (excluding nan values) | |
add_na | bool | False | Flag to indicate if values not included in vocab should be set as 0 |
drop_var | bool | True | Flag to indicate if the selected var is removed |
magnitude | NoneType | None | Added for compatibility. It’s not used. |
kwargs |
bs = 2
seq_len = 5
t_cont = torch.rand(bs, 1, seq_len)
t_cat = torch.randint(0, 3, t_cont.shape)
t = TSTensor(torch.cat([t_cat, t_cont], 1))
t_cat
tensor([[[0, 2, 1, 0, 2]],
[[0, 0, 1, 1, 2]]])
tfm = TSOneHotEncode(0, [0, 1, 2])
output = tfm(t)[:, -3:].data
test_eq(t_cat, torch.argmax(tfm(t)[:, -3:], 1)[:, None])
tfm(t)[:, -3:].data
tensor([[[1., 0., 0., 1., 0.],
[0., 0., 1., 0., 0.],
[0., 1., 0., 0., 1.]],
[[1., 1., 0., 0., 0.],
[0., 0., 1., 1., 0.],
[0., 0., 0., 0., 1.]]])
bs = 2
seq_len = 5
t_cont = torch.rand(bs, 1, seq_len)
t_cat = torch.tensor([[10., 5., 11., np.nan, 12.], [ 5., 12., 10., np.nan, 11.]])[:, None]
t = TSTensor(torch.cat([t_cat, t_cont], 1))
t_cat
tensor([[[10., 5., 11., nan, 12.]],
[[ 5., 12., 10., nan, 11.]]])
tfm = TSOneHotEncode(0, [10, 11, 12], drop_var=False)
mask = ~torch.isnan(t[:, 0])
test_eq(tfm(t)[:, 0][mask], t[:, 0][mask])
tfm(t)[:, -3:].data
tensor([[[1., 0., 0., 0., 0.],
[0., 0., 1., 0., 0.],
[0., 0., 0., 0., 1.]],
[[0., 0., 1., 0., 0.],
[0., 0., 0., 0., 1.],
[0., 1., 0., 0., 0.]]])
t1 = torch.randint(3, 7, (2, 1, 10))
t2 = torch.rand(2, 1, 10)
t = TSTensor(torch.cat([t1, t2], 1))
output = TSOneHotEncode(0, [3, 4, 5], add_na=True, drop_var=True)(t)
test_eq((t1 > 5).float(), output.data[:, [1]])
test_eq((t1 == 3).float(), output.data[:, [2]])
test_eq((t1 == 4).float(), output.data[:, [3]])
test_eq((t1 == 5).float(), output.data[:, [4]])
test_eq(output.shape, (t.shape[0], 5, t.shape[-1]))
TSPosition
TSPosition (steps:list, magnitude=None, **kwargs)
Delegates (`__call__`, `decode`, `setup`) to (`encodes`, `decodes`, `setups`) if `split_idx` matches
Type | Default | Details | |
---|---|---|---|
steps | list | List containing the steps passed as an additional variable. They should be normalized. | |
magnitude | NoneType | None | Added for compatibility. It’s not used. |
kwargs |
t = TSTensor(torch.rand(2, 1, 10)).float()
a = np.linspace(-1, 1, 10).astype('float64')
TSPosition(a)(t).data.dtype, t.dtype
(torch.float32, torch.float32)
PatchEncoder
PatchEncoder (patch_len:int, patch_stride:int=None, pad_at_start:bool=True, value:float=0.0, seq_len:int=None, merge_dims:bool=True, reduction:str='none', reduction_dim:int=-1, swap_dims:tuple=None)
Creates a sequence of patches from a 3d input tensor.
Type | Default | Details | |
---|---|---|---|
patch_len | int | Number of time steps in each patch. | |
patch_stride | int | None | Stride of the patch. |
pad_at_start | bool | True | If True, pad the input tensor at the start to ensure that the input tensor is evenly divisible by the patch length. |
value | float | 0.0 | Value to pad the input tensor with. |
seq_len | int | None | Number of time steps in the input tensor. If None, make sure seq_len >= patch_len and a multiple of stride |
merge_dims | bool | True | If True, merge channels within the same patch. |
reduction | str | none | type of reduction applied. Available: “none”, “mean”, “min”, “max”, “mode” |
reduction_dim | int | -1 | dimension where the reduction is applied |
swap_dims | tuple | None | If True, swap the time and channel dimensions. |
seq_len = 17
patch_len = 10
patch_stride = 5

z11 = torch.arange(seq_len).reshape(1, 1, -1)
z12 = torch.arange(seq_len).reshape(1, 1, -1) * 10
z1 = torch.cat((z11, z12), dim=1)
z21 = torch.arange(seq_len).reshape(1, 1, -1)
z22 = torch.arange(seq_len).reshape(1, 1, -1) * 10
z2 = torch.cat((z21, z22), dim=1) + 1
z31 = torch.arange(seq_len).reshape(1, 1, -1)
z32 = torch.arange(seq_len).reshape(1, 1, -1) * 10
z3 = torch.cat((z31, z32), dim=1) + 2
z = torch.cat((z11, z21, z31), dim=0)
z = torch.cat((z1, z2, z3), dim=0)
print(z.shape, "\n")
print(z)

patch_encoder = PatchEncoder(patch_len=patch_len, patch_stride=patch_stride, value=-1, seq_len=seq_len, merge_dims=True)
output = patch_encoder(z)
print(output.shape, "\n")
first_token = output[..., 0]
expected_first_token = torch.tensor([[-1, -1, -1, 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, 0, 10, 20, 30, 40,
                                      50, 60],
                                     [-1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, 1, 11, 21, 31, 41,
                                      51, 61],
                                     [-1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 2, 12, 22, 32, 42,
                                      52, 62]])
test_eq(first_token, expected_first_token)
torch.Size([3, 2, 17])
tensor([[[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16],
[ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130,
140, 150, 160]],
[[ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17],
[ 1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 101, 111, 121, 131,
141, 151, 161]],
[[ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18],
[ 2, 12, 22, 32, 42, 52, 62, 72, 82, 92, 102, 112, 122, 132,
142, 152, 162]]])
torch.Size([3, 20, 3])
TSPatchEncoder
TSPatchEncoder (patch_len:int, patch_stride:int=None, pad_at_start:bool=True, value:float=0.0, seq_len:int=None, merge_dims:bool=True, reduction:str='none', reduction_dim:int=-2, swap_dims:tuple=None)
Transforms a time series into a sequence of patches along the last dimension
Type | Default | Details | |
---|---|---|---|
patch_len | int | Number of time steps in each patch. | |
patch_stride | int | None | Stride of the patch. |
pad_at_start | bool | True | If True, pad the input tensor at the start to ensure that the input tensor is evenly divisible by the patch length. |
value | float | 0.0 | Value to pad the input tensor with. |
seq_len | int | None | Number of time steps in the input tensor. If None, make sure seq_len >= patch_len and a multiple of stride |
merge_dims | bool | True | If True, merge channels within the same patch. |
reduction | str | none | type of reduction applied. Available: “none”, “mean”, “min”, “max”, “mode” |
reduction_dim | int | -2 | dimension where the y reduction is applied. |
swap_dims | tuple | None | If True, swap the time and channel dimensions. |
bs = 2
c_in = 1
seq_len = 10
patch_len = 4

t = TSTensor(torch.arange(bs * c_in * seq_len).reshape(bs, c_in, seq_len))
print(t.data)
print(t.shape, "\n")

patch_encoder = TSPatchEncoder(patch_len=patch_len, patch_stride=1, seq_len=seq_len)
output = patch_encoder(t)
test_eq(output.shape, ([bs, patch_len, 7]))
print("first patch:\n", output[..., 0].data, "\n")

patch_encoder = TSPatchEncoder(patch_len=patch_len, patch_stride=None, seq_len=seq_len)
output = patch_encoder(t)
test_eq(output.shape, ([bs, patch_len, 3]))
print("first patch:\n", output[..., 0].data, "\n")
tensor([[[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]],
[[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]])
torch.Size([2, 1, 10])
first patch:
tensor([[ 0, 1, 2, 3],
[10, 11, 12, 13]])
first patch:
tensor([[ 0, 0, 0, 1],
[ 0, 0, 10, 11]])
TSTuplePatchEncoder
TSTuplePatchEncoder (patch_len:int, patch_stride:int=None, pad_at_start:bool=True, value:float=0.0, seq_len:int=None, merge_dims:bool=True, reduction:str='none', reduction_dim:int=-2, swap_dims:tuple=None)
Transforms a time series with x and y into sequences of patches along the last dimension
Type | Default | Details | |
---|---|---|---|
patch_len | int | Number of time steps in each patch. | |
patch_stride | int | None | Stride of the patch. |
pad_at_start | bool | True | If True, pad the input tensor at the start to ensure that the input tensor is evenly divisible by the patch length. |
value | float | 0.0 | Value to pad the input tensor with. |
seq_len | int | None | Number of time steps in the input tensor. If None, make sure seq_len >= patch_len and a multiple of stride |
merge_dims | bool | True | If True, merge y channels within the same patch. |
reduction | str | none | type of reduction applied to y. Available: “none”, “mean”, “min”, “max”, “mode” |
reduction_dim | int | -2 | dimension where the y reduction is applied. |
swap_dims | tuple | None | If provided (not None), swap the time and channel dimensions in y. |
# test
= 2
bs = 2
c_in = 10
seq_len = 4
patch_len
= torch.arange(bs * c_in * seq_len).reshape(bs, c_in, seq_len)
x = torch.arange(bs * c_in * seq_len).reshape(bs, c_in, seq_len) * 10
y print(x)
print(y)
= TSTuplePatchEncoder(patch_len=patch_len, patch_stride=1, seq_len=seq_len, merge_dims=True)
patch_encoder = patch_encoder((x, y))
x_out, y_out * patch_len, 7]))
test_eq(x_out.shape, ([bs, c_in * patch_len, 7]))
test_eq(y_out.shape, ([bs, c_in print("first x patch:\n", x_out[..., 0].data, "\n")
print("first y patch:\n", y_out[..., 0].data, "\n")
= TSTuplePatchEncoder(patch_len=patch_len, patch_stride=1, seq_len=seq_len, merge_dims=False, reduction="max")
patch_encoder = patch_encoder((x, y))
x_out, y_out * patch_len, 7]))
test_eq(x_out.shape, ([bs, c_in 7]))
test_eq(y_out.shape, ([bs, c_in, print("first x patch:\n", x_out[..., 0].data, "\n")
print("first y patch:\n", y_out[..., 0].data, "\n")
tensor([[[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]],
[[20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
[30, 31, 32, 33, 34, 35, 36, 37, 38, 39]]])
tensor([[[ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90],
[100, 110, 120, 130, 140, 150, 160, 170, 180, 190]],
[[200, 210, 220, 230, 240, 250, 260, 270, 280, 290],
[300, 310, 320, 330, 340, 350, 360, 370, 380, 390]]])
first x patch:
tensor([[ 0, 1, 2, 3, 10, 11, 12, 13],
[20, 21, 22, 23, 30, 31, 32, 33]])
first y patch:
tensor([[ 0, 10, 20, 30, 100, 110, 120, 130],
[200, 210, 220, 230, 300, 310, 320, 330]])
first x patch:
tensor([[ 0, 1, 2, 3, 10, 11, 12, 13],
[20, 21, 22, 23, 30, 31, 32, 33]])
first y patch:
tensor([[ 30, 130],
[230, 330]])
sklearn API transforms
object2date
object2date (x, format=None)
TSShrinkDataFrame
TSShrinkDataFrame (columns=None, skip=None, obj2cat=True, int2uint=False, verbose=True)
A transformer to shrink dataframe or series memory usage
Type | Default | Details | |
---|---|---|---|
columns | NoneType | None | List[str], optional. Columns to shrink, all columns by default. |
skip | NoneType | None | List[str], optional. Columns to skip, None by default. |
obj2cat | bool | True | bool, optional. Convert object columns to category, True by default. |
int2uint | bool | False | bool, optional. Convert int columns to uint, False by default. |
verbose | bool | True | bool, optional. Print memory usage info. True by default. |
= pd.DataFrame()
df "ints64"] = np.random.randint(0,3,10)
df['floats64'] = np.random.rand(10)
df[= TSShrinkDataFrame()
tfm = tfm.fit_transform(df)
df "ints64"].dtype, "int8")
test_eq(df["floats64"].dtype, "float32") test_eq(df[
Initial memory usage: 288.00 B
Final memory usage : 178.00 B (-38.2%)
# test with date
= pd.DataFrame()
df "dates"] = pd.date_range('1/1/2011', periods=10, freq='M').astype(str)
df["ints64"] = np.random.randint(0,3,10)
df['floats64'] = np.random.rand(10)
df[= TSShrinkDataFrame()
tfm = tfm.fit_transform(df)
df "dates"].dtype, "datetime64[ns]")
test_eq(df["ints64"].dtype, "int8")
test_eq(df["floats64"].dtype, "float32") test_eq(df[
Initial memory usage: 368.00 B
Final memory usage : 258.00 B (-29.9%)
# test with date and series
= pd.DataFrame()
df "dates"] = pd.date_range('1/1/2011', periods=10, freq='M').astype(str)
df[= TSShrinkDataFrame()
tfm = tfm.fit_transform(df["dates"])
df "datetime64[ns]") test_eq(df.dtype,
Initial memory usage: 208.00 B
Final memory usage : 208.00 B (0.0%)
TSOneHotEncoder
TSOneHotEncoder (columns=None, drop=True, add_na=True, dtype=<class 'numpy.int8'>)
Encode categorical variables using one-hot encoding
Type | Default | Details | |
---|---|---|---|
columns | NoneType | None | (str or List[str], optional): Column name(s) to encode. If None, all columns will be encoded. Defaults to None. |
drop | bool | True | (bool, optional): Whether to drop the original columns after encoding. Defaults to True. |
add_na | bool | True | (bool, optional): Whether to add a ‘NaN’ category for missing values. Defaults to True. |
dtype | type | int8 | (type, optional): Data type of the encoded output. Defaults to np.int8. |
= pd.DataFrame()
df "a"] = np.random.randint(0,2,10)
df["b"] = np.random.randint(0,3,10)
df[= len(df["a"].unique()) + len(df["b"].unique())
unique_cols = TSOneHotEncoder()
tfm
tfm.fit(df)= tfm.transform(df)
df 1], unique_cols) test_eq(df.shape[
TSCategoricalEncoder
TSCategoricalEncoder (columns=None, add_na=True, sort=True, categories='auto', inplace=True, prefix=None, suffix=None, drop=False)
A transformer to encode categorical columns
Type | Default | Details | |
---|---|---|---|
columns | NoneType | None | List[str], optional. Columns to encode, all columns by default. |
add_na | bool | True | bool, optional. Add a NaN category, True by default. |
sort | bool | True | bool, optional. Sort categories by frequency, True by default. |
categories | str | auto | dict, optional. The custom mapping of categories. ‘auto’ by default. |
inplace | bool | True | bool, optional. Modify input DataFrame, True by default. |
prefix | NoneType | None | str, optional. Prefix for created column names. None by default. |
suffix | NoneType | None | str, optional. Suffix for created column names. None by default. |
drop | bool | False | bool, optional. Drop original columns, False by default. |
Stateful transforms like TSCategoricalEncoder can easily be serialized.
import joblib
= pd.DataFrame()
df "a"] = alphabet[np.random.randint(0,2,100)]
df["b"] = ALPHABET[np.random.randint(0,3,100)]
df[
display(df)= len(df["a"].unique())
a_unique = len(df["b"].unique())
b_unique = TSCategoricalEncoder()
tfm =slice(0, 50))
tfm.fit(df, idxs"data/TSCategoricalEncoder.joblib")
joblib.dump(tfm, = joblib.load("data/TSCategoricalEncoder.joblib")
tfm 0, "a"] = 'z'
df.loc[1, "a"] = 'h'
df.loc[= tfm.transform(df)
df
display(df)'a'].max(), a_unique)
test_eq(df['b'].max(), b_unique)
test_eq(df[= tfm.inverse_transform(df)
df display(df)
a | b | |
---|---|---|
0 | b | B |
1 | b | A |
2 | b | C |
3 | a | C |
4 | b | C |
... | ... | ... |
95 | a | A |
96 | a | A |
97 | a | B |
98 | a | A |
99 | b | B |
100 rows × 2 columns
a | b | |
---|---|---|
0 | 0 | 2 |
1 | 0 | 1 |
2 | 2 | 3 |
3 | 1 | 3 |
4 | 2 | 3 |
... | ... | ... |
95 | 1 | 1 |
96 | 1 | 1 |
97 | 1 | 2 |
98 | 1 | 1 |
99 | 2 | 2 |
100 rows × 2 columns
a | b | |
---|---|---|
0 | #na# | B |
1 | #na# | A |
2 | b | C |
3 | a | C |
4 | b | C |
... | ... | ... |
95 | a | A |
96 | a | A |
97 | a | B |
98 | a | A |
99 | b | B |
100 rows × 2 columns
= pd.DataFrame()
df "a"] = alphabet[np.random.randint(0,2,100)]
df["a"] = df["a"].astype('category')
df["b"] = ALPHABET[np.random.randint(0,3,100)]
df[
display(df)= len(df["a"].unique())
a_unique = len(df["b"].unique())
b_unique = TSCategoricalEncoder()
tfm
tfm.fit(df)"data/TSCategoricalEncoder.joblib")
joblib.dump(tfm, = joblib.load("data/TSCategoricalEncoder.joblib")
tfm "a"] = alphabet[np.random.randint(0,5,100)]
df["a"] = df["a"].astype('category')
df["b"] = ALPHABET[np.random.randint(0,3,100)]
df[
display(df)= tfm.transform(df)
df
display(df)'a'].max(), a_unique)
test_eq(df['b'].max(), b_unique)
test_eq(df[= tfm.inverse_transform(df)
df display(df)
a | b | |
---|---|---|
0 | b | B |
1 | a | C |
2 | b | C |
3 | a | C |
4 | b | B |
... | ... | ... |
95 | b | A |
96 | b | A |
97 | a | A |
98 | b | B |
99 | b | B |
100 rows × 2 columns
a | b | |
---|---|---|
0 | d | A |
1 | a | A |
2 | c | A |
3 | a | A |
4 | a | B |
... | ... | ... |
95 | c | C |
96 | d | B |
97 | c | A |
98 | b | B |
99 | e | B |
100 rows × 2 columns
a | b | |
---|---|---|
0 | 0 | 1 |
1 | 1 | 1 |
2 | 0 | 1 |
3 | 1 | 1 |
4 | 1 | 2 |
... | ... | ... |
95 | 0 | 3 |
96 | 0 | 2 |
97 | 0 | 1 |
98 | 2 | 2 |
99 | 0 | 2 |
100 rows × 2 columns
a | b | |
---|---|---|
0 | #na# | A |
1 | a | A |
2 | #na# | A |
3 | a | A |
4 | a | B |
... | ... | ... |
95 | #na# | C |
96 | #na# | B |
97 | #na# | A |
98 | b | B |
99 | #na# | B |
100 rows × 2 columns
= pd.DataFrame()
df "a"] = alphabet[np.random.randint(0,2,100)]
df["a"] = df["a"].astype('category')
df[= df['a']
s
display(s)= TSCategoricalEncoder()
tfm
tfm.fit(s)"data/TSCategoricalEncoder.joblib")
joblib.dump(tfm, = joblib.load("data/TSCategoricalEncoder.joblib")
tfm = tfm.transform(s)
s
display(s)= tfm.inverse_transform(s)
s display(s)
0 a
1 b
2 a
3 a
4 a
..
95 a
96 a
97 a
98 a
99 b
Name: a, Length: 100, dtype: category
Categories (2, object): ['a', 'b']
0 1
1 2
2 1
3 1
4 1
..
95 1
96 1
97 1
98 1
99 2
Length: 100, dtype: int8
0 a
1 b
2 a
3 a
4 a
..
95 a
96 a
97 a
98 a
99 b
Length: 100, dtype: object
TSTargetEncoder
TSTargetEncoder (target_column, columns=None, inplace=True, prefix=None, suffix=None, drop=True, dtypes=['object', 'category'])
*Mixin class for all transformers in scikit-learn.

This mixin defines the following functionality:

- a `fit_transform` method that delegates to `fit` and `transform`;
- a `set_output` method to output `X` as a specific container type.

If :term:`get_feature_names_out` is defined, then :class:`BaseEstimator` will automatically wrap `transform` and `fit_transform` to follow the `set_output` API. See the :ref:`developer_api_set_output` for details.*
Type | Default | Details | |
---|---|---|---|
target_column | column containing the target | ||
columns | NoneType | None | List[str], optional. Columns to encode, all non-numerical columns by default. |
inplace | bool | True | bool, optional. Modify input DataFrame, True by default. |
prefix | NoneType | None | str, optional. Prefix for created column names. None by default. |
suffix | NoneType | None | str, optional. Suffix for created column names. None by default. |
drop | bool | True | bool, optional. Drop original columns, True by default. |
dtypes | list | [‘object’, ‘category’] | List[str]. List with dtypes that will be used to identify columns to encode if not explicitly passed. |
from sklearn.model_selection import train_test_split
# Create a dataframe with 100 rows
42)
np.random.seed(= pd.DataFrame({
df 'category1': np.random.choice(['cat', 'dog', 'rabbit'], 100),
'category2': np.random.choice(['large', 'small'], 100),
'continuous': np.random.rand(100),
'target': np.random.randint(0, 2, 100)
})
display(df)
# Split the data into train and test sets
= train_test_split(np.arange(100), test_size=0.2, random_state=42)
train_idx, test_idx print(train_idx.shape)
# Initialize the encoder
= TSTargetEncoder(columns=['category1', 'category2'], target_column='target', inplace=False, suffix="te", drop=False)
encoder
# Fit the encoder using the training data
=train_idx)
encoder.fit(df, idxs
# Transform the whole dataframe
= encoder.transform(df)
df_encoded
# Check the results
for c in ["category1", "category2"]:
for v in df[c].unique():
assert df.loc[train_idx][df.loc[train_idx, c] == v]["target"].mean() == df_encoded[df_encoded[c] == v][f"{c}_te"].mean()
df_encoded
(80,)
category1 | category2 | continuous | target | |
---|---|---|---|---|
0 | rabbit | small | 0.896091 | 0 |
1 | cat | small | 0.318003 | 1 |
2 | rabbit | small | 0.110052 | 1 |
3 | rabbit | large | 0.227935 | 0 |
4 | cat | large | 0.427108 | 0 |
... | ... | ... | ... | ... |
95 | cat | small | 0.325400 | 0 |
96 | cat | large | 0.746491 | 0 |
97 | rabbit | small | 0.649633 | 1 |
98 | cat | small | 0.849223 | 0 |
99 | cat | large | 0.657613 | 1 |
100 rows × 4 columns
category1 | category2 | continuous | target | category1_te | category2_te | |
---|---|---|---|---|---|---|
0 | rabbit | small | 0.896091 | 0 | 0.565217 | 0.500000 |
1 | cat | small | 0.318003 | 1 | 0.555556 | 0.500000 |
2 | rabbit | small | 0.110052 | 1 | 0.565217 | 0.500000 |
3 | rabbit | large | 0.227935 | 0 | 0.565217 | 0.521739 |
4 | cat | large | 0.427108 | 0 | 0.555556 | 0.521739 |
... | ... | ... | ... | ... | ... | ... |
95 | cat | small | 0.325400 | 0 | 0.555556 | 0.500000 |
96 | cat | large | 0.746491 | 0 | 0.555556 | 0.521739 |
97 | rabbit | small | 0.649633 | 1 | 0.565217 | 0.500000 |
98 | cat | small | 0.849223 | 0 | 0.555556 | 0.500000 |
99 | cat | large | 0.657613 | 1 | 0.555556 | 0.521739 |
100 rows × 6 columns
TSDateTimeEncoder
TSDateTimeEncoder (datetime_columns=None, prefix=None, drop=True, time=False, attr=['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start'])
*Base class for all estimators in scikit-learn.

Inheriting from this class provides default implementations of:

- setting and getting parameters used by `GridSearchCV` and friends;
- textual and HTML representation displayed in terminals and IDEs;
- estimator serialization;
- parameters validation;
- data validation;
- feature names validation.

Read more in the :ref:`User Guide <rolling_your_own_estimator>`.*
import datetime as dt
= pd.DataFrame()
df 0, "date"] = dt.datetime.now()
df.loc[1, "date"] = dt.datetime.now() + pd.Timedelta(1, unit="D")
df.loc[= TSDateTimeEncoder()
tfm "data/TSDateTimeEncoder.joblib")
joblib.dump(tfm, = joblib.load("data/TSDateTimeEncoder.joblib")
tfm tfm.fit_transform(df)
_Year | _Month | _Week | _Day | _Dayofweek | _Dayofyear | _Is_month_end | _Is_month_start | _Is_quarter_end | _Is_quarter_start | _Is_year_end | _Is_year_start | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2023 | 6 | 24 | 17 | 5 | 168 | False | False | False | False | False | False |
1 | 2023 | 6 | 24 | 18 | 6 | 169 | False | False | False | False | False | False |
TSDropIfTrueCols
TSDropIfTrueCols (columns=None)
*Base class for all estimators in scikit-learn.

Inheriting from this class provides default implementations of:

- setting and getting parameters used by `GridSearchCV` and friends;
- textual and HTML representation displayed in terminals and IDEs;
- estimator serialization;
- parameters validation;
- data validation;
- feature names validation.

Read more in the :ref:`User Guide <rolling_your_own_estimator>`.*
# test TSDropIfTrueCols
= pd.DataFrame()
df "a"] = [0, 0, 1, 0, 0]
df["b"] = [0, 0, 0, 0, 0]
df["c"] = [0, 1, 0, 0, 1]
df[
= pd.DataFrame()
expected_output "b"] = [0, 0, 0, 0]
expected_output["c"] = [0, 1, 0, 1]
expected_output[
= TSDropIfTrueCols("a")
tfm = tfm.fit_transform(df)
output test_eq(output, expected_output),
(None,)
TSApplyFunction
TSApplyFunction (function, groups=None, group_keys=False, axis=1, columns=None, reset_index=False, drop=True)
*Base class for all estimators in scikit-learn.

Inheriting from this class provides default implementations of:

- setting and getting parameters used by `GridSearchCV` and friends;
- textual and HTML representation displayed in terminals and IDEs;
- estimator serialization;
- parameters validation;
- data validation;
- feature names validation.

Read more in the :ref:`User Guide <rolling_your_own_estimator>`.*
= pd.DataFrame()
df "a"] = [0, 0, 1, 0, 0]
df["b"] = [0, 0, 0, 0, 0]
df["c"] = [0, 1, 0, 0, 1]
df[
apply(lambda x: 1, ) df.
a 1
b 1
c 1
dtype: int64
# test ApplyFunction without groups
= pd.DataFrame()
df "a"] = [0, 0, 1, 0, 0]
df["b"] = [0, 0, 0, 0, 0]
df["c"] = [0, 1, 0, 0, 1]
df[
= pd.Series([1,1,1])
expected_output
= TSApplyFunction(lambda x: 1, axis=0, reset_index=True)
tfm = tfm.fit_transform(df)
output test_eq(output, expected_output)
# test ApplyFunction with groups and square function
= pd.DataFrame()
df "a"] = [0, 1, 2, 3, 4]
df["id"] = [0, 0, 0, 1, 1]
df[
= pd.Series([5, 25])
expected_output
= TSApplyFunction(lambda x: (x["a"]**2).sum(), groups="id")
tfm
= tfm.fit_transform(df)
output test_eq(output, expected_output)
TSMissingnessEncoder
TSMissingnessEncoder (columns=None)
*Base class for all estimators in scikit-learn.

Inheriting from this class provides default implementations of:

- setting and getting parameters used by `GridSearchCV` and friends;
- textual and HTML representation displayed in terminals and IDEs;
- estimator serialization;
- parameters validation;
- data validation;
- feature names validation.

Read more in the :ref:`User Guide <rolling_your_own_estimator>`.*
= np.random.rand(10,3)
data > .8] = np.nan
data[data = pd.DataFrame(data, columns=["a", "b", "c"])
df = TSMissingnessEncoder()
tfm
tfm.fit(df)"data/TSMissingnessEncoder.joblib")
joblib.dump(tfm, = joblib.load("data/TSMissingnessEncoder.joblib")
tfm = tfm.transform(df)
df df
a | b | c | a_missing | b_missing | c_missing | |
---|---|---|---|---|---|---|
0 | NaN | NaN | NaN | 1 | 1 | 1 |
1 | 0.511342 | 0.501516 | 0.798295 | 0 | 0 | 0 |
2 | 0.649964 | 0.701967 | 0.795793 | 0 | 0 | 0 |
3 | NaN | 0.337995 | 0.375583 | 1 | 0 | 0 |
4 | 0.093982 | 0.578280 | 0.035942 | 0 | 0 | 0 |
5 | 0.465598 | 0.542645 | 0.286541 | 0 | 0 | 0 |
6 | 0.590833 | 0.030500 | 0.037348 | 0 | 0 | 0 |
7 | NaN | 0.360191 | 0.127061 | 1 | 0 | 0 |
8 | 0.522243 | 0.769994 | 0.215821 | 0 | 0 | 0 |
9 | 0.622890 | 0.085347 | 0.051682 | 0 | 0 | 0 |
TSSortByColumns
TSSortByColumns (columns, ascending=True, inplace=True, kind='stable', na_position='last', ignore_index=False, key=None)
Transforms a dataframe by sorting by columns.
Type | Default | Details | |
---|---|---|---|
columns | Columns to sort by | ||
ascending | bool | True | Ascending or descending |
inplace | bool | True | Perform operation in place |
kind | str | stable | Type of sort to use |
na_position | str | last | Where to place NaNs |
ignore_index | bool | False | Do not preserve index |
key | NoneType | None | Function to apply to values before sorting |
# Test
= pd.DataFrame(np.random.rand(10,3), columns=["a", "b", "c"])
df = df.copy()
df_ori = TSSortByColumns(["a", "b"])
tfm = tfm.fit_transform(df)
df "a", "b"]).values, df.values) test_eq(df_ori.sort_values([
TSSelectColumns
TSSelectColumns (columns)
Transform used to select columns
Details | |
---|---|
columns | str or List[str]. Selected columns. |
# Test
= pd.DataFrame(np.random.rand(10,3), columns=["a", "b", "c"])
df = df.copy()
df_ori = TSSelectColumns(["a", "b"])
tfm = tfm.fit_transform(df)
df "a", "b"]].values, df.values)
test_eq(df_ori[[= tfm.inverse_transform(df) df
TSStepsSinceStart
TSStepsSinceStart (datetime_col, datetime_unit='D', start_datetime=None, drop=False, dtype=None)
Add a column indicating the number of steps since the start in each row
# Test
= pd.DataFrame(np.random.rand(10,3), columns=["a", "b", "c"])
df "datetime"] = pd.date_range("2020-01-01", periods=10)
df[
display(df)= df.copy()
df_ori = TSStepsSinceStart("datetime", datetime_unit="D", drop=True, dtype=np.int32)
tfm = tfm.fit_transform(df)
df
display(df)"days_since_start"].values, np.arange(10))
test_eq(df[= tfm.inverse_transform(df)
df test_eq(df_ori.values, df.values)
a | b | c | datetime | |
---|---|---|---|---|
0 | 0.643288 | 0.458253 | 0.545617 | 2020-01-01 |
1 | 0.941465 | 0.386103 | 0.961191 | 2020-01-02 |
2 | 0.905351 | 0.195791 | 0.069361 | 2020-01-03 |
3 | 0.100778 | 0.018222 | 0.094443 | 2020-01-04 |
4 | 0.683007 | 0.071189 | 0.318976 | 2020-01-05 |
5 | 0.844875 | 0.023272 | 0.814468 | 2020-01-06 |
6 | 0.281855 | 0.118165 | 0.696737 | 2020-01-07 |
7 | 0.628943 | 0.877472 | 0.735071 | 2020-01-08 |
8 | 0.803481 | 0.282035 | 0.177440 | 2020-01-09 |
9 | 0.750615 | 0.806835 | 0.990505 | 2020-01-10 |
a | b | c | days_since_start | |
---|---|---|---|---|
0 | 0.643288 | 0.458253 | 0.545617 | 0 |
1 | 0.941465 | 0.386103 | 0.961191 | 1 |
2 | 0.905351 | 0.195791 | 0.069361 | 2 |
3 | 0.100778 | 0.018222 | 0.094443 | 3 |
4 | 0.683007 | 0.071189 | 0.318976 | 4 |
5 | 0.844875 | 0.023272 | 0.814468 | 5 |
6 | 0.281855 | 0.118165 | 0.696737 | 6 |
7 | 0.628943 | 0.877472 | 0.735071 | 7 |
8 | 0.803481 | 0.282035 | 0.177440 | 8 |
9 | 0.750615 | 0.806835 | 0.990505 | 9 |
TSStandardScaler
TSStandardScaler (columns=None, mean=None, std=None, eps=1e-06)
Scale the values of specified columns in the input DataFrame to have a mean of 0 and standard deviation of 1.
Type | Default | Details | |
---|---|---|---|
columns | NoneType | None | Column name(s) to be transformed. If None, all columns are transformed. Defaults to None. |
mean | NoneType | None | Mean value for each column. If None, the mean value of each column is calculated during the fit method. Defaults to None. |
std | NoneType | None | Stdev value for each column. If None, the standard deviation value of each column is calculated during the fit method. Defaults to None. |
eps | float | 1e-06 | A small value to avoid division by zero. Defaults to 1e-6. |
# Test
= pd.DataFrame(np.random.rand(100,3), columns=["a", "b", "c"])
df = TSStandardScaler()
tfm = tfm.fit_transform(df)
df 3), 1e-3)
test_close(df.mean().values, np.zeros(3), 1e-3) test_close(df.std().values, np.ones(
# Test
= pd.DataFrame(np.random.rand(1000,3), columns=["a", "b", "c"])
df = TSStandardScaler()
tfm = tfm.fit_transform(df, idxs=slice(0, 800))
df 3), 1e-1)
test_close(df.mean().values, np.zeros(3), 1e-1) test_close(df.std().values, np.ones(
TSRobustScaler
TSRobustScaler (columns=None, quantile_range=(25.0, 75.0), eps=1e-06)
This Scaler removes the median and scales the data according to the quantile range (defaults to IQR: Interquartile Range)
# test RobustScaler
= pd.DataFrame(np.random.rand(100,3), columns=["a", "b", "c"])
df "a"] = df["a"] * 100
df["b"] = df["b"] * 10
df[= TSRobustScaler()
tfm = tfm.fit_transform(df)
df 3), 1e-3) test_close(df.median().values, np.zeros(
TSAddMissingTimestamps
TSAddMissingTimestamps (datetime_col=None, use_index=False, unique_id_cols=None, fill_value=nan, range_by_group=True, start_date=None, end_date=None, freq=None)
*Mixin class for all transformers in scikit-learn.

This mixin defines the following functionality:

- a `fit_transform` method that delegates to `fit` and `transform`;
- a `set_output` method to output `X` as a specific container type.

If :term:`get_feature_names_out` is defined, then :class:`BaseEstimator` will automatically wrap `transform` and `fit_transform` to follow the `set_output` API. See the :ref:`developer_api_set_output` for details.

:class:`OneToOneFeatureMixin` and :class:`ClassNamePrefixFeaturesOutMixin` are helpful mixins for defining :term:`get_feature_names_out`.*
# Test
= pd.DataFrame(np.random.rand(10,3), columns=["a", "b", "c"])
df "datetime"] = pd.date_range("2020-01-01", periods=10)
df[= df.iloc[[0, 2, 3, 5, 6, 8, 9]]
df
display(df)= TSAddMissingTimestamps(datetime_col="datetime", freq="D")
tfm = tfm.fit_transform(df)
df
display(df)0], 10) test_eq(df.shape[
a | b | c | datetime | |
---|---|---|---|---|
0 | 0.211126 | 0.752468 | 0.051294 | 2020-01-01 |
2 | 0.394572 | 0.529941 | 0.161367 | 2020-01-03 |
3 | 0.571996 | 0.805432 | 0.760161 | 2020-01-04 |
5 | 0.361075 | 0.408456 | 0.679697 | 2020-01-06 |
6 | 0.056680 | 0.034673 | 0.391911 | 2020-01-07 |
8 | 0.259828 | 0.886086 | 0.895690 | 2020-01-09 |
9 | 0.297287 | 0.229994 | 0.411304 | 2020-01-10 |
datetime | a | b | c | |
---|---|---|---|---|
0 | 2020-01-01 | 0.211126 | 0.752468 | 0.051294 |
1 | 2020-01-02 | NaN | NaN | NaN |
2 | 2020-01-03 | 0.394572 | 0.529941 | 0.161367 |
3 | 2020-01-04 | 0.571996 | 0.805432 | 0.760161 |
4 | 2020-01-05 | NaN | NaN | NaN |
5 | 2020-01-06 | 0.361075 | 0.408456 | 0.679697 |
6 | 2020-01-07 | 0.056680 | 0.034673 | 0.391911 |
7 | 2020-01-08 | NaN | NaN | NaN |
8 | 2020-01-09 | 0.259828 | 0.886086 | 0.895690 |
9 | 2020-01-10 | 0.297287 | 0.229994 | 0.411304 |
# Test
# Filling dates between min and max dates for each value in groupby column
= pd.date_range('2021-05-01', '2021-05-07').values
dates = np.concatenate((dates, dates))
dates = np.zeros((len(dates), 4))
data 0] = dates
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))
data[:, 3] = np.random.rand(len(dates))
data[:, = ['date', 'id', 'feature1', 'feature2']
cols = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float, 'feature2': float})
date_df = date_df.drop([0,1,3,8,11,13]).reset_index(drop=True)
date_df_with_missing_dates
display(date_df_with_missing_dates)= TSAddMissingTimestamps(datetime_col="date", unique_id_cols="id", freq="D")
tfm = tfm.fit_transform(date_df_with_missing_dates.copy())
df display(df)
date | id | feature1 | feature2 | |
---|---|---|---|---|
0 | 2021-05-03 | 0 | 0.826065 | 0.793818 |
1 | 2021-05-05 | 0 | 0.824350 | 0.577807 |
2 | 2021-05-06 | 0 | 0.396992 | 0.866102 |
3 | 2021-05-07 | 0 | 0.156317 | 0.289440 |
4 | 2021-05-01 | 1 | 0.737951 | 0.467681 |
5 | 2021-05-03 | 1 | 0.671271 | 0.411190 |
6 | 2021-05-04 | 1 | 0.270644 | 0.427486 |
7 | 2021-05-06 | 1 | 0.992582 | 0.564232 |
date | id | feature1 | feature2 | |
---|---|---|---|---|
0 | 2021-05-03 | 0 | 0.826065 | 0.793818 |
1 | 2021-05-04 | 0 | NaN | NaN |
2 | 2021-05-05 | 0 | 0.824350 | 0.577807 |
3 | 2021-05-06 | 0 | 0.396992 | 0.866102 |
4 | 2021-05-07 | 0 | 0.156317 | 0.289440 |
5 | 2021-05-01 | 1 | 0.737951 | 0.467681 |
6 | 2021-05-02 | 1 | NaN | NaN |
7 | 2021-05-03 | 1 | 0.671271 | 0.411190 |
8 | 2021-05-04 | 1 | 0.270644 | 0.427486 |
9 | 2021-05-05 | 1 | NaN | NaN |
10 | 2021-05-06 | 1 | 0.992582 | 0.564232 |
# Test
display(date_df_with_missing_dates)= TSAddMissingTimestamps(datetime_col="date", unique_id_cols="id", freq="D", range_by_group=False)
tfm = tfm.fit_transform(date_df_with_missing_dates.copy())
df display(df)
date | id | feature1 | feature2 | |
---|---|---|---|---|
0 | 2021-05-03 | 0 | 0.826065 | 0.793818 |
1 | 2021-05-05 | 0 | 0.824350 | 0.577807 |
2 | 2021-05-06 | 0 | 0.396992 | 0.866102 |
3 | 2021-05-07 | 0 | 0.156317 | 0.289440 |
4 | 2021-05-01 | 1 | 0.737951 | 0.467681 |
5 | 2021-05-03 | 1 | 0.671271 | 0.411190 |
6 | 2021-05-04 | 1 | 0.270644 | 0.427486 |
7 | 2021-05-06 | 1 | 0.992582 | 0.564232 |
date | id | feature1 | feature2 | |
---|---|---|---|---|
0 | 2021-05-01 | 0 | NaN | NaN |
1 | 2021-05-02 | 0 | NaN | NaN |
2 | 2021-05-03 | 0 | 0.826065 | 0.793818 |
3 | 2021-05-04 | 0 | NaN | NaN |
4 | 2021-05-05 | 0 | 0.824350 | 0.577807 |
5 | 2021-05-06 | 0 | 0.396992 | 0.866102 |
6 | 2021-05-07 | 0 | 0.156317 | 0.289440 |
7 | 2021-05-01 | 1 | 0.737951 | 0.467681 |
8 | 2021-05-02 | 1 | NaN | NaN |
9 | 2021-05-03 | 1 | 0.671271 | 0.411190 |
10 | 2021-05-04 | 1 | 0.270644 | 0.427486 |
11 | 2021-05-05 | 1 | NaN | NaN |
12 | 2021-05-06 | 1 | 0.992582 | 0.564232 |
13 | 2021-05-07 | 1 | NaN | NaN |
TSDropDuplicates
TSDropDuplicates (datetime_col=None, use_index=False, unique_id_cols=None, keep='last', reset_index=False)
Drop rows with duplicated values in a set of columns, optionally including a datetime column or index
Type | Default | Details | |
---|---|---|---|
datetime_col | NoneType | None | (str or List[str], optional): Name(s) of column(s) containing datetime values. If None, the index is used if use_index=True. |
use_index | bool | False | (bool, optional): Whether to include the index in the set of columns for checking duplicates. Defaults to False. |
unique_id_cols | NoneType | None | (str or List[str], optional): Name(s) of column(s) to be included in the set of columns for checking duplicates. Defaults to None. |
keep | str | last | (str, optional): Which duplicated values to keep. Choose from {‘first’, ‘last’, False}. Defaults to ‘last’. |
reset_index | bool | False | (bool, optional): Whether to reset the index after dropping duplicates. Ignored if use_index=False. Defaults to False. |
# Test
= pd.DataFrame(np.random.rand(10,3), columns=["a", "b", "c"])
df "datetime"] = pd.date_range("2020-01-01", periods=10)
df['user_id'] = np.sort(np.random.randint(0, 2, 10))
df[= df.iloc[[0, 2, 2, 3, 5, 6, 6, 8, 9]]
df =True, inplace=True)
df.reset_index(drop
display(df)= TSDropDuplicates(datetime_col="datetime", unique_id_cols="a")
tfm = tfm.fit_transform(df)
df display(df)
a | b | c | datetime | user_id | |
---|---|---|---|---|---|
0 | 0.201528 | 0.934433 | 0.689088 | 2020-01-01 | 0 |
1 | 0.016200 | 0.818380 | 0.040139 | 2020-01-03 | 0 |
2 | 0.016200 | 0.818380 | 0.040139 | 2020-01-03 | 0 |
3 | 0.889913 | 0.991963 | 0.294067 | 2020-01-04 | 0 |
4 | 0.865562 | 0.102843 | 0.125955 | 2020-01-06 | 1 |
5 | 0.979152 | 0.673839 | 0.846887 | 2020-01-07 | 1 |
6 | 0.979152 | 0.673839 | 0.846887 | 2020-01-07 | 1 |
7 | 0.603150 | 0.682532 | 0.575359 | 2020-01-09 | 1 |
8 | 0.429062 | 0.275923 | 0.768581 | 2020-01-10 | 1 |
a | b | c | datetime | user_id | |
---|---|---|---|---|---|
0 | 0.201528 | 0.934433 | 0.689088 | 2020-01-01 | 0 |
2 | 0.016200 | 0.818380 | 0.040139 | 2020-01-03 | 0 |
3 | 0.889913 | 0.991963 | 0.294067 | 2020-01-04 | 0 |
4 | 0.865562 | 0.102843 | 0.125955 | 2020-01-06 | 1 |
6 | 0.979152 | 0.673839 | 0.846887 | 2020-01-07 | 1 |
7 | 0.603150 | 0.682532 | 0.575359 | 2020-01-09 | 1 |
8 | 0.429062 | 0.275923 | 0.768581 | 2020-01-10 | 1 |
TSFillMissing
TSFillMissing (columns=None, unique_id_cols=None, method='ffill', value=0)
Fill missing values in specified columns using the specified method and/or value.
Type | Default | Details | |
---|---|---|---|
columns | NoneType | None | (str or List[str], optional): Column name(s) to be transformed. If None, all columns are transformed. Defaults to None. |
unique_id_cols | NoneType | None | (str or List[str], optional): Col name(s) with unique ids for each row. If None, uses all rows at once. Defaults to None. |
method | str | ffill | (str, optional): The method to use for filling missing values, e.g. ‘ffill’, ‘bfill’. If None, value is used. Defaults to ‘ffill’. |
value | int | 0 | (scalar or dict or Series, optional): The value to use for filling missing values. If None, method is used. Defaults to 0. |
# Test
= pd.DataFrame(np.random.rand(20,3), columns=["a", "b", "c"])
df 20) > .5, 'a'] = np.nan
df.loc[np.random.rand("datetime"] = pd.date_range("2020-01-01", periods=20)
df['user_id'] = np.sort(np.random.randint(0, 2, 20))
df[= df.iloc[[0, 2, 2, 3, 5, 6, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]
df =True, inplace=True)
df.reset_index(drop
display(df)= TSFillMissing(columns="a", method="ffill", value=0)
tfm = tfm.fit_transform(df)
df
display(df)'a'].isna().sum(), 0) test_eq(df[
a | b | c | datetime | user_id | |
---|---|---|---|---|---|
0 | NaN | 0.059943 | 0.130974 | 2020-01-01 | 0 |
1 | 0.734151 | 0.341319 | 0.478528 | 2020-01-03 | 0 |
2 | 0.734151 | 0.341319 | 0.478528 | 2020-01-03 | 0 |
3 | 0.928860 | 0.331972 | 0.465337 | 2020-01-04 | 0 |
4 | NaN | 0.631375 | 0.426398 | 2020-01-06 | 0 |
5 | 0.548145 | 0.174647 | 0.295932 | 2020-01-07 | 0 |
6 | 0.548145 | 0.174647 | 0.295932 | 2020-01-07 | 0 |
7 | NaN | 0.576881 | 0.563920 | 2020-01-09 | 0 |
8 | 0.500279 | 0.069394 | 0.089877 | 2020-01-10 | 0 |
9 | 0.600912 | 0.340959 | 0.917268 | 2020-01-11 | 0 |
10 | 0.406591 | 0.143281 | 0.714719 | 2020-01-12 | 0 |
11 | NaN | 0.525470 | 0.697833 | 2020-01-13 | 1 |
12 | NaN | 0.792191 | 0.676361 | 2020-01-14 | 1 |
13 | NaN | 0.945925 | 0.295824 | 2020-01-15 | 1 |
14 | NaN | 0.271955 | 0.217891 | 2020-01-16 | 1 |
15 | NaN | 0.633712 | 0.593461 | 2020-01-17 | 1 |
16 | 0.016243 | 0.728778 | 0.323530 | 2020-01-18 | 1 |
17 | NaN | 0.556578 | 0.342731 | 2020-01-19 | 1 |
18 | 0.134576 | 0.094419 | 0.831518 | 2020-01-20 | 1 |
a | b | c | datetime | user_id | |
---|---|---|---|---|---|
0 | 0.000000 | 0.059943 | 0.130974 | 2020-01-01 | 0 |
1 | 0.734151 | 0.341319 | 0.478528 | 2020-01-03 | 0 |
2 | 0.734151 | 0.341319 | 0.478528 | 2020-01-03 | 0 |
3 | 0.928860 | 0.331972 | 0.465337 | 2020-01-04 | 0 |
4 | 0.928860 | 0.631375 | 0.426398 | 2020-01-06 | 0 |
5 | 0.548145 | 0.174647 | 0.295932 | 2020-01-07 | 0 |
6 | 0.548145 | 0.174647 | 0.295932 | 2020-01-07 | 0 |
7 | 0.548145 | 0.576881 | 0.563920 | 2020-01-09 | 0 |
8 | 0.500279 | 0.069394 | 0.089877 | 2020-01-10 | 0 |
9 | 0.600912 | 0.340959 | 0.917268 | 2020-01-11 | 0 |
10 | 0.406591 | 0.143281 | 0.714719 | 2020-01-12 | 0 |
11 | 0.406591 | 0.525470 | 0.697833 | 2020-01-13 | 1 |
12 | 0.406591 | 0.792191 | 0.676361 | 2020-01-14 | 1 |
13 | 0.406591 | 0.945925 | 0.295824 | 2020-01-15 | 1 |
14 | 0.406591 | 0.271955 | 0.217891 | 2020-01-16 | 1 |
15 | 0.406591 | 0.633712 | 0.593461 | 2020-01-17 | 1 |
16 | 0.016243 | 0.728778 | 0.323530 | 2020-01-18 | 1 |
17 | 0.016243 | 0.556578 | 0.342731 | 2020-01-19 | 1 |
18 | 0.134576 | 0.094419 | 0.831518 | 2020-01-20 | 1 |
TSMissingnessEncoder
TSMissingnessEncoder (columns=None)
*Base class for all estimators in scikit-learn.

Inheriting from this class provides default implementations of:

- setting and getting parameters used by `GridSearchCV` and friends;
- textual and HTML representation displayed in terminals and IDEs;
- estimator serialization;
- parameters validation;
- data validation;
- feature names validation.

Read more in the :ref:`User Guide <rolling_your_own_estimator>`.*
# Test
= pd.DataFrame(np.random.rand(20,3), columns=["a", "b", "c"])
df 20) > .5, 'a'] = np.nan
df.loc[np.random.rand("datetime"] = pd.date_range("2020-01-01", periods=20)
df['user_id'] = np.sort(np.random.randint(0, 2, 20))
df[= df.iloc[[0, 2, 2, 3, 5, 6, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]
df =True, inplace=True)
df.reset_index(drop
display(df)= TSMissingnessEncoder(columns="a")
tfm = tfm.fit_transform(df)
df display(df)
a | b | c | datetime | user_id | |
---|---|---|---|---|---|
0 | 0.873619 | 0.995569 | 0.582714 | 2020-01-01 | 0 |
1 | 0.402704 | 0.672507 | 0.682192 | 2020-01-03 | 0 |
2 | 0.402704 | 0.672507 | 0.682192 | 2020-01-03 | 0 |
3 | NaN | 0.133210 | 0.632396 | 2020-01-04 | 0 |
4 | 0.700611 | 0.753472 | 0.872859 | 2020-01-06 | 0 |
5 | NaN | 0.730249 | 0.619173 | 2020-01-07 | 0 |
6 | NaN | 0.730249 | 0.619173 | 2020-01-07 | 0 |
7 | NaN | 0.617106 | 0.849959 | 2020-01-09 | 0 |
8 | 0.196246 | 0.125550 | 0.963480 | 2020-01-10 | 1 |
9 | 0.108045 | 0.478491 | 0.585564 | 2020-01-11 | 1 |
10 | NaN | 0.086032 | 0.057027 | 2020-01-12 | 1 |
11 | 0.105483 | 0.585588 | 0.544345 | 2020-01-13 | 1 |
12 | 0.233741 | 0.637774 | 0.820068 | 2020-01-14 | 1 |
13 | NaN | 0.498130 | 0.689310 | 2020-01-15 | 1 |
14 | NaN | 0.307771 | 0.613638 | 2020-01-16 | 1 |
15 | 0.897935 | 0.809924 | 0.583130 | 2020-01-17 | 1 |
16 | 0.730222 | 0.364822 | 0.640966 | 2020-01-18 | 1 |
17 | 0.466182 | 0.189936 | 0.701738 | 2020-01-19 | 1 |
18 | NaN | 0.358622 | 0.911339 | 2020-01-20 | 1 |
a | b | c | datetime | user_id | a_missing | |
---|---|---|---|---|---|---|
0 | 0.873619 | 0.995569 | 0.582714 | 2020-01-01 | 0 | 0 |
1 | 0.402704 | 0.672507 | 0.682192 | 2020-01-03 | 0 | 0 |
2 | 0.402704 | 0.672507 | 0.682192 | 2020-01-03 | 0 | 0 |
3 | NaN | 0.133210 | 0.632396 | 2020-01-04 | 0 | 1 |
4 | 0.700611 | 0.753472 | 0.872859 | 2020-01-06 | 0 | 0 |
5 | NaN | 0.730249 | 0.619173 | 2020-01-07 | 0 | 1 |
6 | NaN | 0.730249 | 0.619173 | 2020-01-07 | 0 | 1 |
7 | NaN | 0.617106 | 0.849959 | 2020-01-09 | 0 | 1 |
8 | 0.196246 | 0.125550 | 0.963480 | 2020-01-10 | 1 | 0 |
9 | 0.108045 | 0.478491 | 0.585564 | 2020-01-11 | 1 | 0 |
10 | NaN | 0.086032 | 0.057027 | 2020-01-12 | 1 | 1 |
11 | 0.105483 | 0.585588 | 0.544345 | 2020-01-13 | 1 | 0 |
12 | 0.233741 | 0.637774 | 0.820068 | 2020-01-14 | 1 | 0 |
13 | NaN | 0.498130 | 0.689310 | 2020-01-15 | 1 | 1 |
14 | NaN | 0.307771 | 0.613638 | 2020-01-16 | 1 | 1 |
15 | 0.897935 | 0.809924 | 0.583130 | 2020-01-17 | 1 | 0 |
16 | 0.730222 | 0.364822 | 0.640966 | 2020-01-18 | 1 | 0 |
17 | 0.466182 | 0.189936 | 0.701738 | 2020-01-19 | 1 | 0 |
18 | NaN | 0.358622 | 0.911339 | 2020-01-20 | 1 | 1 |
With these sklearn preprocessing API transforms it’s possible to build data preprocessing pipelines like this one:
from sklearn.pipeline import Pipeline
cont_cols = ['cont_0', 'cont_1', 'cont_2', 'cont_3', 'cont_4', 'cont_5']
pipe = Pipeline([
    ('shrinker', TSShrinkDataFrame()),
    ('drop_duplicates', TSDropDuplicates('date', unique_id_cols='user_id')),
    ('add_mts', TSAddMissingTimestamps(datetime_col='date', unique_id_cols='user_id', freq='D', range_by_group=False)),
    ('onehot_encoder', TSOneHotEncoder(['cat_0'])),
    ('cat_encoder', TSCategoricalEncoder(['user_id', 'cat_1'])),
    ('steps_since_start', TSStepsSinceStart('date', datetime_unit='D', start_datetime='2017-01-01', dtype=np.int32)),
    ('missing_encoder', TSMissingnessEncoder(['cont_1'])),
    ('fill_missing', TSFillMissing(cont_cols, unique_id_cols='user_id', value=0)),
    ],
    verbose=True)
df = pipe.fit_transform(df)
df
y transforms
Preprocessor
Preprocessor (preprocessor, **kwargs)
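Wraps a preprocessor class (e.g. `StandardScaler`), instantiated with `**kwargs`, so that it can be fitted on part of the target and then used to transform and inverse-transform it, as shown in the examples below.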
# Standardize
from tsai.data.validation import TimeSplitter
y = random_shuffle(np.random.randn(1000) * 10 + 5)
splits = TimeSplitter()(y)
preprocessor = Preprocessor(StandardScaler)
preprocessor.fit(y[splits[0]])
y_tfm = preprocessor.transform(y)
test_close(preprocessor.inverse_transform(y_tfm), y)
plt.hist(y, 50, label='ori',)
plt.hist(y_tfm, 50, label='tfm')
plt.legend(loc='best')
plt.show()
# RobustScaler
y = random_shuffle(np.random.randn(1000) * 10 + 5)
splits = TimeSplitter()(y)
preprocessor = Preprocessor(RobustScaler)
preprocessor.fit(y[splits[0]])
y_tfm = preprocessor.transform(y)
test_close(preprocessor.inverse_transform(y_tfm), y)
plt.hist(y, 50, label='ori',)
plt.hist(y_tfm, 50, label='tfm')
plt.legend(loc='best')
plt.show()
# Normalize
y = random_shuffle(np.random.rand(1000) * 3 + .5)
splits = TimeSplitter()(y)
preprocessor = Preprocessor(Normalizer)
preprocessor.fit(y[splits[0]])
y_tfm = preprocessor.transform(y)
test_close(preprocessor.inverse_transform(y_tfm), y)
plt.hist(y, 50, label='ori',)
plt.hist(y_tfm, 50, label='tfm')
plt.legend(loc='best')
plt.show()
# BoxCox
y = random_shuffle(np.random.rand(1000) * 10 + 5)
splits = TimeSplitter()(y)
preprocessor = Preprocessor(BoxCox)
preprocessor.fit(y[splits[0]])
y_tfm = preprocessor.transform(y)
test_close(preprocessor.inverse_transform(y_tfm), y)
plt.hist(y, 50, label='ori',)
plt.hist(y_tfm, 50, label='tfm')
plt.legend(loc='best')
plt.show()
# YeoJohnshon
y = random_shuffle(np.random.randn(1000) * 10 + 5)
y = np.random.beta(.5, .5, size=1000)
splits = TimeSplitter()(y)
preprocessor = Preprocessor(YeoJohnshon)
preprocessor.fit(y[splits[0]])
y_tfm = preprocessor.transform(y)
test_close(preprocessor.inverse_transform(y_tfm), y)
plt.hist(y, 50, label='ori',)
plt.hist(y_tfm, 50, label='tfm')
plt.legend(loc='best')
plt.show()
# QuantileTransformer
y = - np.random.beta(1, .5, 10000) * 10
splits = TimeSplitter()(y)
preprocessor = Preprocessor(Quantile)
preprocessor.fit(y[splits[0]])
plt.hist(y, 50, label='ori',)
y_tfm = preprocessor.transform(y)
plt.legend(loc='best')
plt.show()
plt.hist(y_tfm, 50, label='tfm')
plt.legend(loc='best')
plt.show()
test_close(preprocessor.inverse_transform(y_tfm), y, 1e-1)
ReLabeler
ReLabeler (cm)
Changes the labels in a dataset based on a dictionary (class mapping). Args: cm = class mapping dictionary.
vals = {0:'a', 1:'b', 2:'c', 3:'d', 4:'e'}
y = np.array([vals[i] for i in np.random.randint(0, 5, 20)])
labeler = ReLabeler(dict(a='x', b='x', c='y', d='z', e='z'))
y_new = labeler(y)
test_eq(y.shape, y_new.shape)
y, y_new
(array(['d', 'd', 'a', 'd', 'b', 'e', 'a', 'd', 'b', 'c', 'b', 'e', 'b',
'b', 'a', 'e', 'd', 'e', 'c', 'e'], dtype='<U1'),
array(['z', 'z', 'x', 'z', 'x', 'z', 'x', 'z', 'x', 'y', 'x', 'z', 'x',
'x', 'x', 'z', 'z', 'z', 'y', 'z'], dtype='<U1'))