Splitting data
Functions required to perform cross-validation and to transform a unique time series sequence into multiple samples ready to be used by a time series model.
RandomSplitter
RandomSplitter (valid_pct=0.2, seed=None)
Create function that splits items between train/val with valid_pct randomly.
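For example, a minimal sketch (the seed and sizes are illustrative): applied to a collection, the returned function yields a (train, valid) pair of index lists.

```python
# minimal sketch: RandomSplitter returns a function that maps a collection
# to (train_idxs, valid_idxs); the seed here is illustrative
splits = RandomSplitter(valid_pct=0.2, seed=42)(np.arange(100))
test_eq((len(splits[0]), len(splits[1])), (80, 20))
```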
balance_idx
balance_idx (o, shuffle=False, strategy='oversample', random_state=None, verbose=False)
leakage_finder
leakage_finder (*splits, verbose=True)
You can pass splits as a tuple, or train, valid, …
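A small sketch of the intended use, assuming leakage_finder raises an assertion error when splits share indices:

```python
# minimal sketch: indices 75-79 appear in both splits, so this should fail
train_idxs, valid_idxs = np.arange(80), np.arange(75, 100)
test_fail(leakage_finder, args=(train_idxs, valid_idxs))
```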
check_splits_overlap
check_splits_overlap (splits)
check_overlap
check_overlap (a, b, c=None)
Checks if there’s overlap between array-like objects
a = np.arange(10)
b = np.arange(10, 20)
test_eq(check_overlap(a, b), False)

a = np.arange(10)
b = np.arange(9, 20)
test_eq(check_overlap(a, b), [9])

a = np.arange(10)
b = np.arange(10, 20)
c = np.arange(20, 30)
test_eq(check_overlap(a, b, c), False)

a = np.arange(10)
b = np.arange(10, 20)
c = np.arange(10, 30)
test_eq(check_overlap(a, b, c), ([], [], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]))
y = np.concatenate([[i] * np.random.randint(10, 100) for i in range(5)])
train_split = np.random.choice(len(y), int(len(y) * .8), False)
c, v = np.unique(y[train_split], return_counts=True)
print(f"{'imbalanced:':25} {c} {v}")
oversampled_train_split = train_split[balance_idx(y[train_split], strategy="oversample")]
osc, osv = np.unique(y[oversampled_train_split], return_counts=True)
print(f"{'balanced (oversample):':25} {osc} {osv}")
test_eq(osv, [max(v)] * len(v))
undersampled_train_split = train_split[balance_idx(y[train_split], strategy="undersample")]
usc, usv = np.unique(y[undersampled_train_split], return_counts=True)
print(f"{'balanced (undersample):':25} {usc} {usv}")
test_eq(usv, [min(v)] * len(v))
imbalanced: [0 1 2 3 4] [24 43 64 41 8]
balanced (oversample): [0 1 2 3 4] [64 64 64 64 64]
balanced (undersample): [0 1 2 3 4] [8 8 8 8 8]
l = L(list(concat(np.zeros(5), np.ones(10)).astype(int)))
balanced_idx = balance_idx(l)
test_eq(np.mean(l[balanced_idx]), 0.5)
test_eq(isinstance(balanced_idx, L), True)

l = list(concat(np.zeros(5), np.ones(10)).astype(int))
balanced_idx = balance_idx(l)
test_eq(np.mean(L(l)[balanced_idx]), 0.5)
test_eq(isinstance(balanced_idx, L), True)

a = concat(np.zeros(5), np.ones(10)).astype(int)
balanced_idx = balance_idx(a)
test_eq(np.mean(a[balanced_idx]), 0.5)
test_eq(isinstance(balanced_idx, L), True)

t = concat(torch.zeros(5), torch.ones(10))
balanced_idx = balance_idx(t, shuffle=True)
test_eq(t[balanced_idx].mean(), 0.5)
test_eq(isinstance(balanced_idx, L), True)
a, b = np.arange(100_000), np.arange(100_000, 200_000)
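The following snippet shows how balance_idx can also be applied to pseudolabels in a semi-supervised workflow: low-confidence pseudolabels are optionally filtered out, and the remaining ones are balanced before being used in a split.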
soft_labels = True
filter_pseudolabels = .5
balanced_pseudolabels = True

pseudolabels = torch.rand(1000, 3)
pseudolabels = torch.softmax(pseudolabels, -1) if soft_labels else torch.argmax(pseudolabels, -1)
hpl = torch.argmax(pseudolabels, -1) if soft_labels else pseudolabels

if filter_pseudolabels and pseudolabels.ndim > 1:
    error = 1 - pseudolabels.max(-1).values
    filt_pl_idx = np.arange(len(error))[error < filter_pseudolabels]
    filt_pl = pseudolabels[error < filter_pseudolabels]
    assert len(filt_pl) > 0, 'no filtered pseudolabels'
    filt_hpl = torch.argmax(filt_pl, -1)
else:
    filt_pl_idx = np.arange(len(pseudolabels))
    filt_pl = filt_hpl = pseudolabels

pl_split = filt_pl_idx[balance_idx(filt_hpl)] if balanced_pseudolabels else filt_pl_idx
test_eq(hpl[pl_split].float().mean(), np.mean(np.unique(hpl)))
TrainValidTestSplitter
TrainValidTestSplitter (n_splits:int=1, valid_size:Union[float,int]=0.2, test_size:Union[float,int]=0.0, train_only:bool=False, stratify:bool=True, balance:bool=False, strategy:str='oversample', shuffle:bool=True, random_state:Optional[int]=None, verbose:bool=False, **kwargs)
Split items into random train, valid (and optional test) subsets.
plot_splits
plot_splits (splits)
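A minimal sketch (the splits used are arbitrary):

```python
# minimal sketch: build some splits and visualize their distribution
splits = get_splits(np.random.randint(0, 3, 1000), valid_size=.2, test_size=.2, show_plot=False)
plot_splits(splits)
```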
get_splits
get_splits (o, n_splits:int=1, valid_size:float=0.2, test_size:float=0.0, train_only:bool=False, train_size:Union[NoneType,float,int]=None, balance:bool=False, strategy:str='oversample', shuffle:bool=True, stratify:bool=True, check_splits:bool=True, random_state:Optional[int]=None, show_plot:bool=True, verbose:bool=False)
Arguments:
- o : object to which splits will be applied, usually the target.
- n_splits : number of folds. Must be an int >= 1.
- valid_size : size of the validation set. Only used if n_splits = 1. If n_splits > 1, valid_size = (1. - test_size) / n_splits.
- test_size : size of the test set. Default = 0.
- train_only : if True, valid set == train set. This may be useful for debugging purposes.
- train_size : size of the train set used. Default = None (the remainder after assigning both valid and test). Useful to get learning curves with different train sizes, or to get a small batch to debug a neural net.
- balance : whether to balance data so that train always contains the same number of items per class.
- strategy : strategy used to balance data ("undersample" or "oversample"). Default = "oversample".
- shuffle : whether to shuffle data before splitting into batches. Note that the samples within each split will be shuffled.
- stratify : whether to create folds preserving the percentage of samples for each class.
- check_splits : whether to perform leakage and completion checks.
- random_state : when shuffle is True, random_state affects the ordering of the indices. Pass an int for reproducible output.
- show_plot : plot the split distribution.
n_splits = 5
valid_size = 0.2
test_size = 0.2
train_only = False  # set to True for debugging (valid = train)
train_size = 5000
stratify = True
balance = False
shuffle = True
predefined_splits = None
show_plot = True

check_splits = True
random_state = 23

y = np.random.randint(0, 3, 10000) + 100

splits = get_splits(y, n_splits=n_splits, valid_size=valid_size, test_size=test_size, shuffle=shuffle, balance=balance, stratify=stratify,
                    train_only=train_only, train_size=train_size, check_splits=check_splits, random_state=random_state, show_plot=show_plot, verbose=True)
splits
(((#5000) [3490,2428,4475,8317,2802,6834,2954,7671,3383,9554...],
(#1600) [1680,6677,5879,4428,5511,8312,372,5127,7012,3021...],
(#2000) [1263,6498,1602,1838,1073,5304,1210,1037,8789,6175...]),
((#5000) [3442,4237,470,3901,3808,3793,6286,8546,6254,9530...],
(#1600) [9160,5451,3628,143,2054,7225,7124,8057,1405,5089...],
(#2000) [1263,6498,1602,1838,1073,5304,1210,1037,8789,6175...]),
((#5000) [9850,7451,7338,9742,3258,1527,4450,5678,2932,1693...],
(#1600) [6186,5970,376,7848,3786,1663,7193,3647,3277,553...],
(#2000) [1263,6498,1602,1838,1073,5304,1210,1037,8789,6175...]),
((#5000) [1853,7308,7375,3851,1852,3820,2601,3868,8718,7190...],
(#1600) [4182,6419,6265,4837,168,9627,2500,9951,1610,7547...],
(#2000) [1263,6498,1602,1838,1073,5304,1210,1037,8789,6175...]),
((#5000) [7878,6392,453,4817,4676,5738,6482,4033,8114,7337...],
(#1600) [7682,6416,2877,9164,1583,342,2916,4806,8776,2046...],
(#2000) [1263,6498,1602,1838,1073,5304,1210,1037,8789,6175...]))
train_size = 256
y = np.random.randint(0, 3, 1000) + 100
splits = get_splits(y, train_size=train_size, train_only=True)
test_eq(splits[0], splits[1])
test_eq(len(splits[0]), train_size)
splits
valid == train
((#256) [550,813,388,595,948,198,354,749,175,812...],
(#256) [550,813,388,595,948,198,354,749,175,812...])
get_walk_forward_splits
get_walk_forward_splits (o, n_splits=1, train_size=None, valid_size=0.2, test_size=0.0, anchored=False, gap=0.0, test_after_valid=True, random_state=None, show_plot=True)
| | Type | Default | Details |
|---|---|---|---|
| o | | | 3D object with shape [samples x features x steps] containing the time series we need to split |
| n_splits | int | 1 | # of splits |
| train_size | NoneType | None | optional: training set size as an int or a float. None when using an anchored strategy. |
| valid_size | float | 0.2 | validation set size as an int or a float |
| test_size | float | 0.0 | test set size as an int or a float |
| anchored | bool | False | starting point for train set remains the same for all splits |
| gap | float | 0.0 | # of samples to exclude from the end of each train set before the validation set. Entered as an int or a float |
| test_after_valid | bool | True | flag to indicate if validation and test will be sampled randomly or sequentially |
| random_state | NoneType | None | integer that can be used to generate reproducible results |
| show_plot | bool | True | plots the splits created |
o = np.random.rand(10_000, 3, 50)  # shape: [samples x features x steps]

splits = get_walk_forward_splits(
    o,
    n_splits=4,
    train_size=.6,
    valid_size=0.1,
    test_size=0.1,
    anchored=True,
    gap=100,
    test_after_valid=True,
    random_state=None,
    show_plot=True,
)

splits = get_walk_forward_splits(
    o,
    n_splits=3,
    train_size=0.3,
    valid_size=0.1,
    test_size=0.1,
    anchored=False,
    gap=0.,
    test_after_valid=False,
    random_state=None,
    show_plot=True,
)
TSSplitter
TSSplitter (valid_size=0.2, test_size=0.0, fcst_horizon=0, show_plot=True)
Create function that splits items between train/val with valid_size without shuffling data.
| | Type | Default | Details |
|---|---|---|---|
| valid_size | float | 0.2 | int or float indicating the validation set size |
| test_size | float | 0.0 | int or float indicating the test set size |
| fcst_horizon | int | 0 | int that indicates the number of time steps removed at the end of train (and validation) |
| show_plot | bool | True | flag that indicates if a plot showing the splits will be created |
y = np.arange(1000) + 100
test_eq(TimeSplitter(valid_size=0.2)(y)[1], L(np.arange(800, 1000).tolist()))
test_eq(TimeSplitter(valid_size=0.2)(y)[0], TimeSplitter(valid_size=200)(y)[0])
TimeSplitter(valid_size=0.2, show_plot=True)(y)
((#800) [0,1,2,3,4,5,6,7,8,9...],
(#200) [800,801,802,803,804,805,806,807,808,809...])
n_splits = 5
valid_size = 0.2
test_size = 0
train_only = False  # set to True for debugging (valid = train)
train_size = None
stratify = True
balance = True
shuffle = True
predefined_splits = None
show_plot = True

check_splits = True
random_state = 23

splits = get_splits(y, n_splits=n_splits, valid_size=valid_size, test_size=test_size, shuffle=shuffle, balance=balance, stratify=stratify,
                    train_only=train_only, train_size=train_size, check_splits=check_splits, random_state=random_state, show_plot=show_plot, verbose=True)
split = splits[0] if n_splits == 1 else splits[0][0]
y[split].mean(), split
stratify set to False as n_splits=5 cannot be greater than the min number of members in each class (1).
list([splits[0], splits[1], splits[2], splits[3], splits[4]])
[((#800) [314,194,782,789,502,917,137,415,904,181...],
(#200) [362,151,934,378,95,597,500,117,980,844...]),
((#800) [312,198,777,788,515,910,145,413,898,186...],
(#200) [352,133,955,396,64,596,442,79,991,882...]),
((#800) [311,197,783,791,507,922,145,416,908,184...],
(#200) [338,125,912,361,54,594,486,88,994,859...]),
((#800) [296,181,782,789,493,917,130,401,905,165...],
(#200) [405,199,953,444,113,610,515,137,997,881...]),
((#800) [320,190,782,788,506,906,141,412,893,178...],
(#200) [336,149,942,358,49,582,472,70,990,907...])]
n_splits = 5
valid_size = 0.
test_size = 0.
shuffle = True
stratify = True
train_only = True
train_size = None
check_splits = True
random_state = 1
show_plot = True

splits = get_splits(y, n_splits=n_splits, valid_size=valid_size, test_size=test_size, shuffle=shuffle, stratify=stratify,
                    train_only=train_only, train_size=train_size, check_splits=check_splits, random_state=random_state, show_plot=show_plot, verbose=True)
for split in splits:
    test_eq(len(split[0]), len(y))
    test_eq(np.sort(split[0]), np.arange(len(y)))
stratify set to False as n_splits=5 cannot be greater than the min number of members in each class (1).
valid == train
n_splits = 5
y = np.random.randint(0, 2, 1000)

splits = get_splits(y, n_splits=n_splits, shuffle=False, check_splits=True)
test_eq(np.concatenate((L(zip(*splits))[1])), np.arange(len(y)))

splits = get_splits(y, n_splits=n_splits, shuffle=True, check_splits=True)
test_eq(np.sort(np.concatenate((L(zip(*splits))[1]))), np.arange(len(y)))
n_splits = 2
y = np.random.randint(0, 2, 1000)

splits = get_splits(y, n_splits=n_splits, test_size=0.2, shuffle=False)
for i in range(n_splits): leakage_finder(*splits[i])
test_eq(len(splits), n_splits)
test_eq(len(splits[0]), 3)
s = []
[s.extend(split) for split in splits[0]]
test_eq(np.sort(s), np.arange(len(y)))
s = []
[s.extend(split) for split in splits[1]]
test_eq(np.sort(s), np.arange(len(y)))
y = np.random.randint(0, 2, 1000)
splits1 = get_splits(y, valid_size=.25, test_size=0, random_state=23, stratify=True, shuffle=True)
splits2 = get_splits(y, valid_size=.25, test_size=0, random_state=23, stratify=True, shuffle=True)
splits3 = get_splits(y, valid_size=.25, test_size=0, random_state=None, stratify=True, shuffle=True)
splits4 = get_splits(y, valid_size=.25, test_size=0, random_state=None, stratify=True, shuffle=True)
test_eq(splits1[0], splits2[0])
test_ne(splits3[0], splits4[0])
y = np.random.randint(0, 2, 100)
splits = get_splits(y, valid_size=.25, test_size=0, random_state=23, stratify=True, shuffle=True)
test_eq(len(splits), 2)
y = np.random.randint(0, 2, 100)
splits = get_splits(y, valid_size=.25, test_size=0, random_state=23, stratify=True)
test_eq(len(splits), 2)
y = np.random.randint(0, 2, 100)
splits = get_splits(y, valid_size=.25, test_size=20, random_state=23, stratify=True)
test_eq(len(splits), 3)
leakage_finder(*splits)
splits = TrainValidTestSplitter(valid_size=.25, test_size=20, random_state=23, stratify=True)(np.random.randint(0, 2, 100))
test_eq(len(splits[1]), 25)
test_eq(len(splits[2]), 20)
o = np.random.randint(0, 2, 1000)
for p in [1, .75, .5, .25, .125]:
    splits = get_splits(o, train_size=p)
    test_eq(len(splits[0]), len(o) * .8 * p)
y = L([0] * 50 + [1] * 25 + [2] * 15 + [3] * 10)
splits = get_splits(y, valid_size=.2, test_size=.2)
test_eq(np.mean(y[splits[0]])==np.mean(y[splits[1]])==np.mean(y[splits[2]]), True)
splits
((#60) [58,95,53,44,28,69,9,12,22,88...],
(#20) [89,71,60,4,19,37,75,13,46,30...],
(#20) [76,68,74,29,16,97,14,21,90,82...])
y = L([0] * 50 + [1] * 25 + [2] * 15 + [3] * 10)
splits = get_splits(y, n_splits=1, valid_size=.2, test_size=.2, shuffle=False)
# test_eq(splits[0] + splits[1] + splits[2], np.arange(100))
splits
((#60) [0,1,2,3,4,5,6,7,8,9...],
(#20) [60,61,62,63,64,65,66,67,68,69...],
(#20) [80,81,82,83,84,85,86,87,88,89...])
splits = get_splits(np.random.randint(0,5,100), valid_size=0.213, test_size=17)
test_eq(len(splits[1]), 21)
test_eq(len(splits[2]), 17)

splits = get_splits(np.random.randint(0,5,100), valid_size=0.213, test_size=17, train_size=.2)
splits
((#12) [37,38,62,60,16,22,95,44,94,98...],
(#21) [88,93,5,31,57,23,90,18,15,40...],
(#17) [4,86,47,33,59,52,99,48,70,3...])
combine_split_data
combine_split_data (xs, ys=None)
xs is a list with X_train, X_valid, …. ys is None or a list with y_train, y_valid, ….
get_predefined_splits
get_predefined_splits (*xs)
xs is a list with X_train, X_valid, …
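A minimal sketch (array shapes are illustrative): given pre-split arrays, it returns the index splits that put them back together.

```python
# minimal sketch: recover index splits from pre-split arrays
X_train, X_valid = np.random.rand(8, 3, 4), np.random.rand(2, 3, 4)
splits = get_predefined_splits(X_train, X_valid)
test_eq(len(splits[0]), len(X_train))
test_eq(len(splits[1]), len(X_valid))
```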
get_splits_len
get_splits_len (splits)
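Presumably this returns the number of indices per split; a small sketch under that assumption:

```python
# minimal sketch: number of indices in each split (expected sizes are an assumption)
splits = get_splits(np.random.randint(0, 2, 100), valid_size=.2, test_size=.2, show_plot=False)
get_splits_len(splits)  # e.g. [60, 20, 20]
```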
X_train, y_train, X_valid, y_valid = np.random.rand(3,3,4), np.random.randint(0,2,3), np.random.rand(2,3,4), np.random.randint(0,2,2)
X, y, splits = combine_split_data([X_train, X_valid], [y_train, y_valid])
test_eq(X_train, X[splits[0]])
test_eq(X_valid, X[splits[1]])
test_type(X_train, X)
test_type(y_train, y)
X_train, y_train, X_valid, y_valid = np.random.rand(3,4), np.random.randint(0,2,3), np.random.rand(2,4), np.random.randint(0,2,2)
X, y, splits = combine_split_data([X_train, X_valid], [y_train, y_valid])
test_eq(X_train[:, None], X[splits[0]])
test_eq(X_valid[:, None], X[splits[1]])
test_type(X_train, X)
test_type(y_train, y)
Forecasting
get_df_usable_idxs
get_df_usable_idxs (df, fcst_history, fcst_horizon, stride=1, unique_id_cols=None, return_np_indices=False)
Calculates the indices that can be used from a df when using a sliding window
| | Type | Default | Details |
|---|---|---|---|
| df | | | dataframe containing a sorted time series |
| fcst_history | | | # historical steps used as input (size of the sliding window for the input) |
| fcst_horizon | | | # steps forecasted into the future (size of the sliding window for the target) |
| stride | int | 1 | int or tuple of 2 int containing the strides of the sliding windows (input and target) |
| unique_id_cols | NoneType | None | str indicating the column/s with the unique identifier/s for each entity |
| return_np_indices | bool | False | bool indicating what type of indices are returned. Defaults to False (dataframe indices) |
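A small sketch with a hypothetical toy dataframe: with fcst_history=3 and fcst_horizon=1 over a 10-step series, 7 full sliding windows fit, so 7 usable indices are expected (assuming one index per full window).

```python
# minimal sketch: one usable index per full sliding window (assumed behavior)
df = pd.DataFrame({'value': np.arange(10)})
idxs = get_df_usable_idxs(df, fcst_history=3, fcst_horizon=1, stride=1)
test_eq(len(idxs), 10 - (3 + 1) + 1)  # 7 windows
```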
get_usable_idxs
get_usable_idxs (df, fcst_history, fcst_horizon, stride=1)
calculate_fcst_stats
calculate_fcst_stats (df, fcst_history, fcst_horizon, splits, x_vars=None, y_vars=None, subset_size=None)
Calculates the training stats required in a forecasting task
| | Type | Default | Details |
|---|---|---|---|
| df | | | dataframe containing a sorted time series for a single entity or subject |
| fcst_history | | | # historical steps used as input. |
| fcst_horizon | | | # steps forecasted into the future. |
| splits | | | splits that will be used to train the model. splits[0] is the train split. |
| x_vars | NoneType | None | features used as input |
| y_vars | NoneType | None | features used as output |
| subset_size | NoneType | None | int or float to determine the number of train samples used to calculate the mean and std |
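A sketch of the expected call pattern; the toy dataframe is hypothetical, and the assumption that the returned stats are the train mean and std follows from the subset_size description above:

```python
# sketch under assumptions: stats computed on the train split of a toy series
df = pd.DataFrame({'value': np.random.rand(200)})
df['datetime'] = pd.date_range('1749-03-31', periods=200, freq='D')
splits = get_forecasting_splits(df, fcst_history=10, fcst_horizon=5, datetime_col='datetime',
                                valid_size=0.1, test_size=0.2, show_plot=False)
stats = calculate_fcst_stats(df, fcst_history=10, fcst_horizon=5, splits=splits,
                             x_vars=['value'], y_vars=['value'])  # assumed return: train mean and std
```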
get_forecasting_splits
get_forecasting_splits (df, fcst_history, fcst_horizon, stride=1, valid_size=0.0, test_size=0.2, valid_cutoff_datetime=None, test_cutoff_datetime=None, datetime_col=None, use_index=False, unique_id_cols=None, show_plot=True)
| | Type | Default | Details |
|---|---|---|---|
| df | | | dataframe containing a sorted time series |
| fcst_history | | | # historical steps used as input (size of the sliding window for the input) |
| fcst_horizon | | | # steps forecasted into the future (size of the sliding window for the target) |
| stride | int | 1 | int or tuple of 2 int containing the strides of the sliding windows (input and target) |
| valid_size | float | 0.0 | int or float indicating the size of the validation set (based on datetimes) |
| test_size | float | 0.2 | int or float indicating the size of the test set (based on datetimes) |
| valid_cutoff_datetime | NoneType | None | first prediction datetime of validation dataset |
| test_cutoff_datetime | NoneType | None | first prediction datetime of test dataset |
| datetime_col | NoneType | None | str indicating the column with the datetime values |
| use_index | bool | False | flag to indicate if the datetime is in the index |
| unique_id_cols | NoneType | None | str indicating the column/s with the unique identifier/s for each entity |
| show_plot | bool | True | flag to indicate if splits should be plotted |
df1_len = 100
df2_len = 80

datetime_col = 'datetime'
df1 = pd.DataFrame(np.arange(df1_len), columns=['value'])
df1['datetime'] = pd.date_range(pd.to_datetime('1749-03-31'), periods=df1_len, freq='1D')
df1['type'] = 1

df = df1

display(df)

# settings
fcst_history = 10
fcst_horizon = 1
stride = 1
unique_id_cols = 'type'
datetime_col = 'datetime'
use_index = False
valid_size = 0.1  # a percent (float) or a number of samples (int) - .1 means 10% of the dates
test_size = 0.2  # a percent (float) or a number of samples (int) - .2 means 20% of the dates
valid_cutoff_datetime = '1749-08-21'  # first prediction datetime of validation dataset
test_cutoff_datetime = '1749-12-24'  # first prediction datetime of test dataset
valid_cutoff_datetime = None  # datetime compatible with the datetime_col containing the starting date for the validation dataset
test_cutoff_datetime = None  # datetime compatible with the datetime_col containing the starting date for the test dataset

splits = get_forecasting_splits(df, fcst_history=fcst_history, fcst_horizon=fcst_horizon, stride=stride,
                                unique_id_cols=unique_id_cols, datetime_col=datetime_col, use_index=use_index,
                                valid_size=valid_size, test_size=test_size,
                                valid_cutoff_datetime=valid_cutoff_datetime, test_cutoff_datetime=test_cutoff_datetime)

print(f"splits size : {[len(s) for s in splits]} ({sum([len(s) for s in splits])}: {[round(len(s)/sum([len(s) for s in splits]), 2) for s in splits]})")
# settings
fcst_history = 10
fcst_horizon = 5
stride = 5
unique_id_cols = 'type'
datetime_col = 'datetime'
use_index = False
valid_size = 0.1  # a percent (float) or a number of samples (int) - .1 means 10% of the dates
test_size = 0.2  # a percent (float) or a number of samples (int) - .2 means 20% of the dates
valid_cutoff_datetime = '1749-08-21'  # first prediction datetime of validation dataset
test_cutoff_datetime = '1749-12-24'  # first prediction datetime of test dataset
valid_cutoff_datetime = None  # datetime compatible with the datetime_col containing the starting date for the validation dataset
test_cutoff_datetime = None  # datetime compatible with the datetime_col containing the starting date for the test dataset

splits = get_forecasting_splits(df, fcst_history=fcst_history, fcst_horizon=fcst_horizon, stride=stride,
                                unique_id_cols=unique_id_cols, datetime_col=datetime_col, use_index=use_index,
                                valid_size=valid_size, test_size=test_size,
                                valid_cutoff_datetime=valid_cutoff_datetime, test_cutoff_datetime=test_cutoff_datetime)

print(f"splits size : {[len(s) for s in splits]} ({sum([len(s) for s in splits])}: {[round(len(s)/sum([len(s) for s in splits]), 2) for s in splits]})")
| | value | datetime | type |
|---|---|---|---|
| 0 | 0 | 1749-03-31 | 1 |
| 1 | 1 | 1749-04-01 | 1 |
| 2 | 2 | 1749-04-02 | 1 |
| 3 | 3 | 1749-04-03 | 1 |
| 4 | 4 | 1749-04-04 | 1 |
| ... | ... | ... | ... |
| 95 | 95 | 1749-07-04 | 1 |
| 96 | 96 | 1749-07-05 | 1 |
| 97 | 97 | 1749-07-06 | 1 |
| 98 | 98 | 1749-07-07 | 1 |
| 99 | 99 | 1749-07-08 | 1 |

100 rows × 3 columns
splits size : [63, 9, 18] (90: [0.7, 0.1, 0.2])
splits size : [12, 2, 4] (18: [0.67, 0.11, 0.22])
df1_len = 100
df2_len = 80

datetime_col = 'datetime'
df1 = pd.DataFrame(np.arange(df1_len), columns=['value'])
df1['datetime'] = pd.date_range(pd.to_datetime('1749-03-31'), periods=df1_len, freq='1D')
df1['type'] = 1
df1_index = df1.set_index("datetime")

df = df1_index

display(df)

# settings
fcst_history = 10
fcst_horizon = 1
stride = 1
unique_id_cols = 'type'
datetime_col = 'datetime'
use_index = True
valid_size = 0.1  # a percent (float) or a number of samples (int) - .1 means 10% of the dates
test_size = 0.2  # a percent (float) or a number of samples (int) - .2 means 20% of the dates
valid_cutoff_datetime = '1749-08-21'  # first prediction datetime of validation dataset
test_cutoff_datetime = '1749-12-24'  # first prediction datetime of test dataset
valid_cutoff_datetime = None  # datetime compatible with the datetime_col containing the starting date for the validation dataset
test_cutoff_datetime = None  # datetime compatible with the datetime_col containing the starting date for the test dataset

splits = get_forecasting_splits(df, fcst_history=fcst_history, fcst_horizon=fcst_horizon, stride=stride,
                                unique_id_cols=unique_id_cols, datetime_col=datetime_col, use_index=use_index,
                                valid_size=valid_size, test_size=test_size,
                                valid_cutoff_datetime=valid_cutoff_datetime, test_cutoff_datetime=test_cutoff_datetime)

print(f"splits size : {[len(s) for s in splits]} ({sum([len(s) for s in splits])}: {[round(len(s)/sum([len(s) for s in splits]), 2) for s in splits]})")
# settings
fcst_history = 10
fcst_horizon = 5
stride = 5
unique_id_cols = 'type'
datetime_col = 'datetime'
use_index = True
valid_size = 0.1  # a percent (float) or a number of samples (int) - .1 means 10% of the dates
test_size = 0.2  # a percent (float) or a number of samples (int) - .2 means 20% of the dates
valid_cutoff_datetime = '1749-08-21'  # first prediction datetime of validation dataset
test_cutoff_datetime = '1749-12-24'  # first prediction datetime of test dataset
valid_cutoff_datetime = None  # datetime compatible with the datetime_col containing the starting date for the validation dataset
test_cutoff_datetime = None  # datetime compatible with the datetime_col containing the starting date for the test dataset

splits = get_forecasting_splits(df, fcst_history=fcst_history, fcst_horizon=fcst_horizon, stride=stride,
                                unique_id_cols=unique_id_cols, datetime_col=datetime_col, use_index=use_index,
                                valid_size=valid_size, test_size=test_size,
                                valid_cutoff_datetime=valid_cutoff_datetime, test_cutoff_datetime=test_cutoff_datetime)

print(f"splits size : {[len(s) for s in splits]} ({sum([len(s) for s in splits])}: {[round(len(s)/sum([len(s) for s in splits]), 2) for s in splits]})")
| datetime | value | type |
|---|---|---|
| 1749-03-31 | 0 | 1 |
| 1749-04-01 | 1 | 1 |
| 1749-04-02 | 2 | 1 |
| 1749-04-03 | 3 | 1 |
| 1749-04-04 | 4 | 1 |
| ... | ... | ... |
| 1749-07-04 | 95 | 1 |
| 1749-07-05 | 96 | 1 |
| 1749-07-06 | 97 | 1 |
| 1749-07-07 | 98 | 1 |
| 1749-07-08 | 99 | 1 |

100 rows × 2 columns
splits size : [63, 9, 18] (90: [0.7, 0.1, 0.2])
splits size : [12, 2, 4] (18: [0.67, 0.11, 0.22])
df1_len = 100
df2_len = 80

datetime_col = 'datetime'
df1 = pd.DataFrame(np.arange(df1_len), columns=['value'])
df1['datetime'] = pd.date_range(pd.to_datetime('1749-03-31'), periods=df1_len, freq='1D')
df1['type'] = 1
df1_index = df1.set_index("datetime")
df2 = pd.DataFrame(np.arange(df2_len) * 10, columns=['value'])
df2['datetime'] = pd.date_range(pd.to_datetime('1749-04-15'), periods=df2_len, freq='1D')
df2['type'] = 2
df_comb = pd.concat([df1, df2]).reset_index(drop=True)

df = df_comb

display(df)

# settings
fcst_history = 10
fcst_horizon = 3
stride = 1
unique_id_cols = 'type'
datetime_col = 'datetime'
use_index = False
valid_size = 0.1  # a percent (float) or a number of samples (int) - .1 means 10% of the dates
test_size = 0.2  # a percent (float) or a number of samples (int) - .2 means 20% of the dates
valid_cutoff_datetime = '1749-08-21'  # first prediction datetime of validation dataset
test_cutoff_datetime = '1749-12-24'  # first prediction datetime of test dataset
valid_cutoff_datetime = None  # datetime compatible with the datetime_col containing the starting date for the validation dataset
test_cutoff_datetime = None  # datetime compatible with the datetime_col containing the starting date for the test dataset

splits = get_forecasting_splits(df, fcst_history=fcst_history, fcst_horizon=fcst_horizon, stride=stride,
                                unique_id_cols=unique_id_cols, datetime_col=datetime_col, use_index=use_index,
                                valid_size=valid_size, test_size=test_size,
                                valid_cutoff_datetime=valid_cutoff_datetime, test_cutoff_datetime=test_cutoff_datetime)

print(f"splits size : {[len(s) for s in splits]} ({sum([len(s) for s in splits])}: {[round(len(s)/sum([len(s) for s in splits]), 2) for s in splits]})")
| | value | datetime | type |
|---|---|---|---|
| 0 | 0 | 1749-03-31 | 1 |
| 1 | 1 | 1749-04-01 | 1 |
| 2 | 2 | 1749-04-02 | 1 |
| 3 | 3 | 1749-04-03 | 1 |
| 4 | 4 | 1749-04-04 | 1 |
| ... | ... | ... | ... |
| 175 | 750 | 1749-06-29 | 2 |
| 176 | 760 | 1749-06-30 | 2 |
| 177 | 770 | 1749-07-01 | 2 |
| 178 | 780 | 1749-07-02 | 2 |
| 179 | 790 | 1749-07-03 | 2 |

180 rows × 3 columns
splits size : [101, 16, 31] (148: [0.68, 0.11, 0.21])
df1_len = 100
df2_len = 80

datetime_col = 'datetime'
df1 = pd.DataFrame(np.arange(df1_len), columns=['value'])
df1['datetime'] = pd.date_range(pd.to_datetime('1749-03-31'), periods=df1_len, freq='1D')
df1['type'] = 1
df1_index = df1.set_index("datetime")
df2 = pd.DataFrame(np.arange(df2_len) * 10, columns=['value'])
df2['datetime'] = pd.date_range(pd.to_datetime('1749-04-15'), periods=df2_len, freq='1D')
df2['type'] = 2
df_comb = pd.concat([df1, df2]).reset_index(drop=True)
df_comb_index = df_comb.set_index("datetime")
df_comb_index.index.name = None

df = df_comb_index

display(df)

# settings
fcst_history = 15
fcst_horizon = 5
stride = 1
unique_id_cols = 'type'
datetime_col = 'datetime'
use_index = True
valid_size = 0.1  # a percent (float) or a number of samples (int) - .1 means 10% of the dates
test_size = 0.2  # a percent (float) or a number of samples (int) - .2 means 20% of the dates
valid_cutoff_datetime = '1749-08-21'  # first prediction datetime of validation dataset
test_cutoff_datetime = '1749-12-24'  # first prediction datetime of test dataset
valid_cutoff_datetime = None  # datetime compatible with the datetime_col containing the starting date for the validation dataset
test_cutoff_datetime = None  # datetime compatible with the datetime_col containing the starting date for the test dataset

splits = get_forecasting_splits(df, fcst_history=fcst_history, fcst_horizon=fcst_horizon, stride=stride,
                                unique_id_cols=unique_id_cols, datetime_col=datetime_col, use_index=use_index,
                                valid_size=valid_size, test_size=test_size,
                                valid_cutoff_datetime=valid_cutoff_datetime, test_cutoff_datetime=test_cutoff_datetime)

print(f"splits size : {[len(s) for s in splits]} ({sum([len(s) for s in splits])}: {[round(len(s)/sum([len(s) for s in splits]), 2) for s in splits]})")
| | value | type |
|---|---|---|
| 1749-03-31 | 0 | 1 |
| 1749-04-01 | 1 | 1 |
| 1749-04-02 | 2 | 1 |
| 1749-04-03 | 3 | 1 |
| 1749-04-04 | 4 | 1 |
| ... | ... | ... |
| 1749-06-29 | 750 | 2 |
| 1749-06-30 | 760 | 2 |
| 1749-07-01 | 770 | 2 |
| 1749-07-02 | 780 | 2 |
| 1749-07-03 | 790 | 2 |

180 rows × 2 columns
splits size : [83, 14, 29] (126: [0.66, 0.11, 0.23])
get_long_term_forecasting_splits
get_long_term_forecasting_splits (df, fcst_history, fcst_horizon, dsid=None, show_plot=True)
Returns the train, valid and test splits for long-range time series datasets
| | Type | Default | Details |
|---|---|---|---|
| df | | | dataframe containing a sorted time series for a single entity or subject |
| fcst_history | | | # historical steps used as input. |
| fcst_horizon | | | # steps forecasted into the future. |
| dsid | NoneType | None | dataset name |
| show_plot | bool | True | plot the splits |
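A sketch of the expected call (the dataframe and window sizes are illustrative; dsid, described above as a dataset name, is left at its default):

```python
# minimal sketch: train/valid/test splits for a long-range forecasting task
df = pd.DataFrame({'value': np.random.rand(1000)})
splits = get_long_term_forecasting_splits(df, fcst_history=96, fcst_horizon=96, show_plot=False)
train_split, valid_split, test_split = splits
```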