# Toy dataset: 20 rows x 3 columns; each column is the row index scaled by 1, 10 and 100,
# which makes it easy to see which step/feature every value in X and y comes from.
data = np.arange(20).reshape(-1,1).repeat(3, 1) * np.array([1, 10, 100])
df = pd.DataFrame(data, columns=['feat_1', 'feat_2', 'feat_3'])
df.head()| feat_1 | feat_2 | feat_3 | |
|---|---|---|---|
| 0 | 0 | 0 | 0 |
| 1 | 1 | 10 | 100 |
| 2 | 2 | 20 | 200 |
| 3 | 3 | 30 | 300 |
| 4 | 4 | 40 | 400 |
Functions required to prepare X (and y) from a pandas dataframe.
def apply_sliding_window(
data, # an array-like object with the input data
window_len:int | list, # sliding window length. When using a list, use negative numbers and 0.
horizon:int | list=0, # horizon: step offset(s) after the window used to build y. Use an int or a list of ints (e.g. [1, 2]).
x_vars:int | list=None, # indices of the independent variables
y_vars:int | list=None, # indices of the dependent variables (target). [] means no y will be created. None means all variables.
):
Applies a sliding window on an array-like input to generate a 3d X (and optionally y)
Call self as a function.
Call self as a function.
| feat_1 | feat_2 | feat_3 | |
|---|---|---|---|
| 0 | 0 | 0 | 0 |
| 1 | 1 | 10 | 100 |
| 2 | 2 | 20 | 200 |
| 3 | 3 | 30 | 300 |
| 4 | 4 | 40 | 400 |
# Integer window: 8 consecutive steps per sample, with the target taken 1 step after the window.
window_len = 8
horizon = 1
x_vars = None  # no selection: all 3 columns end up in X (see the printed shape)
y_vars = None  # None means all variables are used as targets
X, y = apply_sliding_window(data, window_len, horizon=horizon, x_vars=x_vars, y_vars=y_vars)
# Report whether X / y share memory with the input array.
print(np.shares_memory(X, data))
print(np.shares_memory(y, data))
print(X.shape, y.shape)
# Number of windows = len(df) - (window_len - 1 + horizon); X is (windows, variables, window_len).
test_eq(X.shape, (len(df) - (window_len - 1 + horizon), df.shape[1], window_len))
test_eq(y.shape, (len(df) - (window_len - 1 + horizon), df.shape[1]))
X[0], y[0]True
True
(12, 3, 8) (12, 3)
(array([[ 0, 1, 2, 3, 4, 5, 6, 7],
[ 0, 10, 20, 30, 40, 50, 60, 70],
[ 0, 100, 200, 300, 400, 500, 600, 700]]),
array([ 8, 80, 800]))
window_len = 8
horizon = 1
x_vars = None
y_vars = 0
X, y = apply_sliding_window(df, window_len, horizon=horizon, x_vars=x_vars, y_vars=y_vars)
print(np.shares_memory(X, df))
print(np.shares_memory(y, df))
print(X.shape, y.shape)
test_eq(X.shape, (len(df) - (window_len - 1 + horizon), df.shape[1], window_len))
test_eq(y.shape, (len(df) - (window_len - 1 + horizon),))
X[0], y[0]True
True
(12, 3, 8) (12,)
(array([[ 0, 1, 2, 3, 4, 5, 6, 7],
[ 0, 10, 20, 30, 40, 50, 60, 70],
[ 0, 100, 200, 300, 400, 500, 600, 700]]),
np.int64(8))
window_len = 8
horizon = [1, 2]
x_vars = 0
y_vars = [1, 2]
X, y = apply_sliding_window(df, window_len, horizon=horizon, x_vars=x_vars, y_vars=y_vars)
print(np.shares_memory(X, df))
print(np.shares_memory(y, df))
print(X.shape, y.shape)
test_eq(X.shape, (len(df) - (window_len - 1 + max(horizon)), 1, window_len))
test_eq(y.shape, (len(df) - (window_len - 1 + max(horizon)), len(y_vars), len(horizon)))
X[0], y[0]True
False
(11, 1, 8) (11, 2, 2)
(array([[0, 1, 2, 3, 4, 5, 6, 7]]),
array([[ 80, 90],
[800, 900]]))
# List-based window and horizon: individual step offsets are selected instead of a contiguous range.
# NOTE(review): the recorded output (X[0] == [0, 2, 3, 4], y[0] built from steps 5, 6 and 8)
# suggests offsets are relative to the last input step (offset 0) — confirm against the implementation.
window_len = [-4, -2, -1, 0]
horizon = [1, 2, 4]
x_vars = 0
y_vars = [1, 2]
X, y = apply_sliding_window(df, window_len, horizon=horizon, x_vars=x_vars, y_vars=y_vars)
# Report whether X / y share memory with the input dataframe.
print(np.shares_memory(X, df))
print(np.shares_memory(y, df))
print(X.shape, y.shape)
# X: 1 variable x 4 window offsets; y: 2 target variables x 3 horizons.
test_eq(X.shape, (12, 1, 4))
test_eq(y.shape, (12, 2, 3))
X[0], y[0]False
False
(12, 1, 4) (12, 2, 3)
(array([[0, 2, 3, 4]]),
array([[ 50, 60, 80],
[500, 600, 800]]))
This function allows you to transform a pandas dataframe into X and y numpy arrays that can be used to create a TSDataset. sample_col: column that uniquely identifies each sample. feat_col: used for multivariate datasets. It indicates the column that identifies the feature in each row. data_cols: indicates the column/s where the data is located. If None, it means all columns (except the sample_col, feat_col, and target_col). target_col: indicates the column/s where the target is. steps_in_rows: flag to indicate if each step is in a different row or in a different column (default). to3d: turns X to 3d (including univariate time series). sort_by: this is used to pass any column/s that are needed to sort the steps in the sequence. If you pass a sample_col and/or feat_col these will be automatically used before the sort_by column/s, and you don’t need to add them to the sort_by column/s list. y_func: function used to calculate y for each sample (and target_col). return_names: flag to return the names of the columns from where X was generated.
Call self as a function.
| sample_id | var1 | var2 | |
|---|---|---|---|
| 0 | 1 | 10 | 100 |
| 1 | 1 | 11 | 101 |
| 2 | 1 | 12 | 102 |
| 3 | 2 | 23 | 203 |
| 4 | 2 | 24 | 204 |
| 5 | 2 | 25 | 205 |
| 6 | 3 | 36 | 306 |
| 7 | 3 | 37 | 307 |
| 8 | 3 | 38 | 308 |
# Synthetic long-format dataset: 1,000 samples x 10 rows each, with 6 continuous columns
# and 2 target columns.
n_samples = 1_000
n_rows = 10_000
# Each sample id is repeated for its 10 rows; feat ids cycle 0..9 within every sample.
sample_ids = np.arange(n_samples).repeat(n_rows//n_samples).reshape(-1,1)
feat_ids = np.tile(np.arange(n_rows // n_samples), n_samples).reshape(-1,1)
cont = np.random.randn(n_rows, 6)
ind_cat = np.random.randint(0, 3, (n_rows, 1))
target = np.array([0,1,2])[ind_cat]
ind_cat2 = np.random.randint(0, 3, (n_rows, 1))
target2 = np.array([100,200,300])[ind_cat2]
# NOTE(review): `target` is concatenated twice and `target2` (built above) is never used, so the
# 'target2' column is a copy of 'target'. Presumably the last entry was meant to be `target2` — confirm.
data = np.concatenate([sample_ids, feat_ids, cont, target, target], -1)
columns = ['sample_id', 'feat_id'] + (np.arange(6) + 1).astype(str).tolist() + ['target'] + ['target2']
df = pd.DataFrame(data, columns=columns)
# Random permutation of the row positions (third argument False is presumably replace=False),
# applied below to shuffle the dataframe.
idx = random_choice(np.arange(len(df)), len(df), False)
# Only the id and feature columns are cast; the target columns keep the dtype of `data`.
new_dtypes = {'sample_id':np.int32, 'feat_id':np.int32, '1':np.float32, '2':np.float32, '3':np.float32, '4':np.float32, '5':np.float32, '6':np.float32}
df = df.astype(dtype=new_dtypes)
df = df.loc[idx].reset_index(drop=True)
df| sample_id | feat_id | 1 | 2 | 3 | 4 | 5 | 6 | target | target2 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 974 | 9 | -1.020225 | 0.490861 | -0.189539 | -1.025940 | -1.371664 | -0.168857 | 2.0 | 2.0 |
| 1 | 876 | 0 | 0.941679 | 0.216880 | -1.062914 | 0.040360 | -0.237583 | -0.975293 | 0.0 | 0.0 |
| 2 | 307 | 5 | 2.805680 | -1.370852 | -0.135271 | -1.131725 | -2.353741 | -0.527359 | 0.0 | 0.0 |
| 3 | 374 | 6 | -1.328668 | 0.874001 | 2.293857 | 1.701095 | -1.155008 | 0.214372 | 0.0 | 0.0 |
| 4 | 44 | 3 | -1.210909 | 1.001880 | 0.671870 | 1.175491 | -1.450532 | -0.321712 | 1.0 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9995 | 821 | 6 | 0.086783 | -0.605714 | 1.332561 | 0.541426 | -0.297833 | 0.722690 | 1.0 | 1.0 |
| 9996 | 284 | 3 | 0.733117 | 0.926012 | -0.196063 | 0.341643 | -0.942909 | 1.654405 | 0.0 | 0.0 |
| 9997 | 858 | 6 | -0.951218 | -1.272363 | -1.855388 | 0.243799 | 0.067996 | 1.196931 | 1.0 | 1.0 |
| 9998 | 58 | 6 | 1.467693 | -1.061652 | -0.588058 | -1.533506 | 0.107606 | 1.167752 | 0.0 | 0.0 |
| 9999 | 619 | 4 | 0.668199 | -1.728383 | -0.041866 | -1.362354 | -0.021774 | -0.332942 | 0.0 | 0.0 |
10000 rows × 10 columns
# y for each sample is the most frequent target value along axis 1
# (`mode` is presumably scipy.stats.mode — confirm against the imports).
def y_func(o): return mode(o, axis=1, keepdims=True).mode
X, y = df2xy(df, sample_col='sample_id', feat_col='feat_id', target_col=['target', 'target2'], sort_by=['sample_id', 'feat_id'], y_func=y_func)
# X: 1,000 samples x 10 feat ids x 6 data columns; y: one value per sample and target column.
test_eq(X.shape, (1000, 10, 6))
test_eq(y.shape, (1000, 2))
# Spot-check one random sample against a manually sorted copy of the shuffled dataframe.
rand_idx = np.random.randint(0, np.max(df.sample_id))
sorted_df = df.sort_values(by=['sample_id', 'feat_id'], kind='stable').reset_index(drop=True)
test_eq(X[rand_idx], sorted_df[sorted_df.sample_id == rand_idx][['1', '2', '3', '4', '5', '6']].values)
test_eq(np.squeeze(mode(sorted_df[sorted_df.sample_id == rand_idx][['target', 'target2']].values).mode), y[rand_idx])
TESTDATA = StringIO("""sample_id;value_0;value_1;target
rob;2;3;0
alice;6;7;1
eve;11;12;2
""")
df = pd.read_csv(TESTDATA, sep=";")
display(df)
X, y = df2Xy(df, sample_col='sample_id', target_col='target', data_cols=['value_0', 'value_1'], sort_by='sample_id')
test_eq(X.shape, (3, 1, 2))
test_eq(y.shape, (3,))
X, y| sample_id | value_0 | value_1 | target | |
|---|---|---|---|---|
| 0 | rob | 2 | 3 | 0 |
| 1 | alice | 6 | 7 | 1 |
| 2 | eve | 11 | 12 | 2 |
(array([[[ 6, 7]],
[[11, 12]],
[[ 2, 3]]]),
array([1, 2, 0]))
# Univariate
TESTDATA = StringIO("""sample_id;timestep;values;target
rob;1;2;0
alice;1;6;1
eve;1;11;2
rob;2;3;0
alice;2;7;1
eve;2;12;2
""")
df = pd.read_csv(TESTDATA, sep=";")
display(df)
def y_func(o): return mode(o, axis=1).mode
X, y = df2xy(df, sample_col='sample_id', target_col='target', data_cols=['values'], sort_by='timestep', to3d=True, y_func=y_func)
test_eq(X.shape, (3, 1, 2))
test_eq(y.shape, (3, ))
print(X, y)| sample_id | timestep | values | target | |
|---|---|---|---|---|
| 0 | rob | 1 | 2 | 0 |
| 1 | alice | 1 | 6 | 1 |
| 2 | eve | 1 | 11 | 2 |
| 3 | rob | 2 | 3 | 0 |
| 4 | alice | 2 | 7 | 1 |
| 5 | eve | 2 | 12 | 2 |
[[[ 6 7]]
[[11 12]]
[[ 2 3]]] [1 2 0]
# Multivariate
TESTDATA = StringIO("""sample_id;trait;value_0;value_1;target
rob;green;2;3;0
rob;yellow;3;4;0
rob;blue;4;5;0
rob;red;5;6;0
alice;green;6;7;1
alice;yellow;7;8;1
alice;blue;8;9;1
alice;red;9;10;1
eve;yellow;11;12;2
eve;green;10;11;2
eve;blue;12;12;2
eve;red;13;14;2
""")
df = pd.read_csv(TESTDATA, sep=";")
idx = random_choice(len(df), len(df), False)
df = df.iloc[idx]
display(df)
def y_func(o): return mode(o, axis=1).mode
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', target_col='target', data_cols=['value_0', 'value_1'], y_func=y_func)
print(X, y)
test_eq(X.shape, (3, 4, 2))
test_eq(y.shape, (3,))| sample_id | trait | value_0 | value_1 | target | |
|---|---|---|---|---|---|
| 11 | eve | red | 13 | 14 | 2 |
| 5 | alice | yellow | 7 | 8 | 1 |
| 7 | alice | red | 9 | 10 | 1 |
| 6 | alice | blue | 8 | 9 | 1 |
| 3 | rob | red | 5 | 6 | 0 |
| 9 | eve | green | 10 | 11 | 2 |
| 1 | rob | yellow | 3 | 4 | 0 |
| 4 | alice | green | 6 | 7 | 1 |
| 8 | eve | yellow | 11 | 12 | 2 |
| 10 | eve | blue | 12 | 12 | 2 |
| 0 | rob | green | 2 | 3 | 0 |
| 2 | rob | blue | 4 | 5 | 0 |
[[[ 8 9]
[ 6 7]
[ 9 10]
[ 7 8]]
[[12 12]
[10 11]
[13 14]
[11 12]]
[[ 4 5]
[ 2 3]
[ 5 6]
[ 3 4]]] [1 2 0]
# Multivariate, multi-label
TESTDATA = StringIO("""sample_id;trait;value_0;value_1;target1;target2
rob;green;2;3;0;0
rob;yellow;3;4;0;0
rob;blue;4;5;0;0
rob;red;5;6;0;0
alice;green;6;7;1;0
alice;yellow;7;8;1;0
alice;blue;8;9;1;0
alice;red;9;10;1;0
eve;yellow;11;12;2;1
eve;green;10;11;2;1
eve;blue;12;12;2;1
eve;red;13;14;2;1
""")
df = pd.read_csv(TESTDATA, sep=";")
display(df)
def y_func(o): return mode(o, axis=1, keepdims=True).mode
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', target_col=['target1', 'target2'], data_cols=['value_0', 'value_1'], y_func=y_func)
test_eq(X.shape, (3, 4, 2))
test_eq(y.shape, (3, 2))
print(X, y)| sample_id | trait | value_0 | value_1 | target1 | target2 | |
|---|---|---|---|---|---|---|
| 0 | rob | green | 2 | 3 | 0 | 0 |
| 1 | rob | yellow | 3 | 4 | 0 | 0 |
| 2 | rob | blue | 4 | 5 | 0 | 0 |
| 3 | rob | red | 5 | 6 | 0 | 0 |
| 4 | alice | green | 6 | 7 | 1 | 0 |
| 5 | alice | yellow | 7 | 8 | 1 | 0 |
| 6 | alice | blue | 8 | 9 | 1 | 0 |
| 7 | alice | red | 9 | 10 | 1 | 0 |
| 8 | eve | yellow | 11 | 12 | 2 | 1 |
| 9 | eve | green | 10 | 11 | 2 | 1 |
| 10 | eve | blue | 12 | 12 | 2 | 1 |
| 11 | eve | red | 13 | 14 | 2 | 1 |
[[[ 8 9]
[ 6 7]
[ 9 10]
[ 7 8]]
[[12 12]
[10 11]
[13 14]
[11 12]]
[[ 4 5]
[ 2 3]
[ 5 6]
[ 3 4]]] [[1 0]
[2 1]
[0 0]]
# Multivariate, unlabeled
TESTDATA = StringIO("""sample_id;trait;value_0;value_1;target
rob;green;2;3;0
rob;yellow;3;4;0
rob;blue;4;5;0
rob;red;5;6;0
alice;green;6;7;1
alice;yellow;7;8;1
alice;blue;8;9;1
alice;red;9;10;1
eve;yellow;11;12;2
eve;green;10;11;2
eve;blue;12;12;2
eve;red;13;14;2
""")
df = pd.read_csv(TESTDATA, sep=";")
idx = random_choice(len(df), len(df), False)
df = df.iloc[idx]
display(df)
def y_func(o): return mode(o, axis=1, keepdims=True).mode
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', data_cols=['value_0', 'value_1'], y_func=y_func)
print(X, y)
test_eq(X.shape, (3, 4, 2))
test_eq(y, None)| sample_id | trait | value_0 | value_1 | target | |
|---|---|---|---|---|---|
| 5 | alice | yellow | 7 | 8 | 1 |
| 6 | alice | blue | 8 | 9 | 1 |
| 11 | eve | red | 13 | 14 | 2 |
| 2 | rob | blue | 4 | 5 | 0 |
| 7 | alice | red | 9 | 10 | 1 |
| 8 | eve | yellow | 11 | 12 | 2 |
| 0 | rob | green | 2 | 3 | 0 |
| 3 | rob | red | 5 | 6 | 0 |
| 9 | eve | green | 10 | 11 | 2 |
| 4 | alice | green | 6 | 7 | 1 |
| 1 | rob | yellow | 3 | 4 | 0 |
| 10 | eve | blue | 12 | 12 | 2 |
[[[ 8 9]
[ 6 7]
[ 9 10]
[ 7 8]]
[[12 12]
[10 11]
[13 14]
[11 12]]
[[ 4 5]
[ 2 3]
[ 5 6]
[ 3 4]]] None
TESTDATA = StringIO("""sample_id;trait;timestep;values;target
rob;green;1;2;0
rob;yellow;1;3;0
rob;blue;1;4;0
rob;red;1;5;0
alice;green;1;6;1
alice;yellow;1;7;1
alice;blue;1;8;1
alice;red;1;9;1
eve;yellow;1;11;2
eve;green;1;10;2
eve;blue;1;12;2
eve;red;1;13;2
rob;green;2;3;0
rob;yellow;2;4;0
rob;blue;2;5;0
rob;red;2;6;0
alice;green;2;7;1
alice;yellow;2;8;1
alice;blue;2;9;1
alice;red;2;10;1
eve;yellow;2;12;2
eve;green;2;11;2
eve;blue;2;13;2
eve;red;2;14;2
""")
df = pd.read_csv(TESTDATA, sep=";")
display(df)
def y_func(o): return mode(o, axis=1).mode
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', sort_by='timestep', target_col='target', data_cols=['values'], y_func=y_func)
print(X, y)
test_eq(X.shape, (3, 4, 2))
test_eq(y.shape, (3, ))| sample_id | trait | timestep | values | target | |
|---|---|---|---|---|---|
| 0 | rob | green | 1 | 2 | 0 |
| 1 | rob | yellow | 1 | 3 | 0 |
| 2 | rob | blue | 1 | 4 | 0 |
| 3 | rob | red | 1 | 5 | 0 |
| 4 | alice | green | 1 | 6 | 1 |
| 5 | alice | yellow | 1 | 7 | 1 |
| 6 | alice | blue | 1 | 8 | 1 |
| 7 | alice | red | 1 | 9 | 1 |
| 8 | eve | yellow | 1 | 11 | 2 |
| 9 | eve | green | 1 | 10 | 2 |
| 10 | eve | blue | 1 | 12 | 2 |
| 11 | eve | red | 1 | 13 | 2 |
| 12 | rob | green | 2 | 3 | 0 |
| 13 | rob | yellow | 2 | 4 | 0 |
| 14 | rob | blue | 2 | 5 | 0 |
| 15 | rob | red | 2 | 6 | 0 |
| 16 | alice | green | 2 | 7 | 1 |
| 17 | alice | yellow | 2 | 8 | 1 |
| 18 | alice | blue | 2 | 9 | 1 |
| 19 | alice | red | 2 | 10 | 1 |
| 20 | eve | yellow | 2 | 12 | 2 |
| 21 | eve | green | 2 | 11 | 2 |
| 22 | eve | blue | 2 | 13 | 2 |
| 23 | eve | red | 2 | 14 | 2 |
[[[ 8 9]
[ 6 7]
[ 9 10]
[ 7 8]]
[[12 13]
[10 11]
[13 14]
[11 12]]
[[ 4 5]
[ 2 3]
[ 5 6]
[ 3 4]]] [1 2 0]
Transforms a df (with the same number of rows per group in groupby) to a 3d ndarray
Call self as a function.
| A | B | missing_A | missing_B | |
|---|---|---|---|---|
| 0 | -0.221109 | -0.252389 | 0.0 | 0.0 |
| 1 | -1.417620 | 0.100183 | 0.0 | 0.0 |
| 2 | 0.295520 | NaN | 0.0 | 1.0 |
| 3 | -0.461779 | -1.010740 | 0.0 | 0.0 |
| 4 | NaN | 0.440123 | 1.0 | 0.0 |
| 5 | -0.324770 | -0.407949 | 0.0 | 0.0 |
| 6 | 0.006966 | -0.379294 | 0.0 | 0.0 |
| 7 | -0.576478 | 0.550953 | 0.0 | 0.0 |
| 8 | 0.612107 | -0.095469 | 0.0 | 0.0 |
| 9 | -0.317881 | NaN | 0.0 | 1.0 |
def add_missing_timestamps(
df, # pandas DataFrame
datetime_col:NoneType=None, # column that contains the datetime data (without duplicates within groups)
use_index:bool=False, # indicates if the index contains the datetime data
unique_id_cols:NoneType=None, # column used to identify unique_ids
groupby:NoneType=None, # same as unique_id_cols. Will be deprecated. Kept for compatibility.
fill_value:float=nan, # value that will be inserted where missing dates exist. Default: np.nan
range_by_group:bool=True, # if True, dates will be filled between min and max dates for each group. Otherwise, between the min and max dates in the df.
start_date:NoneType=None, # start date to fill in missing dates (same for all unique_ids)
end_date:NoneType=None, # end date to fill in missing dates (same for all unique_ids)
freq:NoneType=None, # frequency used to fill in the missing datetime
):
Call self as a function.
# Filling dates between min and max dates
dates = pd.date_range('2021-05-01', '2021-05-07')
date_df = pd.DataFrame({'date': dates, 'feature1': np.random.rand(len(dates)), 'feature2':
np.random.rand(len(dates))})
date_df_with_missing_dates = date_df.drop([1,3]).reset_index(drop=True)
date_df_with_missing_dates| date | feature1 | feature2 | |
|---|---|---|---|
| 0 | 2021-05-01 | 0.835039 | 0.308269 |
| 1 | 2021-05-03 | 0.898825 | 0.190682 |
| 2 | 2021-05-05 | 0.063644 | 0.525019 |
| 3 | 2021-05-06 | 0.484979 | 0.994566 |
| 4 | 2021-05-07 | 0.104824 | 0.831195 |
# No groups
# Filling dates between min and max dates
expected_output_df = date_df.copy()
expected_output_df.loc[[1,3], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates.copy(),
'date',
unique_id_cols=None,
fill_value=np.nan,
range_by_group=False)
test_eq(output_df, expected_output_df)| date | feature1 | feature2 | |
|---|---|---|---|
| 0 | 2021-05-01 | 0.835039 | 0.308269 |
| 1 | 2021-05-02 | NaN | NaN |
| 2 | 2021-05-03 | 0.898825 | 0.190682 |
| 3 | 2021-05-04 | NaN | NaN |
| 4 | 2021-05-05 | 0.063644 | 0.525019 |
| 5 | 2021-05-06 | 0.484979 | 0.994566 |
| 6 | 2021-05-07 | 0.104824 | 0.831195 |
# Filling dates between min and max dates for each value in groupby column
dates = pd.date_range('2021-05-01', '2021-05-07')
dates = dates.append(dates)
date_df = pd.DataFrame({
'date': dates,
'id': np.array([0]*(len(dates)//2)+[1]*(len(dates)//2)),
'feature1': np.random.rand(len(dates)),
'feature2': np.random.rand(len(dates)),
}).astype({'id': int})
date_df_with_missing_dates = date_df.drop([0,1,3,8,11,13]).reset_index(drop=True)
date_df_with_missing_dates| date | id | feature1 | feature2 | |
|---|---|---|---|---|
| 0 | 2021-05-03 | 0 | 0.175720 | 0.142308 |
| 1 | 2021-05-05 | 0 | 0.986945 | 0.109165 |
| 2 | 2021-05-06 | 0 | 0.487086 | 0.327909 |
| 3 | 2021-05-07 | 0 | 0.008540 | 0.305262 |
| 4 | 2021-05-01 | 1 | 0.820257 | 0.145753 |
| 5 | 2021-05-03 | 1 | 0.508623 | 0.048391 |
| 6 | 2021-05-04 | 1 | 0.034002 | 0.029274 |
| 7 | 2021-05-06 | 1 | 0.412848 | 0.700355 |
# groupby='id', range_by_group=True
# With range_by_group=True each id is only filled between its own min and max date, so the
# rows before id 0's first remaining date (0, 1) and after id 1's last remaining date (13)
# are not expected in the output.
expected_output_df = date_df.drop([0,1,13]).reset_index(drop=True)
# Positions (after the index reset) of the dates that were missing inside each id's range;
# their feature values are expected to hold the fill value.
expected_output_df.loc[[1,6,9], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates.copy(),
'date',
unique_id_cols='id',
fill_value=np.nan,
range_by_group=True)
test_eq(expected_output_df, output_df)| date | id | feature1 | feature2 | |
|---|---|---|---|---|
| 0 | 2021-05-03 | 0 | 0.175720 | 0.142308 |
| 1 | 2021-05-04 | 0 | NaN | NaN |
| 2 | 2021-05-05 | 0 | 0.986945 | 0.109165 |
| 3 | 2021-05-06 | 0 | 0.487086 | 0.327909 |
| 4 | 2021-05-07 | 0 | 0.008540 | 0.305262 |
| 5 | 2021-05-01 | 1 | 0.820257 | 0.145753 |
| 6 | 2021-05-02 | 1 | NaN | NaN |
| 7 | 2021-05-03 | 1 | 0.508623 | 0.048391 |
| 8 | 2021-05-04 | 1 | 0.034002 | 0.029274 |
| 9 | 2021-05-05 | 1 | NaN | NaN |
| 10 | 2021-05-06 | 1 | 0.412848 | 0.700355 |
# groupby='id', range_by_group=False
expected_output_df = date_df.copy()
expected_output_df.loc[[0,1,3,8,11,13], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates.copy(),
'date',
unique_id_cols='id',
fill_value=np.nan,
range_by_group=False)
test_eq(expected_output_df, output_df)| date | id | feature1 | feature2 | |
|---|---|---|---|---|
| 0 | 2021-05-01 | 0 | NaN | NaN |
| 1 | 2021-05-02 | 0 | NaN | NaN |
| 2 | 2021-05-03 | 0 | 0.175720 | 0.142308 |
| 3 | 2021-05-04 | 0 | NaN | NaN |
| 4 | 2021-05-05 | 0 | 0.986945 | 0.109165 |
| 5 | 2021-05-06 | 0 | 0.487086 | 0.327909 |
| 6 | 2021-05-07 | 0 | 0.008540 | 0.305262 |
| 7 | 2021-05-01 | 1 | 0.820257 | 0.145753 |
| 8 | 2021-05-02 | 1 | NaN | NaN |
| 9 | 2021-05-03 | 1 | 0.508623 | 0.048391 |
| 10 | 2021-05-04 | 1 | 0.034002 | 0.029274 |
| 11 | 2021-05-05 | 1 | NaN | NaN |
| 12 | 2021-05-06 | 1 | 0.412848 | 0.700355 |
| 13 | 2021-05-07 | 1 | NaN | NaN |
# Filling dates between min and max timestamps
dates = pd.date_range('2021-05-01 000:00', '2021-05-01 20:00', freq='4h')
date_df = pd.DataFrame({'date': dates, 'feature1': np.random.rand(len(dates)), 'feature2':np.random.rand(len(dates))})
date_df_with_missing_dates = date_df.drop([1,3]).reset_index(drop=True)
date_df_with_missing_dates| date | feature1 | feature2 | |
|---|---|---|---|
| 0 | 2021-05-01 00:00:00 | 0.844643 | 0.700726 |
| 1 | 2021-05-01 08:00:00 | 0.296392 | 0.254600 |
| 2 | 2021-05-01 16:00:00 | 0.081671 | 0.856155 |
| 3 | 2021-05-01 20:00:00 | 0.950812 | 0.522507 |
# No groups
expected_output_df = date_df.copy()
expected_output_df.loc[[1,3], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates.copy(), 'date', groupby=None, fill_value=np.nan, range_by_group=False, freq='4h')
test_eq(output_df, expected_output_df)| date | feature1 | feature2 | |
|---|---|---|---|
| 0 | 2021-05-01 00:00:00 | 0.844643 | 0.700726 |
| 1 | 2021-05-01 04:00:00 | NaN | NaN |
| 2 | 2021-05-01 08:00:00 | 0.296392 | 0.254600 |
| 3 | 2021-05-01 12:00:00 | NaN | NaN |
| 4 | 2021-05-01 16:00:00 | 0.081671 | 0.856155 |
| 5 | 2021-05-01 20:00:00 | 0.950812 | 0.522507 |
# Filling missing values between min and max timestamps for each value in groupby column
dates = pd.date_range('2021-05-01 000:00', '2021-05-01 20:00', freq='4h')
dates = dates.append(dates)
date_df = pd.DataFrame({
'date': dates,
'id': np.array([0]*(len(dates)//2)+[1]*(len(dates)//2)),
'feature1': np.random.rand(len(dates)),
'feature2': np.random.rand(len(dates)),
}).astype({'id': int})
date_df_with_missing_dates = date_df.drop([0,1,3,8,9,11]).reset_index(drop=True)
date_df_with_missing_dates| date | id | feature1 | feature2 | |
|---|---|---|---|---|
| 0 | 2021-05-01 08:00:00 | 0 | 0.983029 | 0.738605 |
| 1 | 2021-05-01 16:00:00 | 0 | 0.868481 | 0.418613 |
| 2 | 2021-05-01 20:00:00 | 0 | 0.891880 | 0.179105 |
| 3 | 2021-05-01 00:00:00 | 1 | 0.063692 | 0.589699 |
| 4 | 2021-05-01 04:00:00 | 1 | 0.094046 | 0.569908 |
| 5 | 2021-05-01 16:00:00 | 1 | 0.945306 | 0.471962 |
# groupby='id', range_by_group=True
expected_output_df = date_df.drop([0,1,11]).reset_index(drop=True)
expected_output_df.loc[[1,6,7], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates.copy(),
'date',
groupby='id',
fill_value=np.nan,
range_by_group=True,
freq='4h')
test_eq(expected_output_df, output_df)| date | id | feature1 | feature2 | |
|---|---|---|---|---|
| 0 | 2021-05-01 08:00:00 | 0 | 0.983029 | 0.738605 |
| 1 | 2021-05-01 12:00:00 | 0 | NaN | NaN |
| 2 | 2021-05-01 16:00:00 | 0 | 0.868481 | 0.418613 |
| 3 | 2021-05-01 20:00:00 | 0 | 0.891880 | 0.179105 |
| 4 | 2021-05-01 00:00:00 | 1 | 0.063692 | 0.589699 |
| 5 | 2021-05-01 04:00:00 | 1 | 0.094046 | 0.569908 |
| 6 | 2021-05-01 08:00:00 | 1 | NaN | NaN |
| 7 | 2021-05-01 12:00:00 | 1 | NaN | NaN |
| 8 | 2021-05-01 16:00:00 | 1 | 0.945306 | 0.471962 |
# groupby='id', range_by_group=False
expected_output_df = date_df.copy()
expected_output_df.loc[[0,1,3,8,9,11], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates.copy(),
'date',
groupby='id',
fill_value=np.nan,
range_by_group=False,
freq='4h')
test_eq(expected_output_df, output_df)| date | id | feature1 | feature2 | |
|---|---|---|---|---|
| 0 | 2021-05-01 00:00:00 | 0 | NaN | NaN |
| 1 | 2021-05-01 04:00:00 | 0 | NaN | NaN |
| 2 | 2021-05-01 08:00:00 | 0 | 0.983029 | 0.738605 |
| 3 | 2021-05-01 12:00:00 | 0 | NaN | NaN |
| 4 | 2021-05-01 16:00:00 | 0 | 0.868481 | 0.418613 |
| 5 | 2021-05-01 20:00:00 | 0 | 0.891880 | 0.179105 |
| 6 | 2021-05-01 00:00:00 | 1 | 0.063692 | 0.589699 |
| 7 | 2021-05-01 04:00:00 | 1 | 0.094046 | 0.569908 |
| 8 | 2021-05-01 08:00:00 | 1 | NaN | NaN |
| 9 | 2021-05-01 12:00:00 | 1 | NaN | NaN |
| 10 | 2021-05-01 16:00:00 | 1 | 0.945306 | 0.471962 |
| 11 | 2021-05-01 20:00:00 | 1 | NaN | NaN |
# No groups, with duplicate dates ==> FAILS
dates = pd.date_range('2021-05-01 000:00', '2021-05-01 20:00', freq='4h').values
data = np.zeros((len(dates), 3))
data[:, 0] = dates
data[:, 1] = np.random.rand(len(dates))
data[:, 2] = np.random.rand(len(dates))
cols = ['date', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([1,3]).reset_index(drop=True)
date_df_with_missing_dates.loc[3, 'date'] = date_df_with_missing_dates.loc[2, 'date']
display(date_df_with_missing_dates)
test_fail(add_missing_timestamps, args=[date_df_with_missing_dates, 'date'], kwargs=dict(groupby=None, fill_value=np.nan, range_by_group=False, freq='4h'), )| date | feature1 | feature2 | |
|---|---|---|---|
| 0 | 2021-05-01 00:00:00 | 0.806107 | 0.668709 |
| 1 | 2021-05-01 08:00:00 | 0.780398 | 0.739771 |
| 2 | 2021-05-01 16:00:00 | 0.923216 | 0.551902 |
| 3 | 2021-05-01 16:00:00 | 0.850076 | 0.905751 |
# groupby='id', range_by_group=True, with duplicate dates ==> FAILS
dates = pd.date_range('2021-05-01 000:00', '2021-05-01 20:00', freq='4h').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 4))
data[:, 0] = dates
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))
data[:, 3] = np.random.rand(len(dates))
cols = ['date', 'id', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([0,1,8,9,11]).reset_index(drop=True)
date_df_with_missing_dates.loc[3, 'date'] = date_df_with_missing_dates.loc[2, 'date']
display(date_df_with_missing_dates)
test_fail(add_missing_timestamps, args=[date_df_with_missing_dates, 'date'], kwargs=dict(groupby='id', fill_value=np.nan, range_by_group=True, freq='4h'),
contains='cannot handle a non-unique multi-index!')| date | id | feature1 | feature2 | |
|---|---|---|---|---|
| 0 | 2021-05-01 08:00:00 | 0 | 0.993108 | 0.309150 |
| 1 | 2021-05-01 12:00:00 | 0 | 0.327491 | 0.164923 |
| 2 | 2021-05-01 16:00:00 | 0 | 0.170994 | 0.851456 |
| 3 | 2021-05-01 16:00:00 | 0 | 0.454634 | 0.032915 |
| 4 | 2021-05-01 00:00:00 | 1 | 0.171314 | 0.944603 |
| 5 | 2021-05-01 04:00:00 | 1 | 0.595972 | 0.958672 |
| 6 | 2021-05-01 16:00:00 | 1 | 0.824532 | 0.904686 |
# groupby='id', range_by_group=FALSE, with duplicate dates ==> FAILS
dates = pd.date_range('2021-05-01 000:00', '2021-05-01 20:00', freq='4h').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 4))
data[:, 0] = dates
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))
data[:, 3] = np.random.rand(len(dates))
cols = ['date', 'id', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([0,1,8,9,11]).reset_index(drop=True)
date_df_with_missing_dates.loc[3, 'date'] = date_df_with_missing_dates.loc[2, 'date']
display(date_df_with_missing_dates)
test_fail(add_missing_timestamps, args=[date_df_with_missing_dates, 'date'], kwargs=dict(groupby='id', fill_value=np.nan, range_by_group=False, freq='4h'),
contains='cannot handle a non-unique multi-index!')| date | id | feature1 | feature2 | |
|---|---|---|---|---|
| 0 | 2021-05-01 08:00:00 | 0 | 0.855172 | 0.310980 |
| 1 | 2021-05-01 12:00:00 | 0 | 0.850838 | 0.310164 |
| 2 | 2021-05-01 16:00:00 | 0 | 0.361648 | 0.215211 |
| 3 | 2021-05-01 16:00:00 | 0 | 0.193510 | 0.019835 |
| 4 | 2021-05-01 00:00:00 | 1 | 0.554176 | 0.326982 |
| 5 | 2021-05-01 04:00:00 | 1 | 0.091873 | 0.590022 |
| 6 | 2021-05-01 16:00:00 | 1 | 0.349898 | 0.587218 |
Transforms a pandas series of dtype datetime64 (of any freq) or DatetimeIndex into 2 float arrays
Available options: microsecond, millisecond, second, minute, hour, day = day_of_month = dayofmonth, day_of_week = weekday = dayofweek, day_of_year = dayofyear, week = week_of_year = weekofyear, month and year

















| date | id | feature1 | feature2 | dow_sin | dow_cos | |
|---|---|---|---|---|---|---|
| 0 | 2021-05-01 00:00:00 | 0 | 0.805289 | 0.527839 | -0.974928 | -0.222521 |
| 1 | 2021-05-01 04:00:00 | 0 | 0.894198 | 0.390309 | -0.974928 | -0.222521 |
| 2 | 2021-05-01 08:00:00 | 0 | 0.855172 | 0.310980 | -0.974928 | -0.222521 |
| 3 | 2021-05-01 12:00:00 | 0 | 0.850838 | 0.310164 | -0.974928 | -0.222521 |
| 4 | 2021-05-01 16:00:00 | 0 | 0.361648 | 0.215211 | -0.974928 | -0.222521 |
| 5 | 2021-05-01 20:00:00 | 0 | 0.193510 | 0.019835 | -0.974928 | -0.222521 |
| 6 | 2021-05-01 00:00:00 | 1 | 0.554176 | 0.326982 | -0.974928 | -0.222521 |
| 7 | 2021-05-01 04:00:00 | 1 | 0.091873 | 0.590022 | -0.974928 | -0.222521 |
| 8 | 2021-05-01 08:00:00 | 1 | 0.889303 | 0.811452 | -0.974928 | -0.222521 |
| 9 | 2021-05-01 12:00:00 | 1 | 0.108772 | 0.656533 | -0.974928 | -0.222521 |
| 10 | 2021-05-01 16:00:00 | 1 | 0.349898 | 0.587218 | -0.974928 | -0.222521 |
| 11 | 2021-05-01 20:00:00 | 1 | 0.065970 | 0.115706 | -0.974928 | -0.222521 |
Number of sequence steps from previous, to next and/or to nearest real value along the last dimension of 3D arrays or tensors
Number of sequence steps to nearest real value along the last dimension of 3D arrays or tensors
Number of sequence steps to next real value along the last dimension of 3D arrays or tensors
Number of sequence steps since previous real value along the last dimension of 3D arrays or tensors
# Random 3D tensor (1 sample, 2 variables, 8 steps) used to exercise the gap functions.
t = torch.rand(1, 2, 8)
# Tensor.numpy() returns an array that shares memory with the CPU tensor, so the NaNs
# written into `t` on the next line are also visible through `arr`.
arr = t.numpy()
t[t <.6] = np.nan
# The smallest nearest-gap must lie in [0, 1] for both the tensor and the array input.
test_ge(nearest_gaps(t).min().item(), 0)
test_ge(nearest_gaps(arr).min(), 0)
test_le(nearest_gaps(t).min().item(), 1)
test_le(nearest_gaps(arr).min(), 1)
# Gap outputs must not contain NaNs themselves.
test_eq(torch.isnan(forward_gaps(t)).sum(), 0)
test_eq(np.isnan(forward_gaps(arr)).sum(), 0)
# NOTE(review): 2 input variables -> 6 output variables, presumably 3 gap types
# (previous / next / nearest) per variable — confirm against get_gaps.
ag = get_gaps(t)
test_eq(ag.shape, (1,6,8))
test_eq(torch.isnan(ag).sum(), 0)
Call self as a function.
# Add delta timestamp features for the no groups setting
dates = pd.date_range('2021-05-01', '2021-05-07').values
data = np.zeros((len(dates), 2))
data[:, 0] = dates
data[:, 1] = np.random.rand(len(dates))
cols = ['date', 'feature1']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'feature1': float})
date_df.loc[[1,3,4],'feature1'] = np.nan
date_df| date | feature1 | |
|---|---|---|
| 0 | 2021-05-01 | 0.952453 |
| 1 | 2021-05-02 | NaN |
| 2 | 2021-05-03 | 0.304684 |
| 3 | 2021-05-04 | NaN |
| 4 | 2021-05-05 | NaN |
| 5 | 2021-05-06 | 0.260937 |
| 6 | 2021-05-07 | 0.542962 |
# No groups
# date_df has feature1 set to NaN at rows 1, 3 and 4. The expected columns below encode:
#   dt_fwd     -> steps since the previous real (non-NaN) value (1 at the first row)
#   dt_bwd     -> steps to the next real value (1 at the last row)
#   dt_nearest -> the element-wise minimum of the two
expected_output_df = date_df.copy()
expected_output_df['feature1_dt_fwd'] = np.array([1,1,2,1,2,3,1])
expected_output_df['feature1_dt_bwd'] = np.array([2,1,3,2,1,1,1])
expected_output_df['feature1_dt_nearest'] = np.array([1,1,2,1,1,1,1])
display(expected_output_df)
# normalize=False keeps the gaps as the raw step counts compared above.
output_df = add_delta_timestamp_cols(date_df, cols='feature1', normalize=False)
test_eq(expected_output_df, output_df)| date | feature1 | feature1_dt_fwd | feature1_dt_bwd | feature1_dt_nearest | |
|---|---|---|---|---|---|
| 0 | 2021-05-01 | 0.952453 | 1 | 2 | 1 |
| 1 | 2021-05-02 | NaN | 1 | 1 | 1 |
| 2 | 2021-05-03 | 0.304684 | 2 | 3 | 2 |
| 3 | 2021-05-04 | NaN | 1 | 2 | 1 |
| 4 | 2021-05-05 | NaN | 2 | 1 | 1 |
| 5 | 2021-05-06 | 0.260937 | 3 | 1 | 1 |
| 6 | 2021-05-07 | 0.542962 | 1 | 1 | 1 |
# Add delta timestamp features within a group
dates = pd.date_range('2021-05-01', '2021-05-07').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 3))
data[:, 0] = dates
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))
cols = ['date', 'id', 'feature1']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float})
date_df.loc[[1,3,4,8,9,11],'feature1'] = np.nan
date_df| date | id | feature1 | |
|---|---|---|---|
| 0 | 2021-05-01 | 0 | 0.750825 |
| 1 | 2021-05-02 | 0 | NaN |
| 2 | 2021-05-03 | 0 | 0.997844 |
| 3 | 2021-05-04 | 0 | NaN |
| 4 | 2021-05-05 | 0 | NaN |
| 5 | 2021-05-06 | 0 | 0.123967 |
| 6 | 2021-05-07 | 0 | 0.809573 |
| 7 | 2021-05-01 | 1 | 0.745672 |
| 8 | 2021-05-02 | 1 | NaN |
| 9 | 2021-05-03 | 1 | NaN |
| 10 | 2021-05-04 | 1 | 0.187990 |
| 11 | 2021-05-05 | 1 | NaN |
| 12 | 2021-05-06 | 1 | 0.569132 |
| 13 | 2021-05-07 | 1 | 0.977659 |
# groupby='id'
expected_output_df = date_df.copy()
expected_output_df['feature1_dt_fwd'] = np.array([1,1,2,1,2,3,1,1,1,2,3,1,2,1])
expected_output_df['feature1_dt_bwd'] = np.array([2,1,3,2,1,1,1,3,2,1,2,1,1,1])
expected_output_df['feature1_dt_nearest'] = np.array([1,1,2,1,1,1,1,1,1,1,2,1,1,1])
display(expected_output_df)
output_df = add_delta_timestamp_cols(date_df, cols='feature1', groupby='id', normalize=False)
test_eq(expected_output_df, output_df)| date | id | feature1 | feature1_dt_fwd | feature1_dt_bwd | feature1_dt_nearest | |
|---|---|---|---|---|---|---|
| 0 | 2021-05-01 | 0 | 0.750825 | 1 | 2 | 1 |
| 1 | 2021-05-02 | 0 | NaN | 1 | 1 | 1 |
| 2 | 2021-05-03 | 0 | 0.997844 | 2 | 3 | 2 |
| 3 | 2021-05-04 | 0 | NaN | 1 | 2 | 1 |
| 4 | 2021-05-05 | 0 | NaN | 2 | 1 | 1 |
| 5 | 2021-05-06 | 0 | 0.123967 | 3 | 1 | 1 |
| 6 | 2021-05-07 | 0 | 0.809573 | 1 | 1 | 1 |
| 7 | 2021-05-01 | 1 | 0.745672 | 1 | 3 | 1 |
| 8 | 2021-05-02 | 1 | NaN | 1 | 2 | 1 |
| 9 | 2021-05-03 | 1 | NaN | 2 | 1 | 1 |
| 10 | 2021-05-04 | 1 | 0.187990 | 3 | 2 | 2 |
| 11 | 2021-05-05 | 1 | NaN | 1 | 1 | 1 |
| 12 | 2021-05-06 | 1 | 0.569132 | 2 | 1 | 1 |
| 13 | 2021-05-07 | 1 | 0.977659 | 1 | 1 | 1 |
SlidingWindow and SlidingWindowPanel are 2 useful functions that will allow you to create an array with segments of a pandas dataframe based on multiple criteria.
def SlidingWindow(
window_len:int, # length of lookback window
stride:Union[None, int]=1, # n datapoints the window is moved ahead along the sequence. Default: 1. If None, stride=window_len (no overlap)
start:int=0, # determines the step where the first window is applied: 0 (default) or a given step (int). Previous steps will be discarded.
pad_remainder:bool=False, # allows to pad remainder subsequences when the sliding window is applied and get_y == [] (unlabeled data).
padding:str='post', # 'pre' or 'post' (optional, defaults to 'post'): pad either before or after each sequence. If pad_remainder == False, it indicates the starting point to create the sequence ('pre' from the end, and 'post' from the beginning)
padding_value:float=nan, # value (float) that will be used for padding. Default: np.nan
add_padding_feature:bool=True, # add an additional feature indicating whether each timestep is padded (1) or not (0).
get_x:Union[None, int, list]=None, # indices of columns that contain the independent variable (xs). If None, all data will be used as x.
get_y:Union[None, int, list]=None, # indices of columns that contain the target (ys). If None, all data will be used as y. [] means no y data is created (unlabeled data).
y_func:Optional[callable]=None, # optional function to calculate the ys based on the get_y col/s and each y sub-window. y_func must be a function applied to axis=1!
output_processor:Optional[callable]=None, # optional function to process the final output (X (and y if available)). This is useful when some values need to be removed. The function should take X and y (even if it's None) as arguments.
copy:bool=False, # copy the original object to avoid changes in it.
horizon:Union[int, list]=1, # number of future datapoints to predict (y). If get_y is [] horizon will be set to 0.
seq_first:bool=True, # True if input shape (seq_len, n_vars), False if input shape (n_vars, seq_len)
sort_by:Optional[list]=None, # column/s used for sorting the array in ascending order
ascending:bool=True, # used in sorting
check_leakage:bool=True, # checks if there's leakage in the output between X and y
):
Applies a sliding window to a 1d or 2d input (np.ndarray, torch.Tensor or pd.DataFrame)
Input:
You can use np.ndarray, pd.DataFrame or torch.Tensor as input
shape: (seq_len, ) or (seq_len, n_vars) if seq_first=True else (n_vars, seq_len)
input shape: (13, 3)
array([[[ 0., 1., 2., 3., 4.],
[ 0., 1., 2., 3., 4.],
[ 0., 1., 2., 3., 4.],
[ 0., 0., 0., 0., 0.]],
[[ 5., 6., 7., 8., 9.],
[ 5., 6., 7., 8., 9.],
[ 5., 6., 7., 8., 9.],
[ 0., 0., 0., 0., 0.]],
[[10., 11., 12., nan, nan],
[10., 11., 12., nan, nan],
[10., 11., 12., nan, nan],
[ 0., 0., 0., 1., 1.]]])
input shape: (10,)
(#5) [(array([[0, 1, 2, 3, 4]]),),(array([[1, 2, 3, 4, 5]]),),(array([[2, 3, 4, 5, 6]]),),(array([[3, 4, 5, 6, 7]]),),(array([[4, 5, 6, 7, 8]]),)]
input shape: (10,)
[(array([[0, 1, 2, 3, 4]]), np.int64(5)), (array([[1, 2, 3, 4, 5]]), np.int64(6)), (array([[2, 3, 4, 5, 6]]), np.int64(7)), (array([[3, 4, 5, 6, 7]]), np.int64(8)), (array([[4, 5, 6, 7, 8]]), np.int64(9))]
input shape: (10,)
[(array([[0, 1, 2, 3, 4]]), array([5, 6])), (array([[1, 2, 3, 4, 5]]), array([6, 7])), (array([[2, 3, 4, 5, 6]]), array([7, 8])), (array([[3, 4, 5, 6, 7]]), array([8, 9]))]
input shape: (1, 10)
[(array([[0, 1, 2, 3, 4]]), array([5, 6])), (array([[1, 2, 3, 4, 5]]), array([6, 7])), (array([[2, 3, 4, 5, 6]]), array([7, 8])), (array([[3, 4, 5, 6, 7]]), array([8, 9]))]
input shape: (1, 10)
[(array([[0, 1, 2, 3, 4]]), array([5, 6])), (array([[1, 2, 3, 4, 5]]), array([6, 7])), (array([[2, 3, 4, 5, 6]]), array([7, 8])), (array([[3, 4, 5, 6, 7]]), array([8, 9]))]
input shape: (1, 10)
[(array([[0, 1, 2, 3, 4]]), np.int64(5)), (array([[3, 4, 5, 6, 7]]), np.int64(8))]
input shape: (20,)
[(array([[3, 4, 5, 6, 7]]), np.int64(8)), (array([[ 8, 9, 10, 11, 12]]), np.int64(13)), (array([[13, 14, 15, 16, 17]]), np.int64(18))]
input shape: (20,)
| var | |
|---|---|
| 0 | 0 |
| 1 | 1 |
| 2 | 2 |
| 3 | 3 |
| 4 | 4 |
| 5 | 5 |
| 6 | 6 |
| 7 | 7 |
| 8 | 8 |
| 9 | 9 |
| 10 | 10 |
| 11 | 11 |
| 12 | 12 |
| 13 | 13 |
| 14 | 14 |
| 15 | 15 |
| 16 | 16 |
| 17 | 17 |
| 18 | 18 |
| 19 | 19 |
[(array([[0, 1, 2, 3, 4]]), np.int64(5)), (array([[5, 6, 7, 8, 9]]), np.int64(10)), (array([[10, 11, 12, 13, 14]]), np.int64(15))]
input shape: (20,)
| var | |
|---|---|
| 0 | 0 |
| 1 | 1 |
| 2 | 2 |
| 3 | 3 |
| 4 | 4 |
| 5 | 5 |
| 6 | 6 |
| 7 | 7 |
| 8 | 8 |
| 9 | 9 |
| 10 | 10 |
| 11 | 11 |
| 12 | 12 |
| 13 | 13 |
| 14 | 14 |
| 15 | 15 |
| 16 | 16 |
| 17 | 17 |
| 18 | 18 |
| 19 | 19 |
[(array([[0, 1, 2, 3, 4]]), np.int64(5)), (array([[1, 2, 3, 4, 5]]), np.int64(6)), (array([[2, 3, 4, 5, 6]]), np.int64(7)), (array([[3, 4, 5, 6, 7]]), np.int64(8)), (array([[4, 5, 6, 7, 8]]), np.int64(9)), (array([[5, 6, 7, 8, 9]]), np.int64(10)), (array([[ 6, 7, 8, 9, 10]]), np.int64(11)), (array([[ 7, 8, 9, 10, 11]]), np.int64(12)), (array([[ 8, 9, 10, 11, 12]]), np.int64(13)), (array([[ 9, 10, 11, 12, 13]]), np.int64(14)), (array([[10, 11, 12, 13, 14]]), np.int64(15)), (array([[11, 12, 13, 14, 15]]), np.int64(16)), (array([[12, 13, 14, 15, 16]]), np.int64(17)), (array([[13, 14, 15, 16, 17]]), np.int64(18)), (array([[14, 15, 16, 17, 18]]), np.int64(19))]
input shape: (20,)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| var | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 |
[(array([[0, 1, 2, 3, 4]]), np.int64(5)), (array([[5, 6, 7, 8, 9]]), np.int64(10)), (array([[10, 11, 12, 13, 14]]), np.int64(15))]
wl = 5
n_vars = 3
t = (torch.stack(n_vars * [torch.arange(10)]).T * tensor([1, 10, 100]))
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=[f'var_{i}' for i in range(n_vars)])
display(df)
X, y = SlidingWindow(wl, horizon=1)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (n_vars, wl))input shape: torch.Size([10, 3])
| var_0 | var_1 | var_2 | |
|---|---|---|---|
| 0 | 0 | 0 | 0 |
| 1 | 1 | 10 | 100 |
| 2 | 2 | 20 | 200 |
| 3 | 3 | 30 | 300 |
| 4 | 4 | 40 | 400 |
| 5 | 5 | 50 | 500 |
| 6 | 6 | 60 | 600 |
| 7 | 7 | 70 | 700 |
| 8 | 8 | 80 | 800 |
| 9 | 9 | 90 | 900 |
[(array([[ 0, 1, 2, 3, 4],
[ 0, 10, 20, 30, 40],
[ 0, 100, 200, 300, 400]]), array([ 5, 50, 500])), (array([[ 1, 2, 3, 4, 5],
[ 10, 20, 30, 40, 50],
[100, 200, 300, 400, 500]]), array([ 6, 60, 600])), (array([[ 2, 3, 4, 5, 6],
[ 20, 30, 40, 50, 60],
[200, 300, 400, 500, 600]]), array([ 7, 70, 700])), (array([[ 3, 4, 5, 6, 7],
[ 30, 40, 50, 60, 70],
[300, 400, 500, 600, 700]]), array([ 8, 80, 800])), (array([[ 4, 5, 6, 7, 8],
[ 40, 50, 60, 70, 80],
[400, 500, 600, 700, 800]]), array([ 9, 90, 900]))]
wl = 5
n_vars = 3
t = (torch.stack(n_vars * [torch.arange(10)]).T * tensor([1, 10, 100]))
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=[f'var_{i}' for i in range(n_vars)])
display(df)
X, y = SlidingWindow(wl, horizon=1, get_y="var_0")(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (n_vars, wl))input shape: torch.Size([10, 3])
| var_0 | var_1 | var_2 | |
|---|---|---|---|
| 0 | 0 | 0 | 0 |
| 1 | 1 | 10 | 100 |
| 2 | 2 | 20 | 200 |
| 3 | 3 | 30 | 300 |
| 4 | 4 | 40 | 400 |
| 5 | 5 | 50 | 500 |
| 6 | 6 | 60 | 600 |
| 7 | 7 | 70 | 700 |
| 8 | 8 | 80 | 800 |
| 9 | 9 | 90 | 900 |
[(array([[ 0, 1, 2, 3, 4],
[ 0, 10, 20, 30, 40],
[ 0, 100, 200, 300, 400]]), np.int64(5)), (array([[ 1, 2, 3, 4, 5],
[ 10, 20, 30, 40, 50],
[100, 200, 300, 400, 500]]), np.int64(6)), (array([[ 2, 3, 4, 5, 6],
[ 20, 30, 40, 50, 60],
[200, 300, 400, 500, 600]]), np.int64(7)), (array([[ 3, 4, 5, 6, 7],
[ 30, 40, 50, 60, 70],
[300, 400, 500, 600, 700]]), np.int64(8)), (array([[ 4, 5, 6, 7, 8],
[ 40, 50, 60, 70, 80],
[400, 500, 600, 700, 800]]), np.int64(9))]
wl = 5
n_vars = 3
t = (torch.stack(n_vars * [torch.arange(10)]).T * tensor([1, 10, 100]))
print('input shape:', t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
X, y = SlidingWindow(wl, horizon=1, get_x=columns[:-1], get_y='target')(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (n_vars-1, wl))
test_eq(items[0][1].shape, ())input shape: torch.Size([10, 3])
| var_0 | var_1 | target | |
|---|---|---|---|
| 0 | 0 | 0 | 0 |
| 1 | 1 | 10 | 100 |
| 2 | 2 | 20 | 200 |
| 3 | 3 | 30 | 300 |
| 4 | 4 | 40 | 400 |
| 5 | 5 | 50 | 500 |
| 6 | 6 | 60 | 600 |
| 7 | 7 | 70 | 700 |
| 8 | 8 | 80 | 800 |
| 9 | 9 | 90 | 900 |
[(array([[ 0, 1, 2, 3, 4],
[ 0, 10, 20, 30, 40]]), np.int64(500)), (array([[ 1, 2, 3, 4, 5],
[10, 20, 30, 40, 50]]), np.int64(600)), (array([[ 2, 3, 4, 5, 6],
[20, 30, 40, 50, 60]]), np.int64(700)), (array([[ 3, 4, 5, 6, 7],
[30, 40, 50, 60, 70]]), np.int64(800)), (array([[ 4, 5, 6, 7, 8],
[40, 50, 60, 70, 80]]), np.int64(900))]
(1000, 3)

(200, 2, 5) (200,)
wl = 5
n_vars = 3
t = (np.random.rand(100, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
X, y = SlidingWindow(5, horizon=0, get_x=columns[:-1], get_y='target')(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)(100, 3)
| var_0 | var_1 | target | |
|---|---|---|---|
| 0 | -0.201682 | 0.271773 | 0.291233 |
| 1 | -0.381358 | 0.432786 | 0.510767 |
| 2 | -0.650896 | 0.092048 | 0.169614 |
| 3 | -0.999445 | 0.099044 | 0.602287 |
| 4 | -0.639507 | 0.031952 | 0.890449 |
| ... | ... | ... | ... |
| 95 | -3.714142 | -0.574150 | 3.175535 |
| 96 | -3.918679 | -0.790164 | 2.748960 |
| 97 | -3.606800 | -1.181229 | 2.476988 |
| 98 | -3.249810 | -0.713153 | 2.029528 |
| 99 | -3.440358 | -0.920634 | 2.496126 |
100 rows × 3 columns
(96, 2, 5) (96,)
seq_len = 100
n_vars = 5
t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
X, y = SlidingWindow(5, stride=1, horizon=0, get_x=columns[:-1], get_y='target', seq_first=True)(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)(100, 5)
| var_0 | var_1 | var_2 | var_3 | target | |
|---|---|---|---|---|---|
| 0 | 0.197380 | -0.430499 | 0.240722 | -0.145831 | -0.019374 |
| 1 | 0.283265 | -0.723347 | -0.070873 | -0.555459 | -0.240736 |
| 2 | 0.660410 | -0.528758 | 0.427163 | -0.238760 | 0.114170 |
| 3 | 0.797657 | -0.546279 | 0.528505 | -0.105944 | 0.101699 |
| 4 | 1.044382 | -0.662189 | 0.502906 | -0.205311 | 0.029346 |
| ... | ... | ... | ... | ... | ... |
| 95 | -4.779892 | -1.745513 | -1.638753 | 4.385228 | -5.031841 |
| 96 | -5.119851 | -2.122463 | -1.260551 | 3.968454 | -5.082492 |
| 97 | -5.425792 | -2.100930 | -0.952559 | 4.388861 | -4.830615 |
| 98 | -5.034651 | -1.747375 | -0.764498 | 4.464461 | -4.341832 |
| 99 | -5.144296 | -1.689885 | -0.282443 | 4.222645 | -4.038251 |
100 rows × 5 columns
(96, 4, 5) (96,)
seq_len = 100
n_vars = 5
t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)] + ['target']
df = pd.DataFrame(t, columns=columns).T
display(df)
X, y = SlidingWindow(5, stride=1, horizon=0, get_x=columns[:-1], get_y='target', seq_first=False)(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)(100, 5)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| var_0 | 0.470935 | 0.705640 | 0.558800 | 0.110090 | 0.445475 | 0.553075 | 0.357661 | 0.541565 | 0.588213 | 0.454781 | ... | 2.991845 | 2.840564 | 2.969434 | 2.637321 | 3.004290 | 3.217118 | 3.057600 | 3.216079 | 3.131275 | 3.267005 |
| var_1 | 0.134900 | 0.320551 | 0.164657 | 0.562128 | 0.896223 | 0.579004 | 0.244454 | 0.477033 | 0.753045 | 0.318959 | ... | 1.448812 | 1.094919 | 1.403493 | 1.554838 | 1.286745 | 1.534035 | 1.551969 | 1.188953 | 1.515799 | 1.969358 |
| var_2 | -0.190892 | -0.516197 | -0.618403 | -0.420175 | -0.183219 | -0.260192 | -0.300966 | 0.110512 | 0.255440 | -0.112471 | ... | -2.546015 | -2.247514 | -2.016816 | -2.251076 | -2.317142 | -2.583731 | -2.925544 | -2.968128 | -3.203843 | -3.367078 |
| var_3 | 0.137190 | 0.040264 | -0.441738 | -0.265404 | -0.065734 | 0.170627 | 0.122840 | -0.045392 | -0.181513 | 0.272935 | ... | 1.769042 | 1.624776 | 1.517051 | 1.826321 | 1.512124 | 1.401662 | 1.279692 | 1.483385 | 1.112587 | 1.240599 |
| target | -0.045081 | 0.058416 | 0.493379 | 0.273270 | 0.701173 | 0.408625 | 0.731390 | 0.309546 | 0.136900 | -0.098329 | ... | -0.851239 | -0.967925 | -1.273264 | -1.298938 | -1.644132 | -2.041759 | -1.990985 | -1.638094 | -1.913109 | -1.891388 |
5 rows × 100 columns
(96, 4, 5) (96,)
seq_len = 100
n_vars = 5
t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)] + ['target']
df = pd.DataFrame(t, columns=columns).T
display(df)
X, y = SlidingWindow(5, stride=None, horizon=0, get_x=columns[:-1], get_y='target', seq_first=False)(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)(100, 5)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| var_0 | 0.475880 | 0.475429 | 0.290398 | 0.387992 | 0.628579 | 0.788049 | 1.066708 | 0.973698 | 0.935035 | 1.434445 | ... | -4.029041 | -3.736176 | -3.459172 | -3.375216 | -3.407535 | -2.981360 | -3.263693 | -3.089960 | -2.599946 | -3.045968 |
| var_1 | -0.439460 | -0.929588 | -0.943197 | -0.894799 | -0.917105 | -0.692846 | -0.829524 | -1.131488 | -0.931603 | -1.186286 | ... | -2.865602 | -2.894562 | -2.745993 | -2.931338 | -2.435319 | -2.706789 | -2.642565 | -2.497788 | -2.868398 | -2.461368 |
| var_2 | 0.130447 | 0.215466 | 0.486535 | 0.893396 | 1.215974 | 1.439646 | 1.888241 | 1.770271 | 1.364136 | 1.200638 | ... | -0.422184 | -0.274803 | -0.760513 | -0.806473 | -0.545518 | -0.281999 | -0.141130 | -0.108504 | -0.395920 | -0.768391 |
| var_3 | 0.436283 | 0.216766 | 0.243471 | -0.172380 | 0.181806 | 0.058702 | 0.336831 | 0.723804 | 1.165371 | 1.035775 | ... | 3.979013 | 3.596084 | 3.235822 | 3.166613 | 2.667026 | 2.591594 | 2.738546 | 2.997448 | 3.404949 | 3.229587 |
| target | 0.127113 | -0.107228 | 0.241138 | 0.674576 | 0.522773 | 0.489965 | 0.261877 | 0.705663 | 0.417860 | 0.409656 | ... | -0.599095 | -0.588000 | -0.457287 | -0.517235 | -0.216249 | 0.246780 | 0.587858 | 0.384807 | 0.678492 | 0.500024 |
5 rows × 100 columns
(20, 4, 5) (20,)
seq_len = 100
n_vars = 5
t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
X, y = SlidingWindow(5, stride=1, horizon=0, get_x=columns[:-1], get_y='target', seq_first=True)(df)
splits = TrainValidTestSplitter(valid_size=.2, shuffle=False)(y)
X.shape, y.shape, splits(100, 5)
| var_0 | var_1 | var_2 | var_3 | target | |
|---|---|---|---|---|---|
| 0 | -0.105450 | -0.322755 | -0.410378 | -0.211661 | -0.089199 |
| 1 | 0.198704 | -0.487194 | -0.510372 | -0.569510 | -0.215464 |
| 2 | -0.069287 | -0.127578 | -0.907798 | -0.905169 | -0.185565 |
| 3 | -0.060425 | -0.293397 | -0.811032 | -1.346518 | -0.193660 |
| 4 | 0.180223 | -0.559458 | -0.798003 | -1.216384 | 0.181963 |
| ... | ... | ... | ... | ... | ... |
| 95 | 0.921482 | -5.040741 | -4.184903 | 4.600800 | 3.332264 |
| 96 | 0.850969 | -5.512062 | -4.320107 | 4.708977 | 3.669605 |
| 97 | 1.243833 | -5.509274 | -4.380339 | 4.768982 | 3.195507 |
| 98 | 0.787162 | -5.475033 | -4.460634 | 5.073610 | 2.797020 |
| 99 | 1.210299 | -5.366726 | -4.693334 | 4.760788 | 2.598245 |
100 rows × 5 columns
((96, 4, 5),
(96,),
((#77) [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19...],
(#19) [77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]))
| col1 | col2 | target | |
|---|---|---|---|
| 0 | 0.0 | 0.0 | 0 |
| 1 | 0.1 | 0.1 | 1 |
| 2 | 0.2 | 0.2 | 2 |
| 3 | 0.3 | 0.3 | 3 |
| 4 | 0.4 | 0.4 | 4 |
| 5 | 0.5 | 0.5 | 5 |
| 6 | 0.6 | 0.6 | 6 |
| 7 | 0.7 | 0.7 | 7 |
| 8 | 0.8 | 0.8 | 8 |
| 9 | 0.9 | 0.9 | 9 |
| 10 | 1.0 | 1.0 | 10 |
| sample_id | var1 | var2 | target | |
|---|---|---|---|---|
| 0 | 1 | 0.0 | 0.0 | 0 |
| 1 | 1 | 1.0 | 10.0 | 1 |
| 2 | 1 | 2.0 | 20.0 | 2 |
| 3 | 1 | 3.0 | 30.0 | 3 |
| 4 | 1 | 4.0 | 40.0 | 4 |
| 5 | 1 | 5.0 | 50.0 | 5 |
| 6 | 1 | 6.0 | 60.0 | 6 |
| 7 | 1 | 7.0 | 70.0 | 7 |
| 8 | 1 | 8.0 | 80.0 | 8 |
| 9 | 1 | 9.0 | 90.0 | 9 |
| 10 | 1 | 10.0 | 100.0 | 10 |
| 11 | 1 | 11.0 | 110.0 | 11 |
| 12 | 1 | 12.0 | 120.0 | 12 |
X, y = SlidingWindow(window_len=3, stride=2, start=3, pad_remainder=False, padding="pre", padding_value=np.nan, add_padding_feature=False,
get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
ascending=True, check_leakage=True)(df)
test_eq(X.shape, (2, 2, 3))
test_eq(y.shape, (2, 4))
X, y(array([[[ 4., 5., 6.],
[40., 50., 60.]],
[[ 6., 7., 8.],
[60., 70., 80.]]]),
array([[ 7, 8, 9, 10],
[ 9, 10, 11, 12]]))
X, y = SlidingWindow(window_len=3, stride=2, start=3, pad_remainder=True, padding="pre", padding_value=np.nan, add_padding_feature=False,
get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
ascending=True, check_leakage=True)(df)
test_eq(X.shape, (3, 2, 3))
test_eq(y.shape, (3, 4))
X, y(array([[[nan, 3., 4.],
[nan, 30., 40.]],
[[ 4., 5., 6.],
[40., 50., 60.]],
[[ 6., 7., 8.],
[60., 70., 80.]]]),
array([[ 5, 6, 7, 8],
[ 7, 8, 9, 10],
[ 9, 10, 11, 12]]))
X, y = SlidingWindow(window_len=3, stride=2, start=3, pad_remainder=False, padding="post", padding_value=np.nan, add_padding_feature=False,
get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
ascending=True, check_leakage=True)(df)
test_eq(X.shape, (2, 2, 3))
test_eq(y.shape, (2, 4))
X, y(array([[[ 3., 4., 5.],
[30., 40., 50.]],
[[ 5., 6., 7.],
[50., 60., 70.]]]),
array([[ 6, 7, 8, 9],
[ 8, 9, 10, 11]]))
X, y = SlidingWindow(window_len=3, stride=2, start=3, pad_remainder=True, padding="post", padding_value=np.nan, add_padding_feature=False,
get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
ascending=True, check_leakage=True)(df)
test_eq(X.shape, (3, 2, 3))
test_eq(y.shape, (3, 4))
X, y(array([[[ 3., 4., 5.],
[30., 40., 50.]],
[[ 5., 6., 7.],
[50., 60., 70.]],
[[ 7., 8., 9.],
[70., 80., 90.]]]),
array([[ 6., 7., 8., 9.],
[ 8., 9., 10., 11.],
[10., 11., 12., nan]]))
X, y = SlidingWindow(window_len=10, stride=2, start=3, pad_remainder=True, padding="pre", padding_value=np.nan, add_padding_feature=False,
get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
ascending=True, check_leakage=True)(df)
test_eq(X.shape, (1, 2, 10))
test_eq(y.shape, (1, 4))
X, y(array([[[nan, nan, nan, nan, 3., 4., 5., 6., 7., 8.],
[nan, nan, nan, nan, 30., 40., 50., 60., 70., 80.]]]),
array([[ 9, 10, 11, 12]]))
X, y = SlidingWindow(window_len=10, stride=2, start=3, pad_remainder=True, padding="post", padding_value=np.nan, add_padding_feature=False,
get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
ascending=True, check_leakage=True)(df)
test_eq(X.shape, (1, 2, 10))
test_eq(y.shape, (1, 4))
X, y(array([[[ 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.],
[ 30., 40., 50., 60., 70., 80., 90., 100., 110., 120.]]]),
array([[nan, nan, nan, nan]]))
def SlidingWindowPanel(
window_len:int, unique_id_cols:list, stride:Union[None, int]=1, start:int=0, pad_remainder:bool=False,
padding:str='post', padding_value:float=nan, add_padding_feature:bool=True, get_x:Union[None, int, list]=None,
get_y:Union[None, int, list]=None, y_func:Optional[callable]=None, output_processor:Optional[callable]=None,
copy:bool=False, horizon:Union[int, list]=1, seq_first:bool=True, sort_by:Optional[list]=None,
ascending:bool=True, check_leakage:bool=True, return_key:bool=False, verbose:bool=True
):
Applies a sliding window to a pd.DataFrame.
Args: window_len = length of lookback window unique_id_cols = pd.DataFrame columns that will be used to identify a time series for each entity. stride = n datapoints the window is moved ahead along the sequence. Default: 1. If None, stride=window_len (no overlap) start = determines the step where the first window is applied: 0 (default), a given step (int), or random within the 1st stride (None). pad_remainder = allows to pad remainder subsequences when the sliding window is applied and get_y == [] (unlabeled data). padding = ‘pre’ or ‘post’ (optional, defaults to ‘post’): pad either before or after each sequence. If pad_remainder == False, it indicates the starting point to create the sequence (‘pre’ from the end, and ‘post’ from the beginning) padding_value = value (float) that will be used for padding. Default: np.nan add_padding_feature = add an additional feature indicating whether each timestep is padded (1) or not (0). horizon = number of future datapoints to predict (y). If get_y is [] horizon will be set to 0. * 0 for last step in each sub-window. * n > 0 for a range of n future steps (1 to n). * n < 0 for a range of n past steps (-n + 1 to 0). * list : for those exact timesteps. get_x = indices of columns that contain the independent variable (xs). If None, all data will be used as x. get_y = indices of columns that contain the target (ys). If None, all data will be used as y. [] means no y data is created (unlabeled data). y_func = function to calculate the ys based on the get_y col/s and each y sub-window. y_func must be a function applied to axis=1! output_processor = optional function to filter output (X (and y if available)). This is useful when some values need to be removed. The function should take X and y (even if it’s None) as arguments. copy = copy the original object to avoid changes in it.
seq_first = True if input shape (seq_len, n_vars), False if input shape (n_vars, seq_len) sort_by = column/s used for sorting the array in ascending order ascending = used in sorting check_leakage = checks if there’s leakage in the output between X and y return_key = when True, the key corresponding to unique_id_cols for each sample is returned verbose = controls verbosity. True or 1 displays progress bar. 2 or more shows records that cannot be created due to their length.
Input: You can use np.ndarray, pd.DataFrame or torch.Tensor as input shape: (seq_len, ) or (seq_len, n_vars) if seq_first=True else (n_vars, seq_len)
samples = 100_000
wl = 5
n_vars = 10
t = (torch.stack(n_vars * [torch.arange(samples)]).T * tensor([10**i for i in range(n_vars)]))
df = pd.DataFrame(t, columns=[f'var_{i}' for i in range(n_vars)])
df['time'] = np.arange(len(t))
df['device'] = 0
df['target'] = np.random.randint(0, 2, len(df))
df2 = df.copy()
df3 = df.copy()
cols = ['var_0', 'var_1', 'var_2', 'device', 'target']
df2[cols] = df2[cols] + 1
df3[cols] = df3[cols] + 2
df2 = df2.loc[:3]
df['region'] = 'A'
df2['region'] = 'A'
df3['region'] = 'B'
df = pd.concat([df, df2, df3], ignore_index=True)
df['index'] = np.arange(len(df))
df = df.sample(frac=1).reset_index(drop=True)
display(df.head())
df.shape| var_0 | var_1 | var_2 | var_3 | var_4 | var_5 | var_6 | var_7 | var_8 | var_9 | time | device | target | region | index | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 34227 | 342270 | 3422700 | 34227000 | 342270000 | 3422700000 | 34227000000 | 342270000000 | 3422700000000 | 34227000000000 | 34227 | 0 | 0 | A | 34227 |
| 1 | 5831 | 58292 | 582902 | 5829000 | 58290000 | 582900000 | 5829000000 | 58290000000 | 582900000000 | 5829000000000 | 5829 | 2 | 3 | B | 105833 |
| 2 | 62373 | 623712 | 6237102 | 62371000 | 623710000 | 6237100000 | 62371000000 | 623710000000 | 6237100000000 | 62371000000000 | 62371 | 2 | 2 | B | 162375 |
| 3 | 91153 | 911512 | 9115102 | 91151000 | 911510000 | 9115100000 | 91151000000 | 911510000000 | 9115100000000 | 91151000000000 | 91151 | 2 | 3 | B | 191155 |
| 4 | 63710 | 637100 | 6371000 | 63710000 | 637100000 | 6371000000 | 63710000000 | 637100000000 | 6371000000000 | 63710000000000 | 63710 | 0 | 1 | A | 63710 |
(200004, 15)
processing data...
...data processed
concatenating X...
...X concatenated
concatenating y...
...y concatenated
((199992, 10, 5), (199992,))
processing data...
...data processed
concatenating X...
...X concatenated
concatenating y...
...y concatenated
((199992, 10, 5), (199992,), (199992,))
processing data...
...data processed
concatenating X...
...X concatenated
concatenating y...
...y concatenated
((199992, 10, 5), (199992,))
processing data...
...data processed
concatenating X...
...X concatenated
concatenating y...
...y concatenated
((199982, 10, 5), (199982,))
Identifies padded subsequences in a mask of type float
This function identifies as padded subsequences those where all values == nan from the end of the sequence (last dimension) across all channels, and sets those values to the selected value (default = -1)
Args: mask: boolean or float mask. value: scalar that will be used to identify padded subsequences.
input shape: (13, 3)
tensor([[[ 0., 1., 2., 3., 4.],
[ 0., 1., 2., nan, nan],
[ 0., 1., 2., 3., 4.],
[ 0., 0., 0., 0., 0.]],
[[nan, nan, nan, 8., 9.],
[nan, nan, nan, 8., 9.],
[nan, nan, nan, 8., 9.],
[nan, nan, nan, 0., 0.]],
[[10., 11., 12., nan, nan],
[10., 11., 12., nan, nan],
[10., 11., 12., nan, nan],
[ 0., 0., 0., 1., 1.]]])
tensor([[[0., 0., 0., 0., 0.],
[0., 0., 0., 1., 1.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.]],
[[1., 1., 1., 0., 0.],
[1., 1., 1., 0., 0.],
[1., 1., 1., 0., 0.],
[1., 1., 1., 0., 0.]],
[[0., 0., 0., 1., 1.],
[0., 0., 0., 1., 1.],
[0., 0., 0., 1., 1.],
[0., 0., 0., 0., 0.]]])
def basic_data_preparation_fn(
df, # dataframe to preprocess
drop_duplicates:bool=True, # flag to indicate if rows with duplicate datetime info should be removed
datetime_col:NoneType=None, # str indicating the name of the column/s that contains the datetime info
use_index:bool=False, # flag to indicate if the datetime info is in the index
keep:str='last', # str to indicate what data should be kept in case of duplicate rows
add_missing_datetimes:bool=True, # flag to indicate if missing datetimes should be added
freq:str='1D', # str to indicate the frequency used in the datetime info. Used in case missing timestamps exists
method:NoneType=None, # str indicating the method used to fill data for missing timestamps: None, 'bfill', 'ffill'
sort_by:NoneType=None, # str or list of str to indicate how to sort data. If use_index=True the index will be used to sort the dataframe.
):
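Prepares a dataframe that contains datetime info: optionally drops rows with duplicate datetimes, adds missing datetimes at the given frequency (filling their values with the selected method) and sorts the data.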
df_len = 100
datetime_col = 'datetime'
df = pd.DataFrame(np.arange(df_len), columns=['value'])
df['datetime'] = pd.date_range(pd.to_datetime('1749-03-31'), periods=df_len, freq='1D')
df['type'] = 1
# drop 10 rows at random
df = df.drop(df.sample(10).index)
# add 2 duplicated rows
df = pd.concat([df, df.sample(2)])
display(df)
new_df = basic_data_preparation_fn(df, drop_duplicates=True, datetime_col=datetime_col, use_index=False, keep='last',
add_missing_datetimes=True, freq='1D', method='ffill', sort_by=datetime_col)
display(new_df)| value | datetime | type | |
|---|---|---|---|
| 0 | 0 | 1749-03-31 | 1 |
| 2 | 2 | 1749-04-02 | 1 |
| 3 | 3 | 1749-04-03 | 1 |
| 4 | 4 | 1749-04-04 | 1 |
| 5 | 5 | 1749-04-05 | 1 |
| ... | ... | ... | ... |
| 97 | 97 | 1749-07-06 | 1 |
| 98 | 98 | 1749-07-07 | 1 |
| 99 | 99 | 1749-07-08 | 1 |
| 89 | 89 | 1749-06-28 | 1 |
| 4 | 4 | 1749-04-04 | 1 |
92 rows × 3 columns
| value | datetime | type | |
|---|---|---|---|
| 0 | 0 | 1749-03-31 | 1 |
| 1 | 0 | 1749-04-01 | 1 |
| 2 | 2 | 1749-04-02 | 1 |
| 3 | 3 | 1749-04-03 | 1 |
| 4 | 4 | 1749-04-04 | 1 |
| ... | ... | ... | ... |
| 95 | 95 | 1749-07-04 | 1 |
| 96 | 96 | 1749-07-05 | 1 |
| 97 | 97 | 1749-07-06 | 1 |
| 98 | 98 | 1749-07-07 | 1 |
| 99 | 99 | 1749-07-08 | 1 |
100 rows × 3 columns
Checks if the conversion to float is safe
assert check_safe_conversion(-2**11, 'float16') == True
assert check_safe_conversion(-2**11 - 1, 'float16') == False
assert check_safe_conversion(2**24, 'float32') == True
assert check_safe_conversion(2**24+1, 'float32') == False
assert check_safe_conversion(2**53, 'float64') == True
assert check_safe_conversion(2**53+1, 'float64') == False
df = pd.DataFrame({'a': [1, 2, 3], 'b': [2**24, 2**24+1, 2**24+2]})
assert not check_safe_conversion(df, 'float32')
assert check_safe_conversion(df, 'int32')
assert check_safe_conversion(df, 'float32', cols='a')
assert not check_safe_conversion(df, 'float32', cols='b')-2147483648 1 3 2147483647
-2147483648 16777216 16777218 2147483647
/var/folders/yw/1vck7tm93_z1z0bftrw65hbw0000gn/T/ipykernel_38171/874905345.py:39: UserWarning: Unsafe conversion to float32: {'a': np.True_, 'b': np.False_}
warnings.warn(f"Unsafe conversion to {dtype}: {dict(zip(cols, checks))}")
/var/folders/yw/1vck7tm93_z1z0bftrw65hbw0000gn/T/ipykernel_38171/874905345.py:39: UserWarning: Unsafe conversion to float32: {'b': np.False_}
warnings.warn(f"Unsafe conversion to {dtype}: {dict(zip(cols, checks))}")
def prepare_forecasting_data(
df:pd.DataFrame, # dataframe containing a sorted time series for a single entity or subject
fcst_history:int, # # historical steps used as input.
fcst_horizon:int=1, # # steps forecasted into the future.
x_vars:str | list=None, # features used as input. None means all columns. [] means no features.
y_vars:str | list=None, # features used as output. None means all columns. [] means no features.
dtype:str=None, # data type
unique_id_cols:str | list=None, # unique identifier column/s used in panel data
)->tuple(np.ndarray, np.ndarray):
Call self as a function.
# Forecasting setup: 10 input steps, 5 predicted steps, stride of 1.
fcst_history = 10
fcst_horizon = 5
stride = 1
valid_size = 0.2
test_size = 0.2

# Single-column frame whose values equal their row position, which makes the
# printed windows easy to read.
df = pd.DataFrame()
df['target'] = np.arange(50)

X, y = prepare_forecasting_data(df, fcst_history, fcst_horizon)
splits = get_forecasting_splits(df, fcst_history, fcst_horizon, valid_size=valid_size, test_size=test_size, stride=stride, show_plot=False)

# The last target step of one split must sit exactly `stride` before the first
# target step of the following split.
assert y[splits[0]][-1][0][-1] == y[splits[1]][0][0][0] - stride
assert y[splits[1]][-1][0][-1] == y[splits[2]][0][0][0] - stride

# Print every (input window, target window) pair, grouped by split.
for s, t in zip(splits, ['\ntrain_split:', '\nvalid_split:', '\ntest_split :']):
    print(t)
    for xi, yi in zip(X[s], y[s]):
        print(xi, yi)
train_split:
[[0 1 2 3 4 5 6 7 8 9]] [[10 11 12 13 14]]
[[ 1 2 3 4 5 6 7 8 9 10]] [[11 12 13 14 15]]
[[ 2 3 4 5 6 7 8 9 10 11]] [[12 13 14 15 16]]
[[ 3 4 5 6 7 8 9 10 11 12]] [[13 14 15 16 17]]
[[ 4 5 6 7 8 9 10 11 12 13]] [[14 15 16 17 18]]
[[ 5 6 7 8 9 10 11 12 13 14]] [[15 16 17 18 19]]
[[ 6 7 8 9 10 11 12 13 14 15]] [[16 17 18 19 20]]
[[ 7 8 9 10 11 12 13 14 15 16]] [[17 18 19 20 21]]
[[ 8 9 10 11 12 13 14 15 16 17]] [[18 19 20 21 22]]
[[ 9 10 11 12 13 14 15 16 17 18]] [[19 20 21 22 23]]
[[10 11 12 13 14 15 16 17 18 19]] [[20 21 22 23 24]]
[[11 12 13 14 15 16 17 18 19 20]] [[21 22 23 24 25]]
[[12 13 14 15 16 17 18 19 20 21]] [[22 23 24 25 26]]
[[13 14 15 16 17 18 19 20 21 22]] [[23 24 25 26 27]]
[[14 15 16 17 18 19 20 21 22 23]] [[24 25 26 27 28]]
[[15 16 17 18 19 20 21 22 23 24]] [[25 26 27 28 29]]
valid_split:
[[20 21 22 23 24 25 26 27 28 29]] [[30 31 32 33 34]]
[[21 22 23 24 25 26 27 28 29 30]] [[31 32 33 34 35]]
[[22 23 24 25 26 27 28 29 30 31]] [[32 33 34 35 36]]
[[23 24 25 26 27 28 29 30 31 32]] [[33 34 35 36 37]]
[[24 25 26 27 28 29 30 31 32 33]] [[34 35 36 37 38]]
[[25 26 27 28 29 30 31 32 33 34]] [[35 36 37 38 39]]
test_split :
[[30 31 32 33 34 35 36 37 38 39]] [[40 41 42 43 44]]
[[31 32 33 34 35 36 37 38 39 40]] [[41 42 43 44 45]]
[[32 33 34 35 36 37 38 39 40 41]] [[42 43 44 45 46]]
[[33 34 35 36 37 38 39 40 41 42]] [[43 44 45 46 47]]
[[34 35 36 37 38 39 40 41 42 43]] [[44 45 46 47 48]]
[[35 36 37 38 39 40 41 42 43 44]] [[45 46 47 48 49]]
# Forecasting setup for the "no target" case.
fcst_history, fcst_horizon, stride = 10, 5, 1
valid_size = test_size = 0.2

# Single-column frame whose values equal their row position.
df = pd.DataFrame({'target': np.arange(50)})

# y_vars=[] asks for no output features; x_vars=None keeps every column as input.
X, y = prepare_forecasting_data(
    df,
    fcst_history,
    fcst_horizon,
    x_vars=None,
    y_vars=[],
)
splits = get_forecasting_splits(
    df,
    fcst_history,
    fcst_horizon,
    valid_size=valid_size,
    test_size=test_size,
    stride=stride,
    show_plot=False,
)
assert y is Nonedf_len = 100
n_values = 3
datetime_col = 'datetime'
df = pd.DataFrame()
# Column i holds the row position scaled by 10**i (1, 10, 100).
for i in range(n_values):
    df[f"value_{i}"] = (np.arange(df_len) * 10**i).astype(np.float32)
display(df)

fcst_history = 10
fcst_horizon = 5
x_vars = df.columns  # every column is used as an input feature
y_vars = None  # None means all columns are used as outputs
dtype = None
X, y = prepare_forecasting_data(df, fcst_history=fcst_history, fcst_horizon=fcst_horizon, x_vars=x_vars, y_vars=y_vars, dtype=dtype)

# Expected layout: (samples, variables, steps).
test_eq(X.shape, (86, 3, 10))
test_eq(y.shape, (86, 3, 5))
# The first target step continues each input window by one step of its column
# (+1, +10, +100 respectively).
test_eq(y[:3, :, 0], X[:3, :, -1] + np.array([1, 10, 100]).reshape(1, 1, -1))
print(X[:3].astype(int))
print(y[:3].astype(int))| value_0 | value_1 | value_2 | |
|---|---|---|---|
| 0 | 0.0 | 0.0 | 0.0 |
| 1 | 1.0 | 10.0 | 100.0 |
| 2 | 2.0 | 20.0 | 200.0 |
| 3 | 3.0 | 30.0 | 300.0 |
| 4 | 4.0 | 40.0 | 400.0 |
| ... | ... | ... | ... |
| 95 | 95.0 | 950.0 | 9500.0 |
| 96 | 96.0 | 960.0 | 9600.0 |
| 97 | 97.0 | 970.0 | 9700.0 |
| 98 | 98.0 | 980.0 | 9800.0 |
| 99 | 99.0 | 990.0 | 9900.0 |
100 rows × 3 columns
[[[ 0 1 2 3 4 5 6 7 8 9]
[ 0 10 20 30 40 50 60 70 80 90]
[ 0 100 200 300 400 500 600 700 800 900]]
[[ 1 2 3 4 5 6 7 8 9 10]
[ 10 20 30 40 50 60 70 80 90 100]
[ 100 200 300 400 500 600 700 800 900 1000]]
[[ 2 3 4 5 6 7 8 9 10 11]
[ 20 30 40 50 60 70 80 90 100 110]
[ 200 300 400 500 600 700 800 900 1000 1100]]]
[[[ 10 11 12 13 14]
[ 100 110 120 130 140]
[1000 1100 1200 1300 1400]]
[[ 11 12 13 14 15]
[ 110 120 130 140 150]
[1100 1200 1300 1400 1500]]
[[ 12 13 14 15 16]
[ 120 130 140 150 160]
[1200 1300 1400 1500 1600]]]
df_len = 100
n_values = 3
datetime_col = 'datetime'
df = pd.DataFrame()
# Column i holds the row position scaled by 10**(i + 1) (10, 100, 1000).
for i in range(n_values):
    df[f"value_{i}"] = (np.arange(df_len) * 10**(i + 1)).astype(np.float32)
# Extra columns that are not all used as features: a daily datetime, a random
# integer 'type' and an integer 'target'.
df['datetime'] = pd.date_range(pd.to_datetime('1749-03-31'), periods=df_len, freq='1D')
df['type'] = np.random.randint(0, 4, df_len)
df['target'] = np.arange(df_len)
display(df)

fcst_history = 10
fcst_horizon = 5
# Only the numeric value columns and the target are fed to the model; the
# target alone is predicted.
x_vars = ['value_0', 'value_1', 'value_2', 'target']
y_vars = 'target'
dtype = np.float32
X, y = prepare_forecasting_data(df, fcst_history=fcst_history, fcst_horizon=fcst_horizon, x_vars=x_vars, y_vars=y_vars, dtype=dtype)

# Expected layout: (samples, variables, steps); a single y variable keeps its axis.
test_eq(X.shape, (86, 4, 10))
test_eq(y.shape, (86, 1, 5))
print(X[:3].astype(int))
print(y[:3])| value_0 | value_1 | value_2 | datetime | type | target | |
|---|---|---|---|---|---|---|
| 0 | 0.0 | 0.0 | 0.0 | 1749-03-31 | 3 | 0 |
| 1 | 10.0 | 100.0 | 1000.0 | 1749-04-01 | 1 | 1 |
| 2 | 20.0 | 200.0 | 2000.0 | 1749-04-02 | 3 | 2 |
| 3 | 30.0 | 300.0 | 3000.0 | 1749-04-03 | 3 | 3 |
| 4 | 40.0 | 400.0 | 4000.0 | 1749-04-04 | 3 | 4 |
| ... | ... | ... | ... | ... | ... | ... |
| 95 | 950.0 | 9500.0 | 95000.0 | 1749-07-04 | 1 | 95 |
| 96 | 960.0 | 9600.0 | 96000.0 | 1749-07-05 | 2 | 96 |
| 97 | 970.0 | 9700.0 | 97000.0 | 1749-07-06 | 2 | 97 |
| 98 | 980.0 | 9800.0 | 98000.0 | 1749-07-07 | 0 | 98 |
| 99 | 990.0 | 9900.0 | 99000.0 | 1749-07-08 | 3 | 99 |
100 rows × 6 columns
[[[ 0 10 20 30 40 50 60 70 80 90]
[ 0 100 200 300 400 500 600 700 800 900]
[ 0 1000 2000 3000 4000 5000 6000 7000 8000 9000]
[ 0 1 2 3 4 5 6 7 8 9]]
[[ 10 20 30 40 50 60 70 80 90 100]
[ 100 200 300 400 500 600 700 800 900 1000]
[ 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000]
[ 1 2 3 4 5 6 7 8 9 10]]
[[ 20 30 40 50 60 70 80 90 100 110]
[ 200 300 400 500 600 700 800 900 1000 1100]
[ 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000]
[ 2 3 4 5 6 7 8 9 10 11]]]
[[[10. 11. 12. 13. 14.]]
[[11. 12. 13. 14. 15.]]
[[12. 13. 14. 15. 16.]]]
Call self as a function.
Define fcst start and end dates
Make sure datetime column or index is of the right date type.
# Test
df_len = 100
n_values = 3
datetime_col = 'datetime'
df = pd.DataFrame()
# Column i holds the row position scaled by 10**(i + 1).
for i in range(n_values):
    df[f"value_{i}"] = (np.arange(df_len) * 10**(i + 1)).astype(np.float32)
df['datetime'] = pd.date_range(pd.to_datetime('1749-03-31'), periods=df_len, freq='1D')

# Case 1: the datetime lives in a regular column.
set_df_datetime(df, datetime_col=datetime_col)
assert pd.api.types.is_datetime64_any_dtype(df['datetime'])

# Case 2: the datetime lives in the index.
df_index = df.set_index('datetime')
set_df_datetime(df_index, use_index=True)
assert pd.api.types.is_datetime64_any_dtype(df_index.index)
Returns the start and end dates used by the forecast
# Test
df_len = 100
n_values = 3
datetime_col = 'datetime'
df = pd.DataFrame()
# Column i holds the row position scaled by 10**(i + 1).
for i in range(n_values):
    df[f"value_{i}"] = (np.arange(df_len) * 10**(i + 1)).astype(np.float32)
df['datetime'] = pd.date_range(pd.to_datetime('1749-03-31'), periods=df_len, freq='1D')

# Case 1: bounds are read from a datetime column.
test_eq(get_df_datetime_bounds(df, datetime_col=datetime_col), (df['datetime'].min(), df['datetime'].max()))

# Case 2: bounds are read from a datetime index.
df_index = df.set_index('datetime')
test_eq(get_df_datetime_bounds(df_index, use_index=True), (df_index.index.min(), df_index.index.max()))
def get_fcst_bounds(
df, # dataframe containing forecasting data
fcst_datetime, # datetime for which a fcst is created. Optionally tuple of datetimes if the fcst is created for a range of dates.
fcst_history:NoneType=None, # # steps used as input
fcst_horizon:NoneType=None, # # predicted steps
freq:str='D', # datetime units. May contain letters only or a combination of ints + letters: eg. "7D"
datetime_format:str='%Y-%m-%d', # format used to convert "today"
datetime_col:NoneType=None, # str data column containing the datetime
use_index:bool=False, # bool flag to indicate if index should be used to get column
):
Returns the start and end datetimes used by the forecast
# Test
df_len = 100
n_values = 3
datetime_col = 'datetime'
df = pd.DataFrame()
# Column i holds the row position scaled by 10**(i + 1).
for i in range(n_values):
    df[f"value_{i}"] = (np.arange(df_len) * 10**(i + 1)).astype(np.float32)

# df_len timestamps spaced `freq` apart, ending at today's date floored to `freq`.
freq = "7D"
today = pd.Timestamp(get_today()).floor(freq)
df['datetime'] = pd.date_range(None, today, periods=df_len, freq=freq)
display(df)

max_dt = pd.Timestamp(df['datetime'].max()).floor(freq)
fcst_history = 30
fcst_horizon = 10
# Place the forecast datetime fcst_horizon weeks before the last timestamp so
# that the end bound is expected to coincide with max_dt (checked after this block).
fcst_datetime = max_dt - timedelta(weeks=fcst_horizon)
print('fcst_datetime :', fcst_datetime)
start_datetime, end_datetime = get_fcst_bounds(df, fcst_datetime, datetime_col=datetime_col, fcst_history=fcst_history, fcst_horizon=fcst_horizon, freq=freq)
print('start_datetime:', start_datetime)
print('end_datetime :', end_datetime)

# The bounds must span exactly fcst_history + fcst_horizon steps of `freq`.
dates = pd.date_range(start_datetime, end_datetime, freq=freq)
print(dates)
test_eq(len(dates), fcst_history + fcst_horizon)
test_eq(end_datetime, max_dt)| value_0 | value_1 | value_2 | datetime | |
|---|---|---|---|---|
| 0 | 0.0 | 0.0 | 0.0 | 2024-04-11 |
| 1 | 10.0 | 100.0 | 1000.0 | 2024-04-18 |
| 2 | 20.0 | 200.0 | 2000.0 | 2024-04-25 |
| 3 | 30.0 | 300.0 | 3000.0 | 2024-05-02 |
| 4 | 40.0 | 400.0 | 4000.0 | 2024-05-09 |
| ... | ... | ... | ... | ... |
| 95 | 950.0 | 9500.0 | 95000.0 | 2026-02-05 |
| 96 | 960.0 | 9600.0 | 96000.0 | 2026-02-12 |
| 97 | 970.0 | 9700.0 | 97000.0 | 2026-02-19 |
| 98 | 980.0 | 9800.0 | 98000.0 | 2026-02-26 |
| 99 | 990.0 | 9900.0 | 99000.0 | 2026-03-05 |
100 rows × 4 columns
fcst_datetime : 2025-12-25 00:00:00
start_datetime: 2025-06-05 00:00:00
end_datetime : 2026-03-05 00:00:00
DatetimeIndex(['2025-06-05', '2025-06-12', '2025-06-19', '2025-06-26',
'2025-07-03', '2025-07-10', '2025-07-17', '2025-07-24',
'2025-07-31', '2025-08-07', '2025-08-14', '2025-08-21',
'2025-08-28', '2025-09-04', '2025-09-11', '2025-09-18',
'2025-09-25', '2025-10-02', '2025-10-09', '2025-10-16',
'2025-10-23', '2025-10-30', '2025-11-06', '2025-11-13',
'2025-11-20', '2025-11-27', '2025-12-04', '2025-12-11',
'2025-12-18', '2025-12-25', '2026-01-01', '2026-01-08',
'2026-01-15', '2026-01-22', '2026-01-29', '2026-02-05',
'2026-02-12', '2026-02-19', '2026-02-26', '2026-03-05'],
dtype='datetime64[ns]', freq='7D')
def filter_df_by_datetime(
df, # dataframe containing forecasting data
start_datetime:NoneType=None, # lower datetime bound
end_datetime:NoneType=None, # upper datetime bound
datetime_col:NoneType=None, # str data column containing the datetime
use_index:bool=False, # bool flag to indicate if index should be used to get column
):
Call self as a function.
# Test
df_len = 100
n_values = 3
datetime_col = 'datetime'
df = pd.DataFrame()
# Column i holds the row position scaled by 10**(i + 1).
for i in range(n_values):
    df[f"value_{i}"] = (np.arange(df_len) * 10**(i + 1)).astype(np.float32)

# df_len timestamps spaced `freq` apart, ending at today's date floored to `freq`.
freq = "7D"
df['datetime'] = pd.date_range(None, pd.Timestamp(get_today()).floor(freq), periods=df_len, freq=freq)
display(df)

max_dt = pd.Timestamp(df['datetime'].max()).floor(freq)
fcst_history = 30
fcst_horizon = 10
# Forecast datetime sits fcst_horizon steps before the last timestamp. It is
# derived from max_dt so this cell does not rely on a `fcst_datetime` value
# left over from an earlier cell.
fcst_datetime = pd.date_range(end=max_dt, periods=fcst_horizon + 1, freq=freq).floor(freq)[0]
start_datetime, end_datetime = get_fcst_bounds(df, fcst_datetime, datetime_col=datetime_col, fcst_history=fcst_history, fcst_horizon=fcst_horizon, freq=freq)
test_eq(len(filter_df_by_datetime(df, start_datetime=start_datetime, end_datetime=end_datetime, datetime_col=datetime_col)), fcst_history + fcst_horizon)| value_0 | value_1 | value_2 | datetime | |
|---|---|---|---|---|
| 0 | 0.0 | 0.0 | 0.0 | 2024-04-11 |
| 1 | 10.0 | 100.0 | 1000.0 | 2024-04-18 |
| 2 | 20.0 | 200.0 | 2000.0 | 2024-04-25 |
| 3 | 30.0 | 300.0 | 3000.0 | 2024-05-02 |
| 4 | 40.0 | 400.0 | 4000.0 | 2024-05-09 |
| ... | ... | ... | ... | ... |
| 95 | 950.0 | 9500.0 | 95000.0 | 2026-02-05 |
| 96 | 960.0 | 9600.0 | 96000.0 | 2026-02-12 |
| 97 | 970.0 | 9700.0 | 97000.0 | 2026-02-19 |
| 98 | 980.0 | 9800.0 | 98000.0 | 2026-02-26 |
| 99 | 990.0 | 9900.0 | 99000.0 | 2026-03-05 |
100 rows × 4 columns
def get_fcst_data_from_df(
df, # dataframe containing forecasting data
fcst_datetime, # datetime for which a fcst is created. Optionally tuple of datetimes if the fcst is created for a range of dates.
fcst_history:NoneType=None, # # steps used as input
fcst_horizon:NoneType=None, # # predicted steps
freq:str='D', # datetime units. May contain letters only or a combination of ints + letters: eg. "7D"
datetime_format:str='%Y-%m-%d', # format used to convert "today"
datetime_col:NoneType=None, # str data column containing the datetime
use_index:bool=False, # bool flag to indicate if index should be used to get column
):
Get forecasting data from a dataframe
# Test
df_len = 100
n_values = 3
datetime_col = 'datetime'
df = pd.DataFrame()
# Column i holds the row position scaled by 10**(i + 1).
for i in range(n_values):
    df[f"value_{i}"] = (np.arange(df_len) * 10**(i + 1)).astype(np.float32)

# df_len timestamps spaced `freq` apart, ending at today's date floored to `freq`.
freq = "7D"
df['datetime'] = pd.date_range(None, pd.Timestamp(get_today()).floor(freq), periods=df_len, freq=freq)
display(df)

max_dt = pd.Timestamp(df['datetime'].max()).floor(freq)
fcst_history = 30
fcst_horizon = 10
# Forecast datetime sits fcst_horizon steps before the last timestamp. It is
# derived from max_dt so this cell does not rely on a `fcst_datetime` value
# left over from an earlier cell.
fcst_datetime = pd.date_range(end=max_dt, periods=fcst_horizon + 1, freq=freq).floor(freq)[0]
test_eq(len(get_fcst_data_from_df(df, fcst_datetime, fcst_history=fcst_history, fcst_horizon=fcst_horizon, freq=freq, datetime_col=datetime_col)),
fcst_history + fcst_horizon)| value_0 | value_1 | value_2 | datetime | |
|---|---|---|---|---|
| 0 | 0.0 | 0.0 | 0.0 | 2024-04-11 |
| 1 | 10.0 | 100.0 | 1000.0 | 2024-04-18 |
| 2 | 20.0 | 200.0 | 2000.0 | 2024-04-25 |
| 3 | 30.0 | 300.0 | 3000.0 | 2024-05-02 |
| 4 | 40.0 | 400.0 | 4000.0 | 2024-05-09 |
| ... | ... | ... | ... | ... |
| 95 | 950.0 | 9500.0 | 95000.0 | 2026-02-05 |
| 96 | 960.0 | 9600.0 | 96000.0 | 2026-02-12 |
| 97 | 970.0 | 9700.0 | 97000.0 | 2026-02-19 |
| 98 | 980.0 | 9800.0 | 98000.0 | 2026-02-26 |
| 99 | 990.0 | 9900.0 | 99000.0 | 2026-03-05 |
100 rows × 4 columns