Functions required to prepare X (and y) from a pandas dataframe.
apply_sliding_window (data, window_len:Union[int,list], horizon:Union[int,list]=0, x_vars:Union[int,list]=None, y_vars:Union[int,list]=None)
Applies a sliding window on an array-like input to generate a 3d X (and optionally y)
Type | Default | Details | |
---|---|---|---|
data | an array-like object with the input data | ||
window_len | int | list | sliding window length. When using a list, use negative numbers and 0. | |
horizon | int | list | 0 | horizon |
x_vars | int | list | None | indices of the independent variables |
y_vars | int | list | None | indices of the dependent variables (target). [] means no y will be created. None means all variables. |
prepare_sel_vars_and_steps (sel_vars=None, sel_steps=None, idxs=False)
prepare_idxs (o, shape=None)
data = np.arange(20).reshape(-1,1).repeat(3, 1) * np.array([1, 10, 100])
df = pd.DataFrame(data, columns=['feat_1', 'feat_2', 'feat_3'])
df.head()
feat_1 | feat_2 | feat_3 | |
---|---|---|---|
0 | 0 | 0 | 0 |
1 | 1 | 10 | 100 |
2 | 2 | 20 | 200 |
3 | 3 | 30 | 300 |
4 | 4 | 40 | 400 |
window_len = 8
horizon = 1
x_vars = None
y_vars = None
X, y = apply_sliding_window(data, window_len, horizon=horizon, x_vars=x_vars, y_vars=y_vars)
print(np.shares_memory(X, data))
print(np.shares_memory(y, data))
print(X.shape, y.shape)
test_eq(X.shape, (len(df) - (window_len - 1 + horizon), df.shape[1], window_len))
test_eq(y.shape, (len(df) - (window_len - 1 + horizon), df.shape[1]))
X[0], y[0]
True
True
(12, 3, 8) (12, 3)
(array([[ 0, 1, 2, 3, 4, 5, 6, 7],
[ 0, 10, 20, 30, 40, 50, 60, 70],
[ 0, 100, 200, 300, 400, 500, 600, 700]]),
array([ 8, 80, 800]))
window_len = 8
horizon = 1
x_vars = None
y_vars = 0
X, y = apply_sliding_window(df, window_len, horizon=horizon, x_vars=x_vars, y_vars=y_vars)
print(np.shares_memory(X, df))
print(np.shares_memory(y, df))
print(X.shape, y.shape)
test_eq(X.shape, (len(df) - (window_len - 1 + horizon), df.shape[1], window_len))
test_eq(y.shape, (len(df) - (window_len - 1 + horizon),))
X[0], y[0]
True
True
(12, 3, 8) (12,)
(array([[ 0, 1, 2, 3, 4, 5, 6, 7],
[ 0, 10, 20, 30, 40, 50, 60, 70],
[ 0, 100, 200, 300, 400, 500, 600, 700]]),
8)
window_len = 8
horizon = [1, 2]
x_vars = 0
y_vars = [1, 2]
X, y = apply_sliding_window(df, window_len, horizon=horizon, x_vars=x_vars, y_vars=y_vars)
print(np.shares_memory(X, df))
print(np.shares_memory(y, df))
print(X.shape, y.shape)
test_eq(X.shape, (len(df) - (window_len - 1 + max(horizon)), 1, window_len))
test_eq(y.shape, (len(df) - (window_len - 1 + max(horizon)), len(y_vars), len(horizon)))
X[0], y[0]
True
False
(11, 1, 8) (11, 2, 2)
(array([[0, 1, 2, 3, 4, 5, 6, 7]]),
array([[ 80, 90],
[800, 900]]))
window_len = [-4, -2, -1, 0]
horizon = [1, 2, 4]
x_vars = 0
y_vars = [1, 2]
X, y = apply_sliding_window(df, window_len, horizon=horizon, x_vars=x_vars, y_vars=y_vars)
print(np.shares_memory(X, df))
print(np.shares_memory(y, df))
print(X.shape, y.shape)
test_eq(X.shape, (12, 1, 4))
test_eq(y.shape, (12, 2, 3))
X[0], y[0]
False
False
(12, 1, 4) (12, 2, 3)
(array([[0, 2, 3, 4]]),
array([[ 50, 60, 80],
[500, 600, 800]]))
df2Xy (df, sample_col=None, feat_col=None, data_cols=None, target_col=None, steps_in_rows=False, to3d=True, splits=None, sort_by=None, ascending=True, y_func=None, return_names=False)
This function allows you to transform a pandas dataframe into X and y numpy arrays that can be used to create a TSDataset.
sample_col: column that uniquely identifies each sample.
feat_col: used for multivariate datasets. It indicates which column identifies the feature by row.
data_cols: indicates the column/s where the data is located. If None, all columns are used (except the sample_col, feat_col, and target_col).
target_col: indicates the column/s where the target is.
steps_in_rows: flag to indicate if each step is in a different row or in a different column (default).
to3d: turns X into a 3d array (including univariate time series).
sort_by: used to pass any column/s that are needed to sort the steps in the sequence. If you pass a sample_col and/or feat_col, these will automatically be used before the sort_by column/s, and you don't need to add them to the sort_by column/s list.
y_func: function used to calculate y for each sample (and target_col).
return_names: flag to return the names of the columns from which X was generated.
split_Xy (X, y=None, splits=None)
df = pd.DataFrame()
df['sample_id'] = np.array([1,1,1,2,2,2,3,3,3])
df['var1'] = df['sample_id'] * 10 + df.index.values
df['var2'] = df['sample_id'] * 100 + df.index.values
df
sample_id | var1 | var2 | |
---|---|---|---|
0 | 1 | 10 | 100 |
1 | 1 | 11 | 101 |
2 | 1 | 12 | 102 |
3 | 2 | 23 | 203 |
4 | 2 | 24 | 204 |
5 | 2 | 25 | 205 |
6 | 3 | 36 | 306 |
7 | 3 | 37 | 307 |
8 | 3 | 38 | 308 |
n_samples = 1_000
n_rows = 10_000
sample_ids = np.arange(n_samples).repeat(n_rows//n_samples).reshape(-1,1)
feat_ids = np.tile(np.arange(n_rows // n_samples), n_samples).reshape(-1,1)
cont = np.random.randn(n_rows, 6)
ind_cat = np.random.randint(0, 3, (n_rows, 1))
target = np.array([0,1,2])[ind_cat]
ind_cat2 = np.random.randint(0, 3, (n_rows, 1))
target2 = np.array([100,200,300])[ind_cat2]
data = np.concatenate([sample_ids, feat_ids, cont, target, target], -1)
columns = ['sample_id', 'feat_id'] + (np.arange(6) + 1).astype(str).tolist() + ['target'] + ['target2']
df = pd.DataFrame(data, columns=columns)
idx = random_choice(np.arange(len(df)), len(df), False)
new_dtypes = {'sample_id':np.int32, 'feat_id':np.int32, '1':np.float32, '2':np.float32, '3':np.float32, '4':np.float32, '5':np.float32, '6':np.float32}
df = df.astype(dtype=new_dtypes)
df = df.loc[idx].reset_index(drop=True)
df
sample_id | feat_id | 1 | 2 | 3 | 4 | 5 | 6 | target | target2 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 625 | 2 | -1.390549 | 0.770179 | -0.848480 | 0.853631 | -0.309984 | 0.874338 | 2.0 | 2.0 |
1 | 526 | 4 | 1.152397 | 2.064397 | -0.392603 | -0.275797 | -0.047526 | -2.248814 | 2.0 | 2.0 |
2 | 397 | 6 | -1.052930 | 0.631396 | -0.758800 | -0.606483 | -2.776054 | -0.457755 | 1.0 | 1.0 |
3 | 528 | 8 | -0.178637 | -1.253319 | -1.154014 | 0.913876 | 1.051010 | -0.635762 | 1.0 | 1.0 |
4 | 249 | 2 | 0.612595 | 0.888297 | 0.065024 | 1.621935 | -0.180479 | 0.309977 | 1.0 | 1.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9995 | 272 | 1 | -0.432325 | 1.645262 | 1.502872 | -1.144859 | 0.919653 | 0.414304 | 0.0 | 0.0 |
9996 | 920 | 5 | -0.724702 | -1.471832 | 1.209086 | 1.206532 | 0.555676 | 0.352726 | 2.0 | 2.0 |
9997 | 662 | 6 | 1.122043 | -0.379357 | -0.344517 | -1.545091 | 0.187894 | 1.062510 | 2.0 | 2.0 |
9998 | 71 | 7 | -0.053582 | -0.854992 | -1.118632 | -1.967820 | -0.344804 | 0.128105 | 0.0 | 0.0 |
9999 | 407 | 4 | -1.565716 | -0.947183 | -0.401944 | -1.309024 | -0.237755 | -0.743251 | 2.0 | 2.0 |
10000 rows × 10 columns
def y_func(o): return mode(o, axis=1, keepdims=True).mode
X, y = df2xy(df, sample_col='sample_id', feat_col='feat_id', target_col=['target', 'target2'], sort_by=['sample_id', 'feat_id'], y_func=y_func)
test_eq(X.shape, (1000, 10, 6))
test_eq(y.shape, (1000, 2))
rand_idx = np.random.randint(0, np.max(df.sample_id))
sorted_df = df.sort_values(by=['sample_id', 'feat_id'], kind='stable').reset_index(drop=True)
test_eq(X[rand_idx], sorted_df[sorted_df.sample_id == rand_idx][['1', '2', '3', '4', '5', '6']].values)
test_eq(np.squeeze(mode(sorted_df[sorted_df.sample_id == rand_idx][['target', 'target2']].values).mode), y[rand_idx])
TESTDATA = StringIO("""sample_id;value_0;value_1;target
rob;2;3;0
alice;6;7;1
eve;11;12;2
""")
df = pd.read_csv(TESTDATA, sep=";")
display(df)
X, y = df2Xy(df, sample_col='sample_id', target_col='target', data_cols=['value_0', 'value_1'], sort_by='sample_id')
test_eq(X.shape, (3, 1, 2))
test_eq(y.shape, (3,))
X, y
sample_id | value_0 | value_1 | target | |
---|---|---|---|---|
0 | rob | 2 | 3 | 0 |
1 | alice | 6 | 7 | 1 |
2 | eve | 11 | 12 | 2 |
(array([[[ 6, 7]],
[[11, 12]],
[[ 2, 3]]]),
array([1, 2, 0]))
# Univariate
TESTDATA = StringIO("""sample_id;timestep;values;target
rob;1;2;0
alice;1;6;1
eve;1;11;2
rob;2;3;0
alice;2;7;1
eve;2;12;2
""")
df = pd.read_csv(TESTDATA, sep=";")
display(df)
def y_func(o): return mode(o, axis=1).mode
X, y = df2xy(df, sample_col='sample_id', target_col='target', data_cols=['values'], sort_by='timestep', to3d=True, y_func=y_func)
test_eq(X.shape, (3, 1, 2))
test_eq(y.shape, (3, ))
print(X, y)
sample_id | timestep | values | target | |
---|---|---|---|---|
0 | rob | 1 | 2 | 0 |
1 | alice | 1 | 6 | 1 |
2 | eve | 1 | 11 | 2 |
3 | rob | 2 | 3 | 0 |
4 | alice | 2 | 7 | 1 |
5 | eve | 2 | 12 | 2 |
[[[ 6 7]]
[[11 12]]
[[ 2 3]]] [1 2 0]
# Multivariate
TESTDATA = StringIO("""sample_id;trait;value_0;value_1;target
rob;green;2;3;0
rob;yellow;3;4;0
rob;blue;4;5;0
rob;red;5;6;0
alice;green;6;7;1
alice;yellow;7;8;1
alice;blue;8;9;1
alice;red;9;10;1
eve;yellow;11;12;2
eve;green;10;11;2
eve;blue;12;12;2
eve;red;13;14;2
""")
df = pd.read_csv(TESTDATA, sep=";")
idx = random_choice(len(df), len(df), False)
df = df.iloc[idx]
display(df)
def y_func(o): return mode(o, axis=1).mode
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', target_col='target', data_cols=['value_0', 'value_1'], y_func=y_func)
print(X, y)
test_eq(X.shape, (3, 4, 2))
test_eq(y.shape, (3,))
sample_id | trait | value_0 | value_1 | target | |
---|---|---|---|---|---|
9 | eve | green | 10 | 11 | 2 |
10 | eve | blue | 12 | 12 | 2 |
3 | rob | red | 5 | 6 | 0 |
0 | rob | green | 2 | 3 | 0 |
6 | alice | blue | 8 | 9 | 1 |
2 | rob | blue | 4 | 5 | 0 |
1 | rob | yellow | 3 | 4 | 0 |
4 | alice | green | 6 | 7 | 1 |
7 | alice | red | 9 | 10 | 1 |
8 | eve | yellow | 11 | 12 | 2 |
11 | eve | red | 13 | 14 | 2 |
5 | alice | yellow | 7 | 8 | 1 |
[[[ 8 9]
[ 6 7]
[ 9 10]
[ 7 8]]
[[12 12]
[10 11]
[13 14]
[11 12]]
[[ 4 5]
[ 2 3]
[ 5 6]
[ 3 4]]] [1 2 0]
# Multivariate, multi-label
TESTDATA = StringIO("""sample_id;trait;value_0;value_1;target1;target2
rob;green;2;3;0;0
rob;yellow;3;4;0;0
rob;blue;4;5;0;0
rob;red;5;6;0;0
alice;green;6;7;1;0
alice;yellow;7;8;1;0
alice;blue;8;9;1;0
alice;red;9;10;1;0
eve;yellow;11;12;2;1
eve;green;10;11;2;1
eve;blue;12;12;2;1
eve;red;13;14;2;1
""")
df = pd.read_csv(TESTDATA, sep=";")
display(df)
def y_func(o): return mode(o, axis=1, keepdims=True).mode
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', target_col=['target1', 'target2'], data_cols=['value_0', 'value_1'], y_func=y_func)
test_eq(X.shape, (3, 4, 2))
test_eq(y.shape, (3, 2))
print(X, y)
sample_id | trait | value_0 | value_1 | target1 | target2 | |
---|---|---|---|---|---|---|
0 | rob | green | 2 | 3 | 0 | 0 |
1 | rob | yellow | 3 | 4 | 0 | 0 |
2 | rob | blue | 4 | 5 | 0 | 0 |
3 | rob | red | 5 | 6 | 0 | 0 |
4 | alice | green | 6 | 7 | 1 | 0 |
5 | alice | yellow | 7 | 8 | 1 | 0 |
6 | alice | blue | 8 | 9 | 1 | 0 |
7 | alice | red | 9 | 10 | 1 | 0 |
8 | eve | yellow | 11 | 12 | 2 | 1 |
9 | eve | green | 10 | 11 | 2 | 1 |
10 | eve | blue | 12 | 12 | 2 | 1 |
11 | eve | red | 13 | 14 | 2 | 1 |
[[[ 8 9]
[ 6 7]
[ 9 10]
[ 7 8]]
[[12 12]
[10 11]
[13 14]
[11 12]]
[[ 4 5]
[ 2 3]
[ 5 6]
[ 3 4]]] [[1 0]
[2 1]
[0 0]]
# Multivariate, unlabeled
TESTDATA = StringIO("""sample_id;trait;value_0;value_1;target
rob;green;2;3;0
rob;yellow;3;4;0
rob;blue;4;5;0
rob;red;5;6;0
alice;green;6;7;1
alice;yellow;7;8;1
alice;blue;8;9;1
alice;red;9;10;1
eve;yellow;11;12;2
eve;green;10;11;2
eve;blue;12;12;2
eve;red;13;14;2
""")
df = pd.read_csv(TESTDATA, sep=";")
idx = random_choice(len(df), len(df), False)
df = df.iloc[idx]
display(df)
def y_func(o): return mode(o, axis=1, keepdims=True).mode
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', data_cols=['value_0', 'value_1'], y_func=y_func)
print(X, y)
test_eq(X.shape, (3, 4, 2))
test_eq(y, None)
sample_id | trait | value_0 | value_1 | target | |
---|---|---|---|---|---|
11 | eve | red | 13 | 14 | 2 |
3 | rob | red | 5 | 6 | 0 |
9 | eve | green | 10 | 11 | 2 |
10 | eve | blue | 12 | 12 | 2 |
6 | alice | blue | 8 | 9 | 1 |
1 | rob | yellow | 3 | 4 | 0 |
4 | alice | green | 6 | 7 | 1 |
2 | rob | blue | 4 | 5 | 0 |
0 | rob | green | 2 | 3 | 0 |
8 | eve | yellow | 11 | 12 | 2 |
7 | alice | red | 9 | 10 | 1 |
5 | alice | yellow | 7 | 8 | 1 |
[[[ 8 9]
[ 6 7]
[ 9 10]
[ 7 8]]
[[12 12]
[10 11]
[13 14]
[11 12]]
[[ 4 5]
[ 2 3]
[ 5 6]
[ 3 4]]] None
TESTDATA = StringIO("""sample_id;trait;timestep;values;target
rob;green;1;2;0
rob;yellow;1;3;0
rob;blue;1;4;0
rob;red;1;5;0
alice;green;1;6;1
alice;yellow;1;7;1
alice;blue;1;8;1
alice;red;1;9;1
eve;yellow;1;11;2
eve;green;1;10;2
eve;blue;1;12;2
eve;red;1;13;2
rob;green;2;3;0
rob;yellow;2;4;0
rob;blue;2;5;0
rob;red;2;6;0
alice;green;2;7;1
alice;yellow;2;8;1
alice;blue;2;9;1
alice;red;2;10;1
eve;yellow;2;12;2
eve;green;2;11;2
eve;blue;2;13;2
eve;red;2;14;2
""")
df = pd.read_csv(TESTDATA, sep=";")
display(df)
def y_func(o): return mode(o, axis=1).mode
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', sort_by='timestep', target_col='target', data_cols=['values'], y_func=y_func)
print(X, y)
test_eq(X.shape, (3, 4, 2))
test_eq(y.shape, (3, ))
sample_id | trait | timestep | values | target | |
---|---|---|---|---|---|
0 | rob | green | 1 | 2 | 0 |
1 | rob | yellow | 1 | 3 | 0 |
2 | rob | blue | 1 | 4 | 0 |
3 | rob | red | 1 | 5 | 0 |
4 | alice | green | 1 | 6 | 1 |
5 | alice | yellow | 1 | 7 | 1 |
6 | alice | blue | 1 | 8 | 1 |
7 | alice | red | 1 | 9 | 1 |
8 | eve | yellow | 1 | 11 | 2 |
9 | eve | green | 1 | 10 | 2 |
10 | eve | blue | 1 | 12 | 2 |
11 | eve | red | 1 | 13 | 2 |
12 | rob | green | 2 | 3 | 0 |
13 | rob | yellow | 2 | 4 | 0 |
14 | rob | blue | 2 | 5 | 0 |
15 | rob | red | 2 | 6 | 0 |
16 | alice | green | 2 | 7 | 1 |
17 | alice | yellow | 2 | 8 | 1 |
18 | alice | blue | 2 | 9 | 1 |
19 | alice | red | 2 | 10 | 1 |
20 | eve | yellow | 2 | 12 | 2 |
21 | eve | green | 2 | 11 | 2 |
22 | eve | blue | 2 | 13 | 2 |
23 | eve | red | 2 | 14 | 2 |
[[[ 8 9]
[ 6 7]
[ 9 10]
[ 7 8]]
[[12 13]
[10 11]
[13 14]
[11 12]]
[[ 4 5]
[ 2 3]
[ 5 6]
[ 3 4]]] [1 2 0]
df2np3d (df, groupby, data_cols=None)
Transforms a df (with the same number of rows per group in groupby) to a 3d ndarray
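No usage example follows in this section, so here is a minimal sketch (the column names and values are illustrative, and it assumes test_eq is available as in the examples above):
user = np.array([1, 2]).repeat(4).reshape(-1, 1)
val = np.random.rand(8, 3)
df = pd.DataFrame(np.concatenate([user, val], axis=-1), columns=['user', 'x1', 'x2', 'x3'])
# each group (here, each 'user') becomes one sample of shape (n_vars, seq_len)
X = df2np3d(df, ['user'], ['x1', 'x2', 'x3'])
test_eq(X.shape, (2, 3, 4))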
add_missing_value_cols (df, cols=None, dtype=<class 'float'>, fill_value=None)
data = np.random.randn(10, 2)
mask = data > .8
data[mask] = np.nan
df = pd.DataFrame(data, columns=['A', 'B'])
df = add_missing_value_cols(df, cols=None, dtype=float)
test_eq(df['A'].isnull().sum(), df['missing_A'].sum())
test_eq(df['B'].isnull().sum(), df['missing_B'].sum())
df
A | B | missing_A | missing_B | |
---|---|---|---|---|
0 | 0.476712 | -0.880797 | 0.0 | 0.0 |
1 | NaN | -1.517210 | 1.0 | 0.0 |
2 | -1.348997 | -0.878441 | 0.0 | 0.0 |
3 | NaN | 0.290756 | 1.0 | 0.0 |
4 | 0.569218 | -1.415777 | 0.0 | 0.0 |
5 | 0.591641 | -2.133860 | 0.0 | 0.0 |
6 | NaN | NaN | 1.0 | 1.0 |
7 | NaN | -0.119397 | 1.0 | 0.0 |
8 | -0.727988 | 0.057254 | 0.0 | 0.0 |
9 | -0.631352 | -0.219028 | 0.0 | 0.0 |
add_missing_timestamps (df, datetime_col=None, use_index=False, unique_id_cols=None, groupby=None, fill_value=nan, range_by_group=True, start_date=None, end_date=None, freq=None)
Type | Default | Details | |
---|---|---|---|
df | pandas DataFrame | ||
datetime_col | NoneType | None | column that contains the datetime data (without duplicates within groups) |
use_index | bool | False | indicates if the index contains the datetime data |
unique_id_cols | NoneType | None | column used to identify unique_ids |
groupby | NoneType | None | same as unique_id_cols. Will be deprecated. Kept for compatibility. |
fill_value | float | nan | value that will be inserted where missing dates exist. Default: np.nan |
range_by_group | bool | True | if True, dates will be filled between min and max dates for each group. Otherwise, between the min and max dates in the df. |
start_date | NoneType | None | start date to fill in missing dates (same for all unique_ids) |
end_date | NoneType | None | end date to fill in missing dates (same for all unique_ids) |
freq | NoneType | None | frequency used to fill in the missing datetime |
# Filling dates between min and max dates
dates = pd.date_range('2021-05-01', '2021-05-07').values
data = np.zeros((len(dates), 3))
data[:, 0] = dates
data[:, 1] = np.random.rand(len(dates))
data[:, 2] = np.random.rand(len(dates))
cols = ['date', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([1,3]).reset_index(drop=True)
date_df_with_missing_dates
date | feature1 | feature2 | |
---|---|---|---|
0 | 2021-05-01 | 0.537248 | 0.670897 |
1 | 2021-05-03 | 0.299912 | 0.421039 |
2 | 2021-05-05 | 0.648372 | 0.204641 |
3 | 2021-05-06 | 0.017475 | 0.022183 |
4 | 2021-05-07 | 0.965919 | 0.470055 |
# No groups
expected_output_df = date_df.copy()
expected_output_df.loc[[1,3], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates.copy(),
'date',
unique_id_cols=None,
fill_value=np.nan,
range_by_group=False)
test_eq(output_df, expected_output_df)
date | feature1 | feature2 | |
---|---|---|---|
0 | 2021-05-01 | 0.537248 | 0.670897 |
1 | 2021-05-02 | NaN | NaN |
2 | 2021-05-03 | 0.299912 | 0.421039 |
3 | 2021-05-04 | NaN | NaN |
4 | 2021-05-05 | 0.648372 | 0.204641 |
5 | 2021-05-06 | 0.017475 | 0.022183 |
6 | 2021-05-07 | 0.965919 | 0.470055 |
# Filling dates between min and max dates for each value in groupby column
dates = pd.date_range('2021-05-01', '2021-05-07').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 4))
data[:, 0] = dates
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))
data[:, 3] = np.random.rand(len(dates))
cols = ['date', 'id', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([0,1,3,8,11,13]).reset_index(drop=True)
date_df_with_missing_dates
date | id | feature1 | feature2 | |
---|---|---|---|---|
0 | 2021-05-03 | 0 | 0.059398 | 0.255853 |
1 | 2021-05-05 | 0 | 0.235536 | 0.455261 |
2 | 2021-05-06 | 0 | 0.724423 | 0.280910 |
3 | 2021-05-07 | 0 | 0.303682 | 0.853959 |
4 | 2021-05-01 | 1 | 0.022424 | 0.408510 |
5 | 2021-05-03 | 1 | 0.508190 | 0.603880 |
6 | 2021-05-04 | 1 | 0.330924 | 0.108156 |
7 | 2021-05-06 | 1 | 0.601481 | 0.020182 |
# groupby='id', range_by_group=True
expected_output_df = date_df.drop([0,1,13]).reset_index(drop=True)
expected_output_df.loc[[1,6,9], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates.copy(),
'date',
unique_id_cols='id',
fill_value=np.nan,
range_by_group=True)
test_eq(expected_output_df, output_df)
date | id | feature1 | feature2 | |
---|---|---|---|---|
0 | 2021-05-03 | 0 | 0.059398 | 0.255853 |
1 | 2021-05-04 | 0 | NaN | NaN |
2 | 2021-05-05 | 0 | 0.235536 | 0.455261 |
3 | 2021-05-06 | 0 | 0.724423 | 0.280910 |
4 | 2021-05-07 | 0 | 0.303682 | 0.853959 |
5 | 2021-05-01 | 1 | 0.022424 | 0.408510 |
6 | 2021-05-02 | 1 | NaN | NaN |
7 | 2021-05-03 | 1 | 0.508190 | 0.603880 |
8 | 2021-05-04 | 1 | 0.330924 | 0.108156 |
9 | 2021-05-05 | 1 | NaN | NaN |
10 | 2021-05-06 | 1 | 0.601481 | 0.020182 |
# groupby='id', range_by_group=False
expected_output_df = date_df.copy()
expected_output_df.loc[[0,1,3,8,11,13], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates.copy(),
'date',
unique_id_cols='id',
fill_value=np.nan,
range_by_group=False)
test_eq(expected_output_df, output_df)
date | id | feature1 | feature2 | |
---|---|---|---|---|
0 | 2021-05-01 | 0 | NaN | NaN |
1 | 2021-05-02 | 0 | NaN | NaN |
2 | 2021-05-03 | 0 | 0.059398 | 0.255853 |
3 | 2021-05-04 | 0 | NaN | NaN |
4 | 2021-05-05 | 0 | 0.235536 | 0.455261 |
5 | 2021-05-06 | 0 | 0.724423 | 0.280910 |
6 | 2021-05-07 | 0 | 0.303682 | 0.853959 |
7 | 2021-05-01 | 1 | 0.022424 | 0.408510 |
8 | 2021-05-02 | 1 | NaN | NaN |
9 | 2021-05-03 | 1 | 0.508190 | 0.603880 |
10 | 2021-05-04 | 1 | 0.330924 | 0.108156 |
11 | 2021-05-05 | 1 | NaN | NaN |
12 | 2021-05-06 | 1 | 0.601481 | 0.020182 |
13 | 2021-05-07 | 1 | NaN | NaN |
# Filling dates between min and max timestamps
dates = pd.date_range('2021-05-01 00:00', '2021-05-01 20:00', freq='4H').values
data = np.zeros((len(dates), 3))
data[:, 0] = dates
data[:, 1] = np.random.rand(len(dates))
data[:, 2] = np.random.rand(len(dates))
cols = ['date', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([1,3]).reset_index(drop=True)
date_df_with_missing_dates
date | feature1 | feature2 | |
---|---|---|---|
0 | 2021-05-01 00:00:00 | 0.774846 | 0.624488 |
1 | 2021-05-01 08:00:00 | 0.683837 | 0.441230 |
2 | 2021-05-01 16:00:00 | 0.142269 | 0.279095 |
3 | 2021-05-01 20:00:00 | 0.953686 | 0.205123 |
# No groups
expected_output_df = date_df.copy()
expected_output_df.loc[[1,3], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates.copy(), 'date', groupby=None, fill_value=np.nan, range_by_group=False, freq='4H')
test_eq(output_df, expected_output_df)
date | feature1 | feature2 | |
---|---|---|---|
0 | 2021-05-01 00:00:00 | 0.774846 | 0.624488 |
1 | 2021-05-01 04:00:00 | NaN | NaN |
2 | 2021-05-01 08:00:00 | 0.683837 | 0.441230 |
3 | 2021-05-01 12:00:00 | NaN | NaN |
4 | 2021-05-01 16:00:00 | 0.142269 | 0.279095 |
5 | 2021-05-01 20:00:00 | 0.953686 | 0.205123 |
# Filling missing values between min and max timestamps for each value in groupby column
dates = pd.date_range('2021-05-01 00:00', '2021-05-01 20:00', freq='4H').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 4))
data[:, 0] = dates
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))
data[:, 3] = np.random.rand(len(dates))
cols = ['date', 'id', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([0,1,3,8,9,11]).reset_index(drop=True)
date_df_with_missing_dates
date | id | feature1 | feature2 | |
---|---|---|---|---|
0 | 2021-05-01 08:00:00 | 0 | 0.438784 | 0.084472 |
1 | 2021-05-01 16:00:00 | 0 | 0.059613 | 0.445215 |
2 | 2021-05-01 20:00:00 | 0 | 0.511807 | 0.001034 |
3 | 2021-05-01 00:00:00 | 1 | 0.970115 | 0.280121 |
4 | 2021-05-01 04:00:00 | 1 | 0.775051 | 0.436359 |
5 | 2021-05-01 16:00:00 | 1 | 0.469987 | 0.457442 |
# groupby='id', range_by_group=True
expected_output_df = date_df.drop([0,1,11]).reset_index(drop=True)
expected_output_df.loc[[1,6,7], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates.copy(),
'date',
groupby='id',
fill_value=np.nan,
range_by_group=True,
freq='4H')
test_eq(expected_output_df, output_df)
date | id | feature1 | feature2 | |
---|---|---|---|---|
0 | 2021-05-01 08:00:00 | 0 | 0.438784 | 0.084472 |
1 | 2021-05-01 12:00:00 | 0 | NaN | NaN |
2 | 2021-05-01 16:00:00 | 0 | 0.059613 | 0.445215 |
3 | 2021-05-01 20:00:00 | 0 | 0.511807 | 0.001034 |
4 | 2021-05-01 00:00:00 | 1 | 0.970115 | 0.280121 |
5 | 2021-05-01 04:00:00 | 1 | 0.775051 | 0.436359 |
6 | 2021-05-01 08:00:00 | 1 | NaN | NaN |
7 | 2021-05-01 12:00:00 | 1 | NaN | NaN |
8 | 2021-05-01 16:00:00 | 1 | 0.469987 | 0.457442 |
# groupby='id', range_by_group=False
expected_output_df = date_df.copy()
expected_output_df.loc[[0,1,3,8,9,11], ['feature1', 'feature2']] = np.nan
display(expected_output_df)
output_df = add_missing_timestamps(date_df_with_missing_dates.copy(),
'date',
groupby='id',
fill_value=np.nan,
range_by_group=False,
freq='4H')
test_eq(expected_output_df, output_df)
date | id | feature1 | feature2 | |
---|---|---|---|---|
0 | 2021-05-01 00:00:00 | 0 | NaN | NaN |
1 | 2021-05-01 04:00:00 | 0 | NaN | NaN |
2 | 2021-05-01 08:00:00 | 0 | 0.438784 | 0.084472 |
3 | 2021-05-01 12:00:00 | 0 | NaN | NaN |
4 | 2021-05-01 16:00:00 | 0 | 0.059613 | 0.445215 |
5 | 2021-05-01 20:00:00 | 0 | 0.511807 | 0.001034 |
6 | 2021-05-01 00:00:00 | 1 | 0.970115 | 0.280121 |
7 | 2021-05-01 04:00:00 | 1 | 0.775051 | 0.436359 |
8 | 2021-05-01 08:00:00 | 1 | NaN | NaN |
9 | 2021-05-01 12:00:00 | 1 | NaN | NaN |
10 | 2021-05-01 16:00:00 | 1 | 0.469987 | 0.457442 |
11 | 2021-05-01 20:00:00 | 1 | NaN | NaN |
# No groups, with duplicate dates ==> FAILS
dates = pd.date_range('2021-05-01 00:00', '2021-05-01 20:00', freq='4H').values
data = np.zeros((len(dates), 3))
data[:, 0] = dates
data[:, 1] = np.random.rand(len(dates))
data[:, 2] = np.random.rand(len(dates))
cols = ['date', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([1,3]).reset_index(drop=True)
date_df_with_missing_dates.loc[3, 'date'] = date_df_with_missing_dates.loc[2, 'date']
display(date_df_with_missing_dates)
test_fail(add_missing_timestamps, args=[date_df_with_missing_dates, 'date'], kwargs=dict(groupby=None, fill_value=np.nan, range_by_group=False, freq='4H'), )
date | feature1 | feature2 | |
---|---|---|---|
0 | 2021-05-01 00:00:00 | 0.755092 | 0.002068 |
1 | 2021-05-01 08:00:00 | 0.570693 | 0.087019 |
2 | 2021-05-01 16:00:00 | 0.228869 | 0.856618 |
3 | 2021-05-01 16:00:00 | 0.349506 | 0.428253 |
# groupby='id', range_by_group=True, with duplicate dates ==> FAILS
dates = pd.date_range('2021-05-01 00:00', '2021-05-01 20:00', freq='4H').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 4))
data[:, 0] = dates
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))
data[:, 3] = np.random.rand(len(dates))
cols = ['date', 'id', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([0,1,8,9,11]).reset_index(drop=True)
date_df_with_missing_dates.loc[3, 'date'] = date_df_with_missing_dates.loc[2, 'date']
display(date_df_with_missing_dates)
test_fail(add_missing_timestamps, args=[date_df_with_missing_dates, 'date'], kwargs=dict(groupby='id', fill_value=np.nan, range_by_group=True, freq='4H'),
contains='cannot handle a non-unique multi-index!')
date | id | feature1 | feature2 | |
---|---|---|---|---|
0 | 2021-05-01 08:00:00 | 0 | 0.040345 | 0.312874 |
1 | 2021-05-01 12:00:00 | 0 | 0.713424 | 0.597211 |
2 | 2021-05-01 16:00:00 | 0 | 0.468382 | 0.652314 |
3 | 2021-05-01 16:00:00 | 0 | 0.396691 | 0.605664 |
4 | 2021-05-01 00:00:00 | 1 | 0.804646 | 0.964115 |
5 | 2021-05-01 04:00:00 | 1 | 0.089925 | 0.072410 |
6 | 2021-05-01 16:00:00 | 1 | 0.830786 | 0.560658 |
# groupby='id', range_by_group=False, with duplicate dates ==> FAILS
dates = pd.date_range('2021-05-01 00:00', '2021-05-01 20:00', freq='4H').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 4))
data[:, 0] = dates
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))
data[:, 3] = np.random.rand(len(dates))
cols = ['date', 'id', 'feature1', 'feature2']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float, 'feature2': float})
date_df_with_missing_dates = date_df.drop([0,1,8,9,11]).reset_index(drop=True)
date_df_with_missing_dates.loc[3, 'date'] = date_df_with_missing_dates.loc[2, 'date']
display(date_df_with_missing_dates)
test_fail(add_missing_timestamps, args=[date_df_with_missing_dates, 'date'], kwargs=dict(groupby='id', fill_value=np.nan, range_by_group=False, freq='4H'),
contains='cannot handle a non-unique multi-index!')
date | id | feature1 | feature2 | |
---|---|---|---|---|
0 | 2021-05-01 08:00:00 | 0 | 0.448508 | 0.953596 |
1 | 2021-05-01 12:00:00 | 0 | 0.868802 | 0.526845 |
2 | 2021-05-01 16:00:00 | 0 | 0.223070 | 0.304842 |
3 | 2021-05-01 16:00:00 | 0 | 0.645661 | 0.270956 |
4 | 2021-05-01 00:00:00 | 1 | 0.017250 | 0.787757 |
5 | 2021-05-01 04:00:00 | 1 | 0.783341 | 0.608269 |
6 | 2021-05-01 16:00:00 | 1 | 0.426247 | 0.926149 |
time_encoding (series, freq, max_val=None)
Transforms a pandas series of dtype datetime64 (of any freq) or DatetimeIndex into 2 float arrays
Available options: microsecond, millisecond, second, minute, hour, day = day_of_month = dayofmonth, day_of_week = weekday = dayofweek, day_of_year = dayofyear, week = week_of_year = weekofyear, month and year
for freq in ['microsecond', 'second', 'minute', 'hour', 'day', 'dayofweek', 'dayofyear', 'month']:
tdf = pd.DataFrame(pd.date_range('2021-03-01', dt.datetime.today()), columns=['date'])
a,b = time_encoding(tdf.date, freq=freq)
plt.plot(a)
plt.plot(b)
plt.title(freq)
plt.show()
for freq in ['microsecond', 'second', 'minute', 'hour', 'day', 'dayofweek', 'dayofyear', 'month']:
dateindex = pd.date_range('2021-03-01', dt.datetime.today())
a,b = time_encoding(dateindex, freq=freq)
plt.plot(a)
plt.plot(b)
plt.title(freq)
plt.show()
dow_sin, dow_cos = time_encoding(date_df['date'], 'dayofweek')
plt.plot(dow_sin)
plt.plot(dow_cos)
plt.title('DayOfWeek')
plt.show()
date_df['dow_sin'] = dow_sin
date_df['dow_cos'] = dow_cos
date_df
date | id | feature1 | feature2 | dow_sin | dow_cos | |
---|---|---|---|---|---|---|
0 | 2021-05-01 00:00:00 | 0 | 0.773597 | 0.465634 | -0.974928 | -0.222521 |
1 | 2021-05-01 04:00:00 | 0 | 0.265526 | 0.963753 | -0.974928 | -0.222521 |
2 | 2021-05-01 08:00:00 | 0 | 0.448508 | 0.953596 | -0.974928 | -0.222521 |
3 | 2021-05-01 12:00:00 | 0 | 0.868802 | 0.526845 | -0.974928 | -0.222521 |
4 | 2021-05-01 16:00:00 | 0 | 0.223070 | 0.304842 | -0.974928 | -0.222521 |
5 | 2021-05-01 20:00:00 | 0 | 0.645661 | 0.270956 | -0.974928 | -0.222521 |
6 | 2021-05-01 00:00:00 | 1 | 0.017250 | 0.787757 | -0.974928 | -0.222521 |
7 | 2021-05-01 04:00:00 | 1 | 0.783341 | 0.608269 | -0.974928 | -0.222521 |
8 | 2021-05-01 08:00:00 | 1 | 0.629875 | 0.170726 | -0.974928 | -0.222521 |
9 | 2021-05-01 12:00:00 | 1 | 0.302927 | 0.682136 | -0.974928 | -0.222521 |
10 | 2021-05-01 16:00:00 | 1 | 0.426247 | 0.926149 | -0.974928 | -0.222521 |
11 | 2021-05-01 20:00:00 | 1 | 0.830624 | 0.543715 | -0.974928 | -0.222521 |
get_gaps (o:torch.Tensor, forward:bool=True, backward:bool=True, nearest:bool=True, normalize:bool=True)
Number of sequence steps from previous, to next and/or to nearest real value along the last dimension of 3D arrays or tensors
nearest_gaps (o, normalize=True)
Number of sequence steps to nearest real value along the last dimension of 3D arrays or tensors
backward_gaps (o, normalize=True)
Number of sequence steps to next real value along the last dimension of 3D arrays or tensors
forward_gaps (o, normalize=True)
Number of sequence steps since previous real value along the last dimension of 3D arrays or tensors
t = torch.rand(1, 2, 8)
arr = t.numpy()
t[t <.6] = np.nan
test_ge(nearest_gaps(t).min().item(), 0)
test_ge(nearest_gaps(arr).min(), 0)
test_le(nearest_gaps(t).min().item(), 1)
test_le(nearest_gaps(arr).min(), 1)
test_eq(torch.isnan(forward_gaps(t)).sum(), 0)
test_eq(np.isnan(forward_gaps(arr)).sum(), 0)
ag = get_gaps(t)
test_eq(ag.shape, (1,6,8))
test_eq(torch.isnan(ag).sum(), 0)
add_delta_timestamp_cols (df, cols=None, groupby=None, forward=True, backward=True, nearest=True, normalize=True)
# Add delta timestamp features for the no groups setting
dates = pd.date_range('2021-05-01', '2021-05-07').values
data = np.zeros((len(dates), 2))
data[:, 0] = dates
data[:, 1] = np.random.rand(len(dates))
cols = ['date', 'feature1']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'feature1': float})
date_df.loc[[1,3,4],'feature1'] = np.nan
date_df
date | feature1 | |
---|---|---|
0 | 2021-05-01 | 0.132532 |
1 | 2021-05-02 | NaN |
2 | 2021-05-03 | 0.403176 |
3 | 2021-05-04 | NaN |
4 | 2021-05-05 | NaN |
5 | 2021-05-06 | 0.179554 |
6 | 2021-05-07 | 0.446536 |
# No groups
expected_output_df = date_df.copy()
expected_output_df['feature1_dt_fwd'] = np.array([1,1,2,1,2,3,1])
expected_output_df['feature1_dt_bwd'] = np.array([2,1,3,2,1,1,1])
expected_output_df['feature1_dt_nearest'] = np.array([1,1,2,1,1,1,1])
display(expected_output_df)
output_df = add_delta_timestamp_cols(date_df, cols='feature1', normalize=False)
test_eq(expected_output_df, output_df)
date | feature1 | feature1_dt_fwd | feature1_dt_bwd | feature1_dt_nearest | |
---|---|---|---|---|---|
0 | 2021-05-01 | 0.132532 | 1 | 2 | 1 |
1 | 2021-05-02 | NaN | 1 | 1 | 1 |
2 | 2021-05-03 | 0.403176 | 2 | 3 | 2 |
3 | 2021-05-04 | NaN | 1 | 2 | 1 |
4 | 2021-05-05 | NaN | 2 | 1 | 1 |
5 | 2021-05-06 | 0.179554 | 3 | 1 | 1 |
6 | 2021-05-07 | 0.446536 | 1 | 1 | 1 |
# Add delta timestamp features within a group
dates = pd.date_range('2021-05-01', '2021-05-07').values
dates = np.concatenate((dates, dates))
data = np.zeros((len(dates), 3))
data[:, 0] = dates
data[:, 1] = np.array([0]*(len(dates)//2)+[1]*(len(dates)//2))
data[:, 2] = np.random.rand(len(dates))
cols = ['date', 'id', 'feature1']
date_df = pd.DataFrame(data, columns=cols).astype({'date': 'datetime64[ns]', 'id': int, 'feature1': float})
date_df.loc[[1,3,4,8,9,11],'feature1'] = np.nan
date_df
date | id | feature1 | |
---|---|---|---|
0 | 2021-05-01 | 0 | 0.405327 |
1 | 2021-05-02 | 0 | NaN |
2 | 2021-05-03 | 0 | 0.055934 |
3 | 2021-05-04 | 0 | NaN |
4 | 2021-05-05 | 0 | NaN |
5 | 2021-05-06 | 0 | 0.698408 |
6 | 2021-05-07 | 0 | 0.064831 |
7 | 2021-05-01 | 1 | 0.407541 |
8 | 2021-05-02 | 1 | NaN |
9 | 2021-05-03 | 1 | NaN |
10 | 2021-05-04 | 1 | 0.113590 |
11 | 2021-05-05 | 1 | NaN |
12 | 2021-05-06 | 1 | 0.548088 |
13 | 2021-05-07 | 1 | 0.348813 |
# groupby='id'
expected_output_df = date_df.copy()
expected_output_df['feature1_dt_fwd'] = np.array([1,1,2,1,2,3,1,1,1,2,3,1,2,1])
expected_output_df['feature1_dt_bwd'] = np.array([2,1,3,2,1,1,1,3,2,1,2,1,1,1])
expected_output_df['feature1_dt_nearest'] = np.array([1,1,2,1,1,1,1,1,1,1,2,1,1,1])
display(expected_output_df)
output_df = add_delta_timestamp_cols(date_df, cols='feature1', groupby='id', normalize=False)
test_eq(expected_output_df, output_df)
date | id | feature1 | feature1_dt_fwd | feature1_dt_bwd | feature1_dt_nearest | |
---|---|---|---|---|---|---|
0 | 2021-05-01 | 0 | 0.405327 | 1 | 2 | 1 |
1 | 2021-05-02 | 0 | NaN | 1 | 1 | 1 |
2 | 2021-05-03 | 0 | 0.055934 | 2 | 3 | 2 |
3 | 2021-05-04 | 0 | NaN | 1 | 2 | 1 |
4 | 2021-05-05 | 0 | NaN | 2 | 1 | 1 |
5 | 2021-05-06 | 0 | 0.698408 | 3 | 1 | 1 |
6 | 2021-05-07 | 0 | 0.064831 | 1 | 1 | 1 |
7 | 2021-05-01 | 1 | 0.407541 | 1 | 3 | 1 |
8 | 2021-05-02 | 1 | NaN | 1 | 2 | 1 |
9 | 2021-05-03 | 1 | NaN | 2 | 1 | 1 |
10 | 2021-05-04 | 1 | 0.113590 | 3 | 2 | 2 |
11 | 2021-05-05 | 1 | NaN | 1 | 1 | 1 |
12 | 2021-05-06 | 1 | 0.548088 | 2 | 1 | 1 |
13 | 2021-05-07 | 1 | 0.348813 | 1 | 1 | 1 |
SlidingWindow and SlidingWindowPanel are two useful functions that allow you to create an array of segments from a pandas dataframe (or other array-like input) based on multiple criteria.
SlidingWindow (window_len:int, stride:Optional[int]=1, start:int=0, pad_remainder:bool=False, padding:str='post', padding_value:float=nan, add_padding_feature:bool=True, get_x:Union[NoneType,int,list]=None, get_y:Union[NoneType,int,list]=None, y_func:Optional[callable]=None, output_processor:Optional[callable]=None, copy:bool=False, horizon:Union[int,list]=1, seq_first:bool=True, sort_by:Optional[list]=None, ascending:bool=True, check_leakage:bool=True)
Applies a sliding window to a 1d or 2d input (np.ndarray, torch.Tensor or pd.DataFrame)
Input:
You can use np.ndarray, pd.DataFrame or torch.Tensor as input
shape: (seq_len, ) or (seq_len, n_vars) if seq_first=True else (n_vars, seq_len)
Type | Default | Details | |
---|---|---|---|
window_len | int | length of lookback window | |
stride | Union[None, int] | 1 | n datapoints the window is moved ahead along the sequence. Default: 1. If None, stride=window_len (no overlap) |
start | int | 0 | determines the step where the first window is applied: 0 (default) or a given step (int). Previous steps will be discarded. |
pad_remainder | bool | False | allows padding of remainder subsequences when the sliding window is applied and get_y == [] (unlabeled data). |
padding | str | post | 'pre' or 'post' (optional, defaults to 'post'): pad either before or after each sequence. If pad_remainder == False, it indicates the starting point to create the sequence ('pre' from the end, and 'post' from the beginning) |
padding_value | float | nan | value (float) that will be used for padding. Default: np.nan |
add_padding_feature | bool | True | add an additional feature indicating whether each timestep is padded (1) or not (0). |
get_x | Union[None, int, list] | None | indices of columns that contain the independent variable (xs). If None, all data will be used as x. |
get_y | Union[None, int, list] | None | indices of columns that contain the target (ys). If None, all data will be used as y. [] means no y data is created (unlabeled data). |
y_func | Optional[callable] | None | optional function to calculate the ys based on the get_y col/s and each y sub-window. y_func must be a function applied to axis=1! |
output_processor | Optional[callable] | None | optional function to process the final output (X (and y if available)). This is useful when some values need to be removed. The function should take X and y (even if it's None) as arguments. |
copy | bool | False | copy the original object to avoid changes in it. |
horizon | Union[int, list] | 1 | number of future datapoints to predict (y). If get_y is [] horizon will be set to 0. |
seq_first | bool | True | True if input shape (seq_len, n_vars), False if input shape (n_vars, seq_len) |
sort_by | Optional[list] | None | column/s used for sorting the array in ascending order |
ascending | bool | True | used in sorting |
check_leakage | bool | True | checks if there’s leakage in the output between X and y |
wl = 5
stride = 5
t = np.repeat(np.arange(13).reshape(-1,1), 3, axis=-1)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=stride, pad_remainder=True, get_y=[])(t)
X
input shape: (13, 3)
array([[[ 0., 1., 2., 3., 4.],
[ 0., 1., 2., 3., 4.],
[ 0., 1., 2., 3., 4.],
[ 0., 0., 0., 0., 0.]],
[[ 5., 6., 7., 8., 9.],
[ 5., 6., 7., 8., 9.],
[ 5., 6., 7., 8., 9.],
[ 0., 0., 0., 0., 0.]],
[[10., 11., 12., nan, nan],
[10., 11., 12., nan, nan],
[10., 11., 12., nan, nan],
[ 0., 0., 0., 1., 1.]]])
wl = 5
t = np.arange(10)
print('input shape:', t.shape)
X, y = SlidingWindow(wl)(t)
test_eq(X.shape[1:], (1, wl))
itemify(X,)
input shape: (10,)
(#5) [(array([[0, 1, 2, 3, 4]]),),(array([[1, 2, 3, 4, 5]]),),(array([[2, 3, 4, 5, 6]]),),(array([[3, 4, 5, 6, 7]]),),(array([[4, 5, 6, 7, 8]]),)]
wl = 5
h = 1
t = np.arange(10)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=1, horizon=h)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (10,)
[(array([[0, 1, 2, 3, 4]]), 5), (array([[1, 2, 3, 4, 5]]), 6), (array([[2, 3, 4, 5, 6]]), 7), (array([[3, 4, 5, 6, 7]]), 8), (array([[4, 5, 6, 7, 8]]), 9)]
wl = 5
h = 2 # 2 or more
t = np.arange(10)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, horizon=h)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, (2, ))
input shape: (10,)
[(array([[0, 1, 2, 3, 4]]), array([5, 6])), (array([[1, 2, 3, 4, 5]]), array([6, 7])), (array([[2, 3, 4, 5, 6]]), array([7, 8])), (array([[3, 4, 5, 6, 7]]), array([8, 9]))]
wl = 5
h = 2 # 2 or more
t = np.arange(10).reshape(1, -1)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=1, horizon=h, get_y=None, seq_first=False)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, (2, ))
input shape: (1, 10)
[(array([[0, 1, 2, 3, 4]]), array([5, 6])), (array([[1, 2, 3, 4, 5]]), array([6, 7])), (array([[2, 3, 4, 5, 6]]), array([7, 8])), (array([[3, 4, 5, 6, 7]]), array([8, 9]))]
wl = 5
h = 2 # 2 or more
t = np.arange(10).reshape(1, -1)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=1, horizon=h, seq_first=False)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
input shape: (1, 10)
[(array([[0, 1, 2, 3, 4]]), array([5, 6])), (array([[1, 2, 3, 4, 5]]), array([6, 7])), (array([[2, 3, 4, 5, 6]]), array([7, 8])), (array([[3, 4, 5, 6, 7]]), array([8, 9]))]
wl = 5
t = np.arange(10).reshape(1, -1)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=3, horizon=1, get_y=None, seq_first=False)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (1, 10)
[(array([[0, 1, 2, 3, 4]]), 5), (array([[3, 4, 5, 6, 7]]), 8)]
wl = 5
start = 3
t = np.arange(20)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=None, horizon=1, start=start)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (20,)
[(array([[3, 4, 5, 6, 7]]), 8), (array([[ 8, 9, 10, 11, 12]]), 13), (array([[13, 14, 15, 16, 17]]), 18)]
wl = 5
t = np.arange(20)
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=['var'])
display(df)
X, y = SlidingWindow(wl, stride=None, horizon=1, get_y=None)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (20,)
[(array([[0, 1, 2, 3, 4]]), 5), (array([[5, 6, 7, 8, 9]]), 10), (array([[10, 11, 12, 13, 14]]), 15)]
var | |
---|---|
0 | 0 |
1 | 1 |
2 | 2 |
3 | 3 |
4 | 4 |
5 | 5 |
6 | 6 |
7 | 7 |
8 | 8 |
9 | 9 |
10 | 10 |
11 | 11 |
12 | 12 |
13 | 13 |
14 | 14 |
15 | 15 |
16 | 16 |
17 | 17 |
18 | 18 |
19 | 19 |
wl = 5
t = np.arange(20)
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=['var'])
display(df)
X, y = SlidingWindow(wl, stride=1, horizon=1, get_y=None)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (20,)
[(array([[0, 1, 2, 3, 4]]), 5), (array([[1, 2, 3, 4, 5]]), 6), (array([[2, 3, 4, 5, 6]]), 7), (array([[3, 4, 5, 6, 7]]), 8), (array([[4, 5, 6, 7, 8]]), 9), (array([[5, 6, 7, 8, 9]]), 10), (array([[ 6, 7, 8, 9, 10]]), 11), (array([[ 7, 8, 9, 10, 11]]), 12), (array([[ 8, 9, 10, 11, 12]]), 13), (array([[ 9, 10, 11, 12, 13]]), 14), (array([[10, 11, 12, 13, 14]]), 15), (array([[11, 12, 13, 14, 15]]), 16), (array([[12, 13, 14, 15, 16]]), 17), (array([[13, 14, 15, 16, 17]]), 18), (array([[14, 15, 16, 17, 18]]), 19)]
var | |
---|---|
0 | 0 |
1 | 1 |
2 | 2 |
3 | 3 |
4 | 4 |
5 | 5 |
6 | 6 |
7 | 7 |
8 | 8 |
9 | 9 |
10 | 10 |
11 | 11 |
12 | 12 |
13 | 13 |
14 | 14 |
15 | 15 |
16 | 16 |
17 | 17 |
18 | 18 |
19 | 19 |
wl = 5
t = np.arange(20)
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=['var']).T
display(df)
X, y = SlidingWindow(wl, stride=None, horizon=1, get_y=None, seq_first=False)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())
input shape: (20,)
[(array([[0, 1, 2, 3, 4]]), 5), (array([[5, 6, 7, 8, 9]]), 10), (array([[10, 11, 12, 13, 14]]), 15)]
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
var | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 |
wl = 5
n_vars = 3
t = (torch.stack(n_vars * [torch.arange(10)]).T * tensor([1, 10, 100]))
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=[f'var_{i}' for i in range(n_vars)])
display(df)
X, y = SlidingWindow(wl, horizon=1)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (n_vars, wl))
input shape: torch.Size([10, 3])
[(array([[ 0, 1, 2, 3, 4],
[ 0, 10, 20, 30, 40],
[ 0, 100, 200, 300, 400]]), array([ 5, 50, 500])), (array([[ 1, 2, 3, 4, 5],
[ 10, 20, 30, 40, 50],
[100, 200, 300, 400, 500]]), array([ 6, 60, 600])), (array([[ 2, 3, 4, 5, 6],
[ 20, 30, 40, 50, 60],
[200, 300, 400, 500, 600]]), array([ 7, 70, 700])), (array([[ 3, 4, 5, 6, 7],
[ 30, 40, 50, 60, 70],
[300, 400, 500, 600, 700]]), array([ 8, 80, 800])), (array([[ 4, 5, 6, 7, 8],
[ 40, 50, 60, 70, 80],
[400, 500, 600, 700, 800]]), array([ 9, 90, 900]))]
var_0 | var_1 | var_2 | |
---|---|---|---|
0 | 0 | 0 | 0 |
1 | 1 | 10 | 100 |
2 | 2 | 20 | 200 |
3 | 3 | 30 | 300 |
4 | 4 | 40 | 400 |
5 | 5 | 50 | 500 |
6 | 6 | 60 | 600 |
7 | 7 | 70 | 700 |
8 | 8 | 80 | 800 |
9 | 9 | 90 | 900 |
wl = 5
n_vars = 3
t = (torch.stack(n_vars * [torch.arange(10)]).T * tensor([1, 10, 100]))
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=[f'var_{i}' for i in range(n_vars)])
display(df)
X, y = SlidingWindow(wl, horizon=1, get_y="var_0")(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (n_vars, wl))
input shape: torch.Size([10, 3])
[(array([[ 0, 1, 2, 3, 4],
[ 0, 10, 20, 30, 40],
[ 0, 100, 200, 300, 400]]), 5), (array([[ 1, 2, 3, 4, 5],
[ 10, 20, 30, 40, 50],
[100, 200, 300, 400, 500]]), 6), (array([[ 2, 3, 4, 5, 6],
[ 20, 30, 40, 50, 60],
[200, 300, 400, 500, 600]]), 7), (array([[ 3, 4, 5, 6, 7],
[ 30, 40, 50, 60, 70],
[300, 400, 500, 600, 700]]), 8), (array([[ 4, 5, 6, 7, 8],
[ 40, 50, 60, 70, 80],
[400, 500, 600, 700, 800]]), 9)]
var_0 | var_1 | var_2 | |
---|---|---|---|
0 | 0 | 0 | 0 |
1 | 1 | 10 | 100 |
2 | 2 | 20 | 200 |
3 | 3 | 30 | 300 |
4 | 4 | 40 | 400 |
5 | 5 | 50 | 500 |
6 | 6 | 60 | 600 |
7 | 7 | 70 | 700 |
8 | 8 | 80 | 800 |
9 | 9 | 90 | 900 |
wl = 5
n_vars = 3
t = (torch.stack(n_vars * [torch.arange(10)]).T * tensor([1, 10, 100]))
print('input shape:', t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
X, y = SlidingWindow(wl, horizon=1, get_x=columns[:-1], get_y='target')(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (n_vars-1, wl))
test_eq(items[0][1].shape, ())
input shape: torch.Size([10, 3])
[(array([[ 0, 1, 2, 3, 4],
[ 0, 10, 20, 30, 40]]), 500), (array([[ 1, 2, 3, 4, 5],
[10, 20, 30, 40, 50]]), 600), (array([[ 2, 3, 4, 5, 6],
[20, 30, 40, 50, 60]]), 700), (array([[ 3, 4, 5, 6, 7],
[30, 40, 50, 60, 70]]), 800), (array([[ 4, 5, 6, 7, 8],
[40, 50, 60, 70, 80]]), 900)]
var_0 | var_1 | target | |
---|---|---|---|
0 | 0 | 0 | 0 |
1 | 1 | 10 | 100 |
2 | 2 | 20 | 200 |
3 | 3 | 30 | 300 |
4 | 4 | 40 | 400 |
5 | 5 | 50 | 500 |
6 | 6 | 60 | 600 |
7 | 7 | 70 | 700 |
8 | 8 | 80 | 800 |
9 | 9 | 90 | 900 |
n_vars = 3
t = (np.random.rand(1000, n_vars) - .5).cumsum(0)
print(t.shape)
plt.plot(t)
plt.show()
X, y = SlidingWindow(5, stride=None, horizon=0, get_x=[0,1], get_y=2)(t)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(1000, 3)
(200, 2, 5) (200,)
wl = 5
n_vars = 3
t = (np.random.rand(100, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
X, y = SlidingWindow(5, horizon=0, get_x=columns[:-1], get_y='target')(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(100, 3)
(96, 2, 5) (96,)
var_0 | var_1 | target | |
---|---|---|---|
0 | 0.154072 | 0.197194 | -0.083179 |
1 | 0.402744 | -0.248788 | -0.560573 |
2 | 0.448209 | 0.224215 | -0.681264 |
3 | 0.631502 | 0.406760 | -1.162043 |
4 | 1.099973 | 0.179926 | -0.712690 |
... | ... | ... | ... |
95 | -0.405079 | 3.662311 | -2.779159 |
96 | -0.445625 | 3.488809 | -2.663381 |
97 | -0.187349 | 3.304898 | -2.695971 |
98 | -0.100652 | 3.505663 | -2.590652 |
99 | 0.371272 | 3.279901 | -2.764369 |
100 rows × 3 columns
seq_len = 100
n_vars = 5
t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
X, y = SlidingWindow(5, stride=1, horizon=0, get_x=columns[:-1], get_y='target', seq_first=True)(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(100, 5)
(96, 4, 5) (96,)
var_0 | var_1 | var_2 | var_3 | target | |
---|---|---|---|---|---|
0 | 0.443639 | -0.288128 | -0.049732 | 0.288915 | 0.325872 |
1 | -0.047608 | -0.009738 | 0.056768 | 0.541395 | 0.017496 |
2 | -0.243972 | 0.102227 | 0.361387 | 0.628397 | 0.049012 |
3 | -0.721266 | 0.045104 | 0.724062 | 0.940693 | 0.510875 |
4 | -0.641269 | 0.141927 | 0.793837 | 1.158903 | 0.417040 |
... | ... | ... | ... | ... | ... |
95 | 3.488117 | 2.345512 | 0.745483 | 0.258568 | 2.468550 |
96 | 3.187006 | 1.945844 | 0.833228 | 0.511198 | 2.115330 |
97 | 3.019862 | 1.739802 | 0.488732 | 0.881324 | 2.387837 |
98 | 3.314247 | 1.992000 | 0.119230 | 0.797794 | 2.327720 |
99 | 3.394578 | 2.012458 | 0.003244 | 0.387125 | 2.345970 |
100 rows × 5 columns
seq_len = 100
n_vars = 5
t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)] + ['target']
df = pd.DataFrame(t, columns=columns).T
display(df)
X, y = SlidingWindow(5, stride=1, horizon=0, get_x=columns[:-1], get_y='target', seq_first=False)(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(100, 5)
(96, 4, 5) (96,)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
var_0 | -0.407162 | -0.742169 | -1.193053 | -1.058644 | -0.721243 | -1.056788 | -1.316226 | -1.247859 | -1.391482 | -1.258618 | ... | -2.847911 | -3.118643 | -3.444248 | -3.036050 | -2.664068 | -2.473782 | -2.508080 | -2.878210 | -2.841170 | -2.688932 |
var_1 | 0.111643 | -0.286318 | -0.221917 | -0.026094 | -0.332200 | -0.376518 | -0.144763 | 0.225361 | 0.487134 | 0.435856 | ... | 1.569158 | 1.294548 | 1.564455 | 1.501243 | 1.490928 | 1.450602 | 1.440730 | 1.755607 | 1.380986 | 1.236284 |
var_2 | -0.126951 | -0.484267 | -0.480375 | -0.706987 | -0.571379 | -0.561959 | -0.717696 | -0.586035 | -0.298053 | -0.047405 | ... | -1.748096 | -1.508691 | -1.158258 | -1.116485 | -1.153738 | -1.575450 | -1.875091 | -1.613255 | -1.274859 | -1.592096 |
var_3 | -0.462238 | -0.748774 | -0.625473 | -0.360442 | -0.789178 | -0.530832 | -0.785290 | -0.413452 | 0.083685 | -0.110964 | ... | -4.873450 | -4.382297 | -4.531454 | -4.087051 | -4.087801 | -4.391084 | -4.262526 | -4.650170 | -4.465874 | -4.535273 |
target | 0.241454 | 0.084139 | -0.012974 | 0.096328 | 0.501035 | 0.697043 | 0.229185 | 0.497430 | 0.552922 | 0.218345 | ... | -4.582426 | -4.194067 | -3.785398 | -3.808516 | -3.629740 | -3.398645 | -3.828007 | -3.600028 | -3.614195 | -3.592783 |
5 rows × 100 columns
seq_len = 100
n_vars = 5
t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)] + ['target']
df = pd.DataFrame(t, columns=columns).T
display(df)
X, y = SlidingWindow(5, stride=None, horizon=0, get_x=columns[:-1], get_y='target', seq_first=False)(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)
(100, 5)
(20, 4, 5) (20,)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
var_0 | 0.210943 | -0.264863 | -0.307942 | 0.176782 | -0.188244 | 0.118824 | 0.593353 | 0.611408 | 0.176396 | 0.566034 | ... | -4.738294 | -5.138743 | -5.203979 | -4.835758 | -4.534974 | -4.310112 | -4.366365 | -4.328250 | -4.527717 | -4.432726 |
var_1 | -0.086375 | -0.457413 | 0.025571 | 0.428256 | 0.611573 | 0.319714 | -0.085129 | 0.161735 | 0.052730 | -0.356617 | ... | 7.203539 | 7.300534 | 7.267954 | 6.838923 | 7.054134 | 6.612532 | 7.108269 | 6.966000 | 7.407915 | 7.332567 |
var_2 | 0.166139 | -0.231839 | -0.468804 | -0.565628 | -0.500941 | -0.706951 | -0.881385 | -1.138549 | -0.978276 | -0.952727 | ... | 0.391942 | 0.802356 | 0.395688 | 0.033288 | 0.147283 | 0.589911 | 0.360847 | 0.322019 | 0.478120 | 0.278228 |
var_3 | -0.234297 | -0.467480 | -0.925036 | -0.572783 | -0.345585 | 0.149537 | -0.078098 | -0.577732 | -0.771975 | -0.322283 | ... | -1.487032 | -1.971348 | -2.300616 | -2.767312 | -2.657974 | -2.880908 | -2.567235 | -2.758240 | -2.605518 | -2.166444 |
target | -0.416187 | -0.164800 | -0.283554 | -0.534897 | -0.896808 | -0.456572 | -0.889556 | -1.178456 | -0.877891 | -1.176442 | ... | -6.094650 | -6.510793 | -6.408799 | -6.685696 | -6.672726 | -6.210781 | -6.377436 | -5.974001 | -5.755187 | -5.608240 |
5 rows × 100 columns
seq_len = 100
n_vars = 5
t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
X, y = SlidingWindow(5, stride=1, horizon=0, get_x=columns[:-1], get_y='target', seq_first=True)(df)
splits = TrainValidTestSplitter(valid_size=.2, shuffle=False)(y)
X.shape, y.shape, splits
(100, 5)
var_0 | var_1 | var_2 | var_3 | target | |
---|---|---|---|---|---|
0 | 0.123248 | -0.081596 | 0.099444 | 0.447980 | -0.397975 |
1 | 0.469671 | -0.334499 | 0.307867 | 0.141345 | -0.131085 |
2 | 0.522902 | -0.696817 | 0.386597 | 0.156818 | 0.128043 |
3 | 0.487025 | -0.966153 | -0.050574 | -0.248479 | -0.088962 |
4 | 0.396284 | -1.319821 | -0.113121 | -0.379227 | 0.313690 |
... | ... | ... | ... | ... | ... |
95 | 6.138836 | -1.602917 | 1.713049 | 1.421797 | -1.873899 |
96 | 5.892472 | -1.896914 | 1.401137 | 1.065859 | -2.239942 |
97 | 5.421917 | -1.728568 | 1.481270 | 0.998533 | -2.157474 |
98 | 5.763120 | -1.404330 | 1.931361 | 1.295956 | -1.934397 |
99 | 5.827842 | -1.762438 | 1.831712 | 1.014259 | -1.831573 |
100 rows × 5 columns
((96, 4, 5),
(96,),
((#77) [0,1,2,3,4,5,6,7,8,9...], (#19) [77,78,79,80,81,82,83,84,85,86...]))
data = np.concatenate([np.linspace(0, 1, 11).reshape(-1,1).repeat(2, 1), np.arange(11).reshape(-1,1)], -1)
df_test = pd.DataFrame(data, columns=['col1', 'col2', 'target'])
df_test['target'] = df_test['target'].astype(int)
df_test
col1 | col2 | target | |
---|---|---|---|
0 | 0.0 | 0.0 | 0 |
1 | 0.1 | 0.1 | 1 |
2 | 0.2 | 0.2 | 2 |
3 | 0.3 | 0.3 | 3 |
4 | 0.4 | 0.4 | 4 |
5 | 0.5 | 0.5 | 5 |
6 | 0.6 | 0.6 | 6 |
7 | 0.7 | 0.7 | 7 |
8 | 0.8 | 0.8 | 8 |
9 | 0.9 | 0.9 | 9 |
10 | 1.0 | 1.0 | 10 |
df = pd.DataFrame()
df['sample_id'] = np.concatenate([np.ones(n)*(i + 1) for i,n in enumerate([13])])
df['var1'] = df['sample_id'] + df.index.values - 1
df['var2'] = df['var1'] * 10
df['target'] = (df['var1']).astype(int)
df['sample_id'] = df['sample_id'].astype(int)
df
sample_id | var1 | var2 | target | |
---|---|---|---|---|
0 | 1 | 0.0 | 0.0 | 0 |
1 | 1 | 1.0 | 10.0 | 1 |
2 | 1 | 2.0 | 20.0 | 2 |
3 | 1 | 3.0 | 30.0 | 3 |
4 | 1 | 4.0 | 40.0 | 4 |
5 | 1 | 5.0 | 50.0 | 5 |
6 | 1 | 6.0 | 60.0 | 6 |
7 | 1 | 7.0 | 70.0 | 7 |
8 | 1 | 8.0 | 80.0 | 8 |
9 | 1 | 9.0 | 90.0 | 9 |
10 | 1 | 10.0 | 100.0 | 10 |
11 | 1 | 11.0 | 110.0 | 11 |
12 | 1 | 12.0 | 120.0 | 12 |
X, y = SlidingWindow(window_len=3, stride=2, start=3, pad_remainder=False, padding="pre", padding_value=np.nan, add_padding_feature=False,
get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
ascending=True, check_leakage=True)(df)
test_eq(X.shape, (2, 2, 3))
test_eq(y.shape, (2, 4))
X, y
(array([[[ 4., 5., 6.],
[40., 50., 60.]],
[[ 6., 7., 8.],
[60., 70., 80.]]]),
array([[ 7, 8, 9, 10],
[ 9, 10, 11, 12]]))
X, y = SlidingWindow(window_len=3, stride=2, start=3, pad_remainder=True, padding="pre", padding_value=np.nan, add_padding_feature=False,
get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
ascending=True, check_leakage=True)(df)
test_eq(X.shape, (3, 2, 3))
test_eq(y.shape, (3, 4))
X, y
(array([[[nan, 3., 4.],
[nan, 30., 40.]],
[[ 4., 5., 6.],
[40., 50., 60.]],
[[ 6., 7., 8.],
[60., 70., 80.]]]),
array([[ 5, 6, 7, 8],
[ 7, 8, 9, 10],
[ 9, 10, 11, 12]]))
X, y = SlidingWindow(window_len=3, stride=2, start=3, pad_remainder=False, padding="post", padding_value=np.nan, add_padding_feature=False,
get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
ascending=True, check_leakage=True)(df)
test_eq(X.shape, (2, 2, 3))
test_eq(y.shape, (2, 4))
X, y
(array([[[ 3., 4., 5.],
[30., 40., 50.]],
[[ 5., 6., 7.],
[50., 60., 70.]]]),
array([[ 6, 7, 8, 9],
[ 8, 9, 10, 11]]))
X, y = SlidingWindow(window_len=3, stride=2, start=3, pad_remainder=True, padding="post", padding_value=np.nan, add_padding_feature=False,
get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
ascending=True, check_leakage=True)(df)
test_eq(X.shape, (3, 2, 3))
test_eq(y.shape, (3, 4))
X, y
(array([[[ 3., 4., 5.],
[30., 40., 50.]],
[[ 5., 6., 7.],
[50., 60., 70.]],
[[ 7., 8., 9.],
[70., 80., 90.]]]),
array([[ 6., 7., 8., 9.],
[ 8., 9., 10., 11.],
[10., 11., 12., nan]]))
X, y = SlidingWindow(window_len=10, stride=2, start=3, pad_remainder=True, padding="pre", padding_value=np.nan, add_padding_feature=False,
get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
ascending=True, check_leakage=True)(df)
test_eq(X.shape, (1, 2, 10))
test_eq(y.shape, (1, 4))
X, y
(array([[[nan, nan, nan, nan, 3., 4., 5., 6., 7., 8.],
[nan, nan, nan, nan, 30., 40., 50., 60., 70., 80.]]]),
array([[ 9, 10, 11, 12]]))
X, y = SlidingWindow(window_len=10, stride=2, start=3, pad_remainder=True, padding="post", padding_value=np.nan, add_padding_feature=False,
get_x=["var1", "var2"], get_y=["target"], y_func=None, output_processor=None, copy=False, horizon=4, seq_first=True, sort_by=None,
ascending=True, check_leakage=True)(df)
test_eq(X.shape, (1, 2, 10))
test_eq(y.shape, (1, 4))
X, y
(array([[[ 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.],
[ 30., 40., 50., 60., 70., 80., 90., 100., 110., 120.]]]),
array([[nan, nan, nan, nan]]))
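Before moving on to panel data, note that the window counts above can be worked out by hand; a quick sketch of the arithmetic (my own, not library code) for the window_len=3 calls:
seq_len, start, window_len, horizon, stride = 13, 3, 3, 4, 2
usable = seq_len - start                                   # 10 usable steps after start=3
full_windows = (usable - window_len - horizon) // stride + 1
test_eq(full_windows, 2)                                   # pad_remainder=False
test_eq(full_windows + 1, 3)                               # pad_remainder=True adds one padded window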
SlidingWindowPanel (window_len:int, unique_id_cols:list, stride:Optional[int]=1, start:int=0, pad_remainder:bool=False, padding:str='post', padding_value:float=nan, add_padding_feature:bool=True, get_x:Union[NoneType,int,list]=None, get_y:Union[NoneType,int,list]=None, y_func:Optional[callable]=None, output_processor:Optional[callable]=None, copy:bool=False, horizon:Union[int,list]=1, seq_first:bool=True, sort_by:Optional[list]=None, ascending:bool=True, check_leakage:bool=True, return_key:bool=False, verbose:bool=True)
Applies a sliding window to a pd.DataFrame containing multiple entities (panel data), with one independent time series per unique_id_cols key.
Args:
window_len = length of lookback window
unique_id_cols = pd.DataFrame columns that will be used to identify a time series for each entity.
stride = n datapoints the window is moved ahead along the sequence. Default: 1. If None, stride=window_len (no overlap).
start = determines the step where the first window is applied: 0 (default), a given step (int), or random within the 1st stride (None).
pad_remainder = allows padding remainder subsequences when the sliding window is applied and get_y == [] (unlabeled data).
padding = ‘pre’ or ‘post’ (optional, defaults to ‘post’ per the signature): pad either before or after each sequence. If pad_remainder == False, it indicates the starting point to create the sequence (‘pre’ from the end, and ‘post’ from the beginning).
padding_value = value (float) that will be used for padding. Default: np.nan
add_padding_feature = add an additional feature indicating whether each timestep is padded (1) or not (0).
horizon = number of future datapoints to predict (y). If get_y is [], horizon will be set to 0. Use 0 for the last step in each sub-window, n > 0 for a range of n future steps (1 to n), n < 0 for a range of n past steps (-n + 1 to 0), or a list for those exact timesteps.
get_x = indices of columns that contain the independent variables (xs). If None, all data will be used as x.
get_y = indices of columns that contain the target (ys). If None, all data will be used as y. [] means no y data is created (unlabeled data).
y_func = function to calculate the ys based on the get_y col/s and each y sub-window. y_func must be a function applied to axis=1!
output_processor = optional function to filter output (X (and y if available)). This is useful when some values need to be removed. The function should take X and y (even if it’s None) as arguments.
copy = copy the original object to avoid changes in it.
seq_first = True if input shape is (seq_len, n_vars), False if input shape is (n_vars, seq_len).
sort_by = column/s used for sorting the array in ascending order.
ascending = used in sorting.
check_leakage = checks if there’s leakage in the output between X and y.
return_key = when True, the key corresponding to unique_id_cols for each sample is returned.
verbose = controls verbosity. True or 1 displays a progress bar; 2 or more also shows records that cannot be created due to their length.
Input: you can use a np.ndarray, pd.DataFrame or torch.Tensor as input. Shape: (seq_len,) or (seq_len, n_vars) if seq_first=True, else (n_vars, seq_len).
samples = 100_000
wl = 5
n_vars = 10
t = (torch.stack(n_vars * [torch.arange(samples)]).T * tensor([10**i for i in range(n_vars)]))
df = pd.DataFrame(t, columns=[f'var_{i}' for i in range(n_vars)])
df['time'] = np.arange(len(t))
df['device'] = 0
df['target'] = np.random.randint(0, 2, len(df))
df2 = df.copy()
df3 = df.copy()
cols = ['var_0', 'var_1', 'var_2', 'device', 'target']
df2[cols] = df2[cols] + 1
df3[cols] = df3[cols] + 2
df2 = df2.loc[:3]
df['region'] = 'A'
df2['region'] = 'A'
df3['region'] = 'B'
df = pd.concat([df, df2, df3], ignore_index=True)
df['index'] = np.arange(len(df))
df = df.sample(frac=1).reset_index(drop=True)
display(df.head())
df.shape
var_0 | var_1 | var_2 | var_3 | var_4 | var_5 | var_6 | var_7 | var_8 | var_9 | time | device | target | region | index | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 86008 | 860080 | 8600800 | 86008000 | 860080000 | 8600800000 | 86008000000 | 860080000000 | 8600800000000 | 86008000000000 | 86008 | 0 | 0 | A | 86008 |
1 | 90003 | 900012 | 9000102 | 90001000 | 900010000 | 9000100000 | 90001000000 | 900010000000 | 9000100000000 | 90001000000000 | 90001 | 2 | 2 | B | 190005 |
2 | 43819 | 438172 | 4381702 | 43817000 | 438170000 | 4381700000 | 43817000000 | 438170000000 | 4381700000000 | 43817000000000 | 43817 | 2 | 3 | B | 143821 |
3 | 80751 | 807492 | 8074902 | 80749000 | 807490000 | 8074900000 | 80749000000 | 807490000000 | 8074900000000 | 80749000000000 | 80749 | 2 | 3 | B | 180753 |
4 | 84917 | 849152 | 8491502 | 84915000 | 849150000 | 8491500000 | 84915000000 | 849150000000 | 8491500000000 | 84915000000000 | 84915 | 2 | 3 | B | 184919 |
(200004, 15)
X, y = SlidingWindowPanel(window_len=5, unique_id_cols=['device'], stride=1, start=0, get_x=df.columns[:n_vars], get_y=['target'],
horizon=0, seq_first=True, sort_by=['time'], ascending=True, return_key=False)(df)
X.shape, y.shape
processing data...
...data processed
concatenating X...
...X concatenated
concatenating y...
...y concatenated
((199992, 10, 5), (199992,))
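The sample count follows from per-entity window arithmetic: devices 0 and 2 each contribute 100_000 - window_len + 1 = 99_996 windows, while device 1 (cut to 4 rows via df2.loc[:3]) is too short for a single 5-step window and is skipped:
test_eq(X.shape[0], 2 * (100_000 - 5 + 1))  # the 4-row entity yields no windows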
X, y, key = SlidingWindowPanel(window_len=5, unique_id_cols=['device'], stride=1, start=0, get_x=df.columns[:n_vars], get_y=['target'],
horizon=0, seq_first=True, sort_by=['time'], ascending=True, return_key=True)(df)
X.shape, y.shape, key.shape
processing data...
...data processed
concatenating X...
...X concatenated
concatenating y...
...y concatenated
((199992, 10, 5), (199992,), (199992,))
X, y = SlidingWindowPanel(window_len=5, unique_id_cols=['device', 'region'], stride=1, start=0, get_x=df.columns[:n_vars], get_y=['target'],
horizon=0, seq_first=True, sort_by=['time'], ascending=True)(df)
X.shape, y.shape
processing data...
...data processed
concatenating X...
...X concatenated
concatenating y...
...y concatenated
((199992, 10, 5), (199992,))
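The next call passes y_func=y_max, whose definition is not shown on this page. A minimal sketch of such a reducer (an assumption: per the docstring, y_func must reduce each y sub-window along axis=1) could look like this:
import numpy as np

def y_max(y):
    # Hypothetical reducer: collapse each (horizon,)-long target sub-window to its max.
    # y arrives as (n_samples, horizon), so the reduction runs along axis=1.
    return np.max(y, axis=1)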
X, y = SlidingWindowPanel(window_len=5, unique_id_cols=['device', 'region'], stride=1, start=0, get_x=df.columns[:n_vars], get_y=['target'],
y_func=y_max, horizon=5, seq_first=True, sort_by=['time'], ascending=True)(df)
X.shape, y.shape
processing data...
...data processed
concatenating X...
...X concatenated
concatenating y...
...y concatenated
((199982, 10, 5), (199982,))
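With horizon=5, each window additionally needs 5 future target steps, so each long entity now yields 100_000 - 5 - 5 + 1 = 99_991 windows:
test_eq(X.shape[0], 2 * (100_000 - 5 - 5 + 1))  # horizon consumes 5 extra steps per window
Also note that y has shape (199982,) rather than (199982, 5): y_func collapsed each 5-step target window to a scalar.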
identify_padding (float_mask, value=-1)
Identifies padded subsequences in a mask of type float
This function identifies as padded subsequences those where all values == nan from the end of the sequence (last dimension) across all channels, and sets those values to the selected value (default = -1)
Args:
mask = boolean or float mask
value = scalar that will be used to identify padded subsequences
wl = 5
stride = 5
t = np.repeat(np.arange(13).reshape(-1,1), 3, axis=-1)
print('input shape:', t.shape)
X, _ = SlidingWindow(wl, stride=stride, pad_remainder=True, get_y=[])(t)
X = tensor(X)
X[0, 1, -2:] = np.nan
X[1,..., :3] = np.nan
print(X)
identify_padding(torch.isnan(X).float())
input shape: (13, 3)
tensor([[[ 0., 1., 2., 3., 4.],
[ 0., 1., 2., nan, nan],
[ 0., 1., 2., 3., 4.],
[ 0., 0., 0., 0., 0.]],
[[nan, nan, nan, 8., 9.],
[nan, nan, nan, 8., 9.],
[nan, nan, nan, 8., 9.],
[nan, nan, nan, 0., 0.]],
[[10., 11., 12., nan, nan],
[10., 11., 12., nan, nan],
[10., 11., 12., nan, nan],
[ 0., 0., 0., 1., 1.]]])
tensor([[[0., 0., 0., 0., 0.],
[0., 0., 0., 1., 1.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.]],
[[1., 1., 1., 0., 0.],
[1., 1., 1., 0., 0.],
[1., 1., 1., 0., 0.],
[1., 1., 1., 0., 0.]],
[[0., 0., 0., 1., 1.],
[0., 0., 0., 1., 1.],
[0., 0., 0., 1., 1.],
[0., 0., 0., 0., 0.]]])
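As a hypothetical follow-up (not library code), the returned mask can be reduced to a per-timestep boolean padding mask, e.g. to exclude padded steps from a loss:
import torch

def per_step_padding_mask(float_mask, value=-1):
    # A timestep counts as padded when all channels carry the padding value.
    return (float_mask == value).all(dim=1)  # shape: (n_samples, seq_len)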
basic_data_preparation_fn (df, drop_duplicates=True, datetime_col=None, use_index=False, keep='last', add_missing_datetimes=True, freq='1D', method=None, sort_by=None)
Type | Default | Details | |
---|---|---|---|
df | dataframe to preprocess | ||
drop_duplicates | bool | True | flag to indicate if rows with duplicate datetime info should be removed |
datetime_col | NoneType | None | str indicating the name of the column/s that contains the datetime info |
use_index | bool | False | flag to indicate if the datetime info is in the index |
keep | str | last | str to indicate what data should be kept in case of duplicate rows |
add_missing_datetimes | bool | True | flag to indicate if missing datetimes should be added |
freq | str | 1D | str to indicate the frequency used in the datetime info. Used in case missing timestamps exist |
method | NoneType | None | str indicating the method used to fill data for missing timestamps: None, ‘bfill’, ‘ffill’ |
sort_by | NoneType | None | str or list of str indicating how to sort the data. If use_index=True, the index will be used to sort the dataframe. |
df_len = 100
datetime_col = 'datetime'
df = pd.DataFrame(np.arange(df_len), columns=['value'])
df['datetime'] = pd.date_range(pd.to_datetime('1749-03-31'), periods=df_len, freq='1D')
df['type'] = 1
# drop 10 rows at random
df = df.drop(df.sample(10).index)
# add 2 duplicated rows
df = pd.concat([df, df.sample(2)])
display(df)
new_df = basic_data_preparation_fn(df, drop_duplicates=True, datetime_col=datetime_col, use_index=False, keep='last',
add_missing_datetimes=True, freq='1D', method='ffill', sort_by=datetime_col)
display(new_df)
value | datetime | type | |
---|---|---|---|
0 | 0 | 1749-03-31 | 1 |
1 | 1 | 1749-04-01 | 1 |
3 | 3 | 1749-04-03 | 1 |
4 | 4 | 1749-04-04 | 1 |
5 | 5 | 1749-04-05 | 1 |
... | ... | ... | ... |
96 | 96 | 1749-07-05 | 1 |
97 | 97 | 1749-07-06 | 1 |
99 | 99 | 1749-07-08 | 1 |
0 | 0 | 1749-03-31 | 1 |
19 | 19 | 1749-04-19 | 1 |
92 rows × 3 columns
value | datetime | type | |
---|---|---|---|
0 | 0 | 1749-03-31 | 1 |
1 | 1 | 1749-04-01 | 1 |
2 | 1 | 1749-04-02 | 1 |
3 | 3 | 1749-04-03 | 1 |
4 | 4 | 1749-04-04 | 1 |
... | ... | ... | ... |
95 | 95 | 1749-07-04 | 1 |
96 | 96 | 1749-07-05 | 1 |
97 | 97 | 1749-07-06 | 1 |
98 | 97 | 1749-07-07 | 1 |
99 | 99 | 1749-07-08 | 1 |
100 rows × 3 columns
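A quick sanity check of the result (a sketch based on the displayed output):
# The 10 dropped dates were re-inserted and forward-filled, and the duplicates removed.
test_eq(len(new_df), df_len)                       # all 100 dates restored
test_eq(new_df.loc[2, 'value'], 1)                 # 1749-04-02 ffilled from 1749-04-01
assert not new_df['datetime'].duplicated().any()   # duplicated datetimes dropped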
check_safe_conversion (o, dtype='float32', cols=None)
Checks if the conversion to the given dtype is safe
assert check_safe_conversion(-2**11, 'float16') == True
assert check_safe_conversion(-2**11 - 1, 'float16') == False
assert check_safe_conversion(2**24, 'float32') == True
assert check_safe_conversion(2**24+1, 'float32') == False
assert check_safe_conversion(2**53, 'float64') == True
assert check_safe_conversion(2**53+1, 'float64') == False
df = pd.DataFrame({'a': [1, 2, 3], 'b': [2**24, 2**24+1, 2**24+2]})
assert not check_safe_conversion(df, 'float32')
assert check_safe_conversion(df, 'int32')
assert check_safe_conversion(df, 'float32', cols='a')
assert not check_safe_conversion(df, 'float32', cols='b')
-2147483648 1 3 2147483647
-2147483648 16777216 16777218 2147483647
/var/folders/42/4hhwknbd5kzcbq48tmy_gbp00000gn/T/ipykernel_30986/657350933.py:39: UserWarning: Unsafe conversion to float32: {'a': True, 'b': False}
warnings.warn(f"Unsafe conversion to {dtype}: {dict(zip(cols, checks))}")
/var/folders/42/4hhwknbd5kzcbq48tmy_gbp00000gn/T/ipykernel_30986/657350933.py:39: UserWarning: Unsafe conversion to float32: {'b': False}
warnings.warn(f"Unsafe conversion to {dtype}: {dict(zip(cols, checks))}")
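The bounds in the asserts above come from each float type's significand width: float16 carries 11 significand bits, float32 24, and float64 53, so integers are exactly representable only up to 2**11, 2**24 and 2**53 respectively. A standalone numpy check (a sketch, independent of the library):
import numpy as np

# 2**24 is the last integer before float32 starts skipping integers.
assert np.float32(2**24) == 2**24
assert np.float32(2**24 + 1) == np.float32(2**24)  # 16777217 rounds back to 16777216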
fcst_history = 10
fcst_horizon = 5
stride = 1
valid_size=0.2
test_size=0.2
df = pd.DataFrame()
df['target'] = np.arange(50)
X, y = prepare_forecasting_data(df, fcst_history, fcst_horizon)
splits = get_forecasting_splits(df, fcst_history, fcst_horizon, valid_size=valid_size, test_size=test_size, stride=stride, show_plot=False)
assert y[splits[0]][-1][0][-1] == y[splits[1]][0][0][0] - stride
assert y[splits[1]][-1][0][-1] == y[splits[2]][0][0][0] - stride
for s,t in zip(splits, ['\ntrain_split:', '\nvalid_split:', '\ntest_split :']):
print(t)
for xi, yi in zip(X[s], y[s]):
print(xi, yi)
train_split:
[[0 1 2 3 4 5 6 7 8 9]] [[10 11 12 13 14]]
[[ 1 2 3 4 5 6 7 8 9 10]] [[11 12 13 14 15]]
[[ 2 3 4 5 6 7 8 9 10 11]] [[12 13 14 15 16]]
[[ 3 4 5 6 7 8 9 10 11 12]] [[13 14 15 16 17]]
[[ 4 5 6 7 8 9 10 11 12 13]] [[14 15 16 17 18]]
[[ 5 6 7 8 9 10 11 12 13 14]] [[15 16 17 18 19]]
[[ 6 7 8 9 10 11 12 13 14 15]] [[16 17 18 19 20]]
[[ 7 8 9 10 11 12 13 14 15 16]] [[17 18 19 20 21]]
[[ 8 9 10 11 12 13 14 15 16 17]] [[18 19 20 21 22]]
[[ 9 10 11 12 13 14 15 16 17 18]] [[19 20 21 22 23]]
[[10 11 12 13 14 15 16 17 18 19]] [[20 21 22 23 24]]
[[11 12 13 14 15 16 17 18 19 20]] [[21 22 23 24 25]]
[[12 13 14 15 16 17 18 19 20 21]] [[22 23 24 25 26]]
[[13 14 15 16 17 18 19 20 21 22]] [[23 24 25 26 27]]
[[14 15 16 17 18 19 20 21 22 23]] [[24 25 26 27 28]]
[[15 16 17 18 19 20 21 22 23 24]] [[25 26 27 28 29]]
valid_split:
[[20 21 22 23 24 25 26 27 28 29]] [[30 31 32 33 34]]
[[21 22 23 24 25 26 27 28 29 30]] [[31 32 33 34 35]]
[[22 23 24 25 26 27 28 29 30 31]] [[32 33 34 35 36]]
[[23 24 25 26 27 28 29 30 31 32]] [[33 34 35 36 37]]
[[24 25 26 27 28 29 30 31 32 33]] [[34 35 36 37 38]]
[[25 26 27 28 29 30 31 32 33 34]] [[35 36 37 38 39]]
test_split :
[[30 31 32 33 34 35 36 37 38 39]] [[40 41 42 43 44]]
[[31 32 33 34 35 36 37 38 39 40]] [[41 42 43 44 45]]
[[32 33 34 35 36 37 38 39 40 41]] [[42 43 44 45 46]]
[[33 34 35 36 37 38 39 40 41 42]] [[43 44 45 46 47]]
[[34 35 36 37 38 39 40 41 42 43]] [[44 45 46 47 48]]
[[35 36 37 38 39 40 41 42 43 44]] [[45 46 47 48 49]]
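Note the skipped window starts between splits (16-19 and 26-29): dropping fcst_horizon - 1 = 4 windows at each boundary keeps the target ranges disjoint (train targets end at 29, valid covers 30-39, test 40-49), which is what the two asserts above verify. A sanity check of the sizes, based on the printed output:
n_windows = len(df) - fcst_history - fcst_horizon + 1   # 36 candidate windows with stride=1
test_eq(sum(len(s) for s in splits), n_windows - 2 * (fcst_horizon - 1))  # 16 + 6 + 6 = 28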
fcst_history = 10
fcst_horizon = 5
stride = 1
valid_size=0.2
test_size=0.2
df = pd.DataFrame()
df['target'] = np.arange(50)
X, y = prepare_forecasting_data(df, fcst_history, fcst_horizon, x_vars=None, y_vars=[])
splits = get_forecasting_splits(df, fcst_history, fcst_horizon, valid_size=valid_size, test_size=test_size, stride=stride, show_plot=False)
assert y is None
df_len = 100
n_values = 3
datetime_col = 'datetime'
df = pd.DataFrame()
for i in range(n_values):
df[f"value_{i}"] = (np.arange(df_len) * 10**i).astype(np.float32)
display(df)
fcst_history = 10
fcst_horizon = 5
x_vars = df.columns
y_vars = None
dtype = None
X, y = prepare_forecasting_data(df, fcst_history=fcst_history, fcst_horizon=fcst_horizon, x_vars=x_vars, y_vars=y_vars, dtype=dtype)
test_eq(X.shape, (86, 3, 10))
test_eq(y.shape, (86, 3, 5))
test_eq(y[:3, :, 0], X[:3, :, -1] + np.array([1, 10, 100]).reshape(1, 1, -1))
print(X[:3].astype(int))
print(y[:3].astype(int))
value_0 | value_1 | value_2 | |
---|---|---|---|
0 | 0.0 | 0.0 | 0.0 |
1 | 1.0 | 10.0 | 100.0 |
2 | 2.0 | 20.0 | 200.0 |
3 | 3.0 | 30.0 | 300.0 |
4 | 4.0 | 40.0 | 400.0 |
... | ... | ... | ... |
95 | 95.0 | 950.0 | 9500.0 |
96 | 96.0 | 960.0 | 9600.0 |
97 | 97.0 | 970.0 | 9700.0 |
98 | 98.0 | 980.0 | 9800.0 |
99 | 99.0 | 990.0 | 9900.0 |
100 rows × 3 columns
[[[ 0 1 2 3 4 5 6 7 8 9]
[ 0 10 20 30 40 50 60 70 80 90]
[ 0 100 200 300 400 500 600 700 800 900]]
[[ 1 2 3 4 5 6 7 8 9 10]
[ 10 20 30 40 50 60 70 80 90 100]
[ 100 200 300 400 500 600 700 800 900 1000]]
[[ 2 3 4 5 6 7 8 9 10 11]
[ 20 30 40 50 60 70 80 90 100 110]
[ 200 300 400 500 600 700 800 900 1000 1100]]]
[[[ 10 11 12 13 14]
[ 100 110 120 130 140]
[1000 1100 1200 1300 1400]]
[[ 11 12 13 14 15]
[ 110 120 130 140 150]
[1100 1200 1300 1400 1500]]
[[ 12 13 14 15 16]
[ 120 130 140 150 160]
[1200 1300 1400 1500 1600]]]
df_len = 100
n_values = 3
datetime_col = 'datetime'
df = pd.DataFrame()
for i in range(n_values):
df[f"value_{i}"] = (np.arange(df_len) * 10**(i + 1)).astype(np.float32)
df['datetime'] = pd.date_range(pd.to_datetime('1749-03-31'), periods=df_len, freq='1D')
df['type'] = np.random.randint(0, 4, df_len)
df['target'] = np.arange(df_len)
display(df)
fcst_history = 10
fcst_horizon = 5
x_vars = ['value_0', 'value_1', 'value_2', 'target']
y_vars = 'target'
dtype = np.float32
X, y = prepare_forecasting_data(df, fcst_history=fcst_history, fcst_horizon=fcst_horizon, x_vars=x_vars, y_vars=y_vars, dtype=dtype)
test_eq(X.shape, (86, 4, 10))
test_eq(y.shape, (86, 1, 5))
print(X[:3].astype(int))
print(y[:3])
value_0 | value_1 | value_2 | datetime | type | target | |
---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 1749-03-31 | 3 | 0 |
1 | 10.0 | 100.0 | 1000.0 | 1749-04-01 | 1 | 1 |
2 | 20.0 | 200.0 | 2000.0 | 1749-04-02 | 1 | 2 |
3 | 30.0 | 300.0 | 3000.0 | 1749-04-03 | 1 | 3 |
4 | 40.0 | 400.0 | 4000.0 | 1749-04-04 | 2 | 4 |
... | ... | ... | ... | ... | ... | ... |
95 | 950.0 | 9500.0 | 95000.0 | 1749-07-04 | 0 | 95 |
96 | 960.0 | 9600.0 | 96000.0 | 1749-07-05 | 0 | 96 |
97 | 970.0 | 9700.0 | 97000.0 | 1749-07-06 | 3 | 97 |
98 | 980.0 | 9800.0 | 98000.0 | 1749-07-07 | 2 | 98 |
99 | 990.0 | 9900.0 | 99000.0 | 1749-07-08 | 1 | 99 |
100 rows × 6 columns
[[[ 0 10 20 30 40 50 60 70 80 90]
[ 0 100 200 300 400 500 600 700 800 900]
[ 0 1000 2000 3000 4000 5000 6000 7000 8000 9000]
[ 0 1 2 3 4 5 6 7 8 9]]
[[ 10 20 30 40 50 60 70 80 90 100]
[ 100 200 300 400 500 600 700 800 900 1000]
[ 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000]
[ 1 2 3 4 5 6 7 8 9 10]]
[[ 20 30 40 50 60 70 80 90 100 110]
[ 200 300 400 500 600 700 800 900 1000 1100]
[ 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000]
[ 2 3 4 5 6 7 8 9 10 11]]]
[[[10. 11. 12. 13. 14.]]
[[11. 12. 13. 14. 15.]]
[[12. 13. 14. 15. 16.]]]
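Since dtype=np.float32 was passed, both arrays appear to be cast to float32 (the original target column was int), which is why y prints with decimal points:
test_eq(X.dtype, np.float32)
test_eq(y.dtype, np.float32)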
get_today (datetime_format='%Y-%m-%d')
split_fcst_datetime (fcst_datetime)
Defines the fcst start and end dates
Details | |
---|---|
fcst_datetime | str or list of str with datetime |
set_df_datetime (df, datetime_col=None, use_index=False)
Makes sure the datetime column or index is of the right datetime type.
# Test
df_len = 100
n_values = 3
datetime_col = 'datetime'
df = pd.DataFrame()
for i in range(n_values):
df[f"value_{i}"] = (np.arange(df_len) * 10**(i + 1)).astype(np.float32)
df['datetime'] = pd.date_range(pd.to_datetime('1749-03-31'), periods=df_len, freq='1D')
set_df_datetime(df, datetime_col=datetime_col)
test_eq(df['datetime'].dtypes, np.dtype('datetime64[ns]'))
df_index = df.set_index('datetime')
set_df_datetime(df_index, use_index=True)
test_eq(df_index.index.dtype, np.dtype('datetime64[ns]'))
get_df_datetime_bounds (df, datetime_col=None, use_index=False)
Returns the start and end dates used by the forecast
Type | Default | Details | |
---|---|---|---|
df | dataframe containing forecasting data | ||
datetime_col | NoneType | None | str data column containing the datetime |
use_index | bool | False | flag to indicate if the index should be used instead of a datetime column |
# Test
df_len = 100
n_values = 3
datetime_col = 'datetime'
df = pd.DataFrame()
for i in range(n_values):
df[f"value_{i}"] = (np.arange(df_len) * 10**(i + 1)).astype(np.float32)
df['datetime'] = pd.date_range(pd.to_datetime('1749-03-31'), periods=df_len, freq='1D')
test_eq(get_df_datetime_bounds(df, datetime_col=datetime_col), (df['datetime'].min(), df['datetime'].max()))
df_index = df.set_index('datetime')
test_eq(get_df_datetime_bounds(df_index, use_index=True), (df_index.index.min(), df_index.index.max()))
get_fcst_bounds (df, fcst_datetime, fcst_history=None, fcst_horizon=None, freq='D', datetime_format='%Y-%m-%d', datetime_col=None, use_index=False)
Returns the start and end datetimes used by the forecast
Type | Default | Details | |
---|---|---|---|
df | dataframe containing forecasting data | ||
fcst_datetime | datetime for which a fcst is created. Optionally a tuple of datetimes if the fcst is created for a range of dates. | |
fcst_history | NoneType | None | # steps used as input |
fcst_horizon | NoneType | None | # predicted steps |
freq | str | D | datetime unit. May contain letters only or a combination of an int + letters, e.g. “7D” |
datetime_format | str | %Y-%m-%d | format used to convert “today” |
datetime_col | NoneType | None | str data column containing the datetime |
use_index | bool | False | flag to indicate if the index should be used instead of a datetime column |
# Test
df_len = 100
n_values = 3
datetime_col = 'datetime'
df = pd.DataFrame()
for i in range(n_values):
df[f"value_{i}"] = (np.arange(df_len) * 10**(i + 1)).astype(np.float32)
freq = "7D"
today = pd.Timestamp(get_today()).floor(freq)
df['datetime'] = pd.date_range(None, today, periods=df_len, freq=freq)
display(df)
max_dt = pd.Timestamp(df['datetime'].max()).floor(freq)
fcst_history = 30
fcst_horizon = 10
fcst_datetime = max_dt - timedelta(weeks=fcst_horizon)
print('fcst_datetime :', fcst_datetime)
start_datetime, end_datetime = get_fcst_bounds(df, fcst_datetime, datetime_col=datetime_col, fcst_history=fcst_history, fcst_horizon=fcst_horizon, freq=freq)
print('start_datetime:', start_datetime)
print('end_datetime :', end_datetime)
dates = pd.date_range(start_datetime, end_datetime, freq=freq)
print(dates)
test_eq(len(dates), fcst_history + fcst_horizon)
test_eq(end_datetime, max_dt)
value_0 | value_1 | value_2 | datetime | |
---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 2021-11-25 |
1 | 10.0 | 100.0 | 1000.0 | 2021-12-02 |
2 | 20.0 | 200.0 | 2000.0 | 2021-12-09 |
3 | 30.0 | 300.0 | 3000.0 | 2021-12-16 |
4 | 40.0 | 400.0 | 4000.0 | 2021-12-23 |
... | ... | ... | ... | ... |
95 | 950.0 | 9500.0 | 95000.0 | 2023-09-21 |
96 | 960.0 | 9600.0 | 96000.0 | 2023-09-28 |
97 | 970.0 | 9700.0 | 97000.0 | 2023-10-05 |
98 | 980.0 | 9800.0 | 98000.0 | 2023-10-12 |
99 | 990.0 | 9900.0 | 99000.0 | 2023-10-19 |
100 rows × 4 columns
fcst_datetime : 2023-08-10 00:00:00
start_datetime: 2023-01-19 00:00:00
end_datetime : 2023-10-19 00:00:00
DatetimeIndex(['2023-01-19', '2023-01-26', '2023-02-02', '2023-02-09',
'2023-02-16', '2023-02-23', '2023-03-02', '2023-03-09',
'2023-03-16', '2023-03-23', '2023-03-30', '2023-04-06',
'2023-04-13', '2023-04-20', '2023-04-27', '2023-05-04',
'2023-05-11', '2023-05-18', '2023-05-25', '2023-06-01',
'2023-06-08', '2023-06-15', '2023-06-22', '2023-06-29',
'2023-07-06', '2023-07-13', '2023-07-20', '2023-07-27',
'2023-08-03', '2023-08-10', '2023-08-17', '2023-08-24',
'2023-08-31', '2023-09-07', '2023-09-14', '2023-09-21',
'2023-09-28', '2023-10-05', '2023-10-12', '2023-10-19'],
dtype='datetime64[ns]', freq='7D')
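From the printed output, the returned bounds appear to satisfy start = fcst_datetime - (fcst_history - 1) * freq and end = fcst_datetime + fcst_horizon * freq, i.e. the history window ends at (and includes) fcst_datetime while the horizon extends beyond it. A sketch checking that assumption:
from pandas.tseries.frequencies import to_offset

test_eq(start_datetime, fcst_datetime - (fcst_history - 1) * to_offset(freq))
test_eq(end_datetime, fcst_datetime + fcst_horizon * to_offset(freq))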
filter_df_by_datetime (df, start_datetime=None, end_datetime=None, datetime_col=None, use_index=False)
Type | Default | Details | |
---|---|---|---|
df | dataframe containing forecasting data | ||
start_datetime | NoneType | None | lower datetime bound |
end_datetime | NoneType | None | upper datetime bound |
datetime_col | NoneType | None | str data column containing the datetime |
use_index | bool | False | flag to indicate if the index should be used instead of a datetime column |
# Test
df_len = 100
n_values = 3
datetime_col = 'datetime'
df = pd.DataFrame()
for i in range(n_values):
df[f"value_{i}"] = (np.arange(df_len) * 10**(i + 1)).astype(np.float32)
freq = "7D"
df['datetime'] = pd.date_range(None, pd.Timestamp(get_today()).floor(freq), periods=df_len, freq=freq)
display(df)
max_dt = pd.Timestamp(df['datetime'].max()).floor(freq)
fcst_history = 30
fcst_horizon = 10
fcst_datetime = max_dt - timedelta(weeks=fcst_horizon)  # anchor the fcst fcst_horizon weeks before the last timestamp
start_datetime, end_datetime = get_fcst_bounds(df, fcst_datetime, datetime_col=datetime_col, fcst_history=fcst_history, fcst_horizon=fcst_horizon, freq=freq)
test_eq(len(filter_df_by_datetime(df, start_datetime=start_datetime, end_datetime=end_datetime, datetime_col=datetime_col)), fcst_history + fcst_horizon)
value_0 | value_1 | value_2 | datetime | |
---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 2021-11-25 |
1 | 10.0 | 100.0 | 1000.0 | 2021-12-02 |
2 | 20.0 | 200.0 | 2000.0 | 2021-12-09 |
3 | 30.0 | 300.0 | 3000.0 | 2021-12-16 |
4 | 40.0 | 400.0 | 4000.0 | 2021-12-23 |
... | ... | ... | ... | ... |
95 | 950.0 | 9500.0 | 95000.0 | 2023-09-21 |
96 | 960.0 | 9600.0 | 96000.0 | 2023-09-28 |
97 | 970.0 | 9700.0 | 97000.0 | 2023-10-05 |
98 | 980.0 | 9800.0 | 98000.0 | 2023-10-12 |
99 | 990.0 | 9900.0 | 99000.0 | 2023-10-19 |
100 rows × 4 columns
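For the datetime_col case, filter_df_by_datetime appears to behave like a boolean filter inclusive of both bounds; a sketch under that assumption:
manual = df[(df[datetime_col] >= start_datetime) & (df[datetime_col] <= end_datetime)]
test_eq(len(manual), fcst_history + fcst_horizon)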
get_fcst_data_from_df (df, fcst_datetime, fcst_history=None, fcst_horizon=None, freq='D', datetime_format='%Y-%m-%d', datetime_col=None, use_index=False)
Gets forecasting data from a dataframe
Type | Default | Details | |
---|---|---|---|
df | dataframe containing forecasting data | ||
fcst_datetime | datetime for which a fcst is created. Optionally a tuple of datetimes if the fcst is created for a range of dates. | |
fcst_history | NoneType | None | # steps used as input |
fcst_horizon | NoneType | None | # predicted steps |
freq | str | D | datetime unit. May contain letters only or a combination of an int + letters, e.g. “7D” |
datetime_format | str | %Y-%m-%d | format used to convert “today” |
datetime_col | NoneType | None | str data column containing the datetime |
use_index | bool | False | flag to indicate if the index should be used instead of a datetime column |
# Test
df_len = 100
n_values = 3
datetime_col = 'datetime'
df = pd.DataFrame()
for i in range(n_values):
df[f"value_{i}"] = (np.arange(df_len) * 10**(i + 1)).astype(np.float32)
freq = "7D"
df['datetime'] = pd.date_range(None, pd.Timestamp(get_today()).floor(freq), periods=df_len, freq=freq)
display(df)
max_dt = pd.Timestamp(df['datetime'].max()).floor(freq)
fcst_history = 30
fcst_horizon = 10
fcst_datetime = max_dt - timedelta(weeks=fcst_horizon)  # anchor the fcst fcst_horizon weeks before the last timestamp
test_eq(len(get_fcst_data_from_df(df, fcst_datetime, fcst_history=fcst_history, fcst_horizon=fcst_horizon, freq=freq, datetime_col=datetime_col)),
fcst_history + fcst_horizon)
value_0 | value_1 | value_2 | datetime | |
---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 2021-11-25 |
1 | 10.0 | 100.0 | 1000.0 | 2021-12-02 |
2 | 20.0 | 200.0 | 2000.0 | 2021-12-09 |
3 | 30.0 | 300.0 | 3000.0 | 2021-12-16 |
4 | 40.0 | 400.0 | 4000.0 | 2021-12-23 |
... | ... | ... | ... | ... |
95 | 950.0 | 9500.0 | 95000.0 | 2023-09-21 |
96 | 960.0 | 9600.0 | 96000.0 | 2023-09-28 |
97 | 970.0 | 9700.0 | 97000.0 | 2023-10-05 |
98 | 980.0 | 9800.0 | 98000.0 | 2023-10-12 |
99 | 990.0 | 9900.0 | 99000.0 | 2023-10-19 |
100 rows × 4 columns
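get_fcst_data_from_df appears to compose get_fcst_bounds and filter_df_by_datetime; a sketch expressing that assumption:
# Assumed composition of the two helpers documented above:
start, end = get_fcst_bounds(df, fcst_datetime, fcst_history=fcst_history, fcst_horizon=fcst_horizon,
                             freq=freq, datetime_col=datetime_col)
fcst_df = get_fcst_data_from_df(df, fcst_datetime, fcst_history=fcst_history, fcst_horizon=fcst_horizon,
                                freq=freq, datetime_col=datetime_col)
assert fcst_df.equals(filter_df_by_datetime(df, start_datetime=start, end_datetime=end, datetime_col=datetime_col))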