External data

Helper functions used to download and extract common time series datasets.


source

decompress_from_url


def decompress_from_url(
    url, target_dir:NoneType=None, verbose:bool=False
):

Call self as a function.


source

download_data


def download_data(
    url, fname:NoneType=None, c_key:str='archive', force_download:bool=False, timeout:int=4, verbose:bool=False
):

Download url to fname.


source

get_UCR_univariate_list


def get_UCR_univariate_list(
    
):

Call self as a function.


source

get_UCR_multivariate_list


def get_UCR_multivariate_list(
    
):

Call self as a function.


source

get_UCR_data


def get_UCR_data(
    dsid, path:str='.', parent_dir:str='data/UCR', on_disk:bool=True, mode:str='c', Xdtype:str='float32',
    ydtype:NoneType=None, return_split:bool=True, split_data:bool=True, force_download:bool=False,
    verbose:bool=False
):

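Retrieve the UCR dataset `dsid`, storing it as `.npy` files under `{path}/{parent_dir}/{dsid}`. With `split_data=True` it returns `X_train, y_train, X_valid, y_valid`; with `split_data=False` it returns `X, y, splits`. With `on_disk=True` the arrays are memory-mapped (`memmap`); with `on_disk=False` they are regular in-memory `ndarray`s.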

from fastai.data.transforms import get_files
# Round-trip check for get_UCR_data: wipe the local copy, fetch the dataset, fetch it
# again, and verify both what is left on disk and what is returned.
PATH = Path('.')
dsids = ['ECGFiveDays', 'AtrialFibrillation'] # univariate and multivariate
for dsid in dsids:
    print(dsid)
    tgt_dir = PATH/f'data/UCR/{dsid}'
    # Remove any copy left by a previous run so the first call cannot reuse it.
    if os.path.isdir(tgt_dir): shutil.rmtree(tgt_dir)
    test_eq(len(get_files(tgt_dir)), 0) # no file left
    # First call on an empty directory: exactly 6 .npy files and nothing else may remain.
    X_train, y_train, X_valid, y_valid = get_UCR_data(dsid)
    test_eq(len(get_files(tgt_dir, '.npy')), 6)
    test_eq(len(get_files(tgt_dir, '.npy')), len(get_files(tgt_dir))) # test no left file/ dir
    del X_train, y_train, X_valid, y_valid
    # Second call with the files already in place: same outputs, same directory content.
    X_train, y_train, X_valid, y_valid = get_UCR_data(dsid)
    test_eq(X_train.ndim, 3)
    test_eq(y_train.ndim, 1)
    test_eq(X_valid.ndim, 3)
    test_eq(y_valid.ndim, 1)
    test_eq(len(get_files(tgt_dir, '.npy')), 6)
    test_eq(len(get_files(tgt_dir, '.npy')), len(get_files(tgt_dir))) # test no left file/ dir
    # With the default on_disk=True, X is a float32 memory-mapped array.
    test_eq(X_train.dtype, np.float32)
    test_eq(X_train.__class__.__name__, 'memmap')
    del X_train, y_train, X_valid, y_valid
    # on_disk=False loads the data into a regular in-memory array instead.
    X_train, y_train, X_valid, y_valid = get_UCR_data(dsid, on_disk=False)
    test_eq(X_train.__class__.__name__, 'ndarray')
    del X_train, y_train, X_valid, y_valid
ECGFiveDays
AtrialFibrillation
# Consistency check between the two return modes of get_UCR_data.
X_train, y_train, X_valid, y_valid = get_UCR_data('natops')
dsid = 'natops'
X_train, y_train, X_valid, y_valid = get_UCR_data(dsid, verbose=True)
X, y, splits = get_UCR_data(dsid, split_data=False)
# Indexing the unsplit arrays with each split must reproduce the pre-split arrays,
# for both X and y and for both the train (0) and valid (1) splits.
test_eq(X[splits[0]], X_train)
test_eq(y[splits[0]], y_train)
test_eq(X[splits[1]], X_valid)
test_eq(y[splits[1]], y_valid)
# Both modes must return the same array types.
test_type(X, X_train)
test_type(y, y_train)
Dataset: NATOPS
X_train: (180, 24, 51)
y_train: (180,)
X_valid: (180, 24, 51)
y_valid: (180,) 

source

check_data


def check_data(
    X, y:NoneType=None, splits:NoneType=None, show_plot:bool=True
):

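Print a summary of `X`, `y` and `splits`: shape, type, dtype and number of nan values for `X`; shape, type, dtype, class counts (for non-numeric targets) and nan information for `y`; number of splits, their sizes and whether they overlap for `splits`. A warning is issued when `y` contains nan values.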

dsid = 'ECGFiveDays'
# Load the whole dataset in memory as a single X/y pair plus the splits returned by get_UCR_data.
X, y, splits = get_UCR_data(dsid, split_data=False, on_disk=False, force_download=False)
check_data(X, y, splits)
# Same report with the feature axis dropped: X[:, 0] is 2D (samples x timesteps).
check_data(X[:, 0], y, splits)
# Numeric targets: once y is cast to float32 the report no longer lists classes.
y = y.astype(np.float32)
check_data(X, y, splits)
# Missing targets: nan values in y are counted and a UserWarning is issued.
y[:10] = np.nan
check_data(X[:, 0], y, splits)
# Reload the original labels (y was overwritten above) and replace the splits with 3 generated ones.
X, y, splits = get_UCR_data(dsid, split_data=False, on_disk=False, force_download=False)
splits = get_splits(y, 3)
check_data(X, y, splits)
check_data(X[:, 0], y, splits)
# y holds 1-character strings here (dtype <U1), so the nan is stored as the text 'n'
# and shows up in the report as a third class rather than as a missing value.
y[:5]= np.nan
check_data(X[:, 0], y, splits)
# Reload once more — presumably so the modified y does not leak into later cells.
X, y, splits = get_UCR_data(dsid, split_data=False, on_disk=False, force_download=False)
X      - shape: [884 samples x 1 features x 136 timesteps]  type: ndarray  dtype:float32  isnan: 0
y      - shape: (884,)  type: ndarray  dtype:<U1  n_classes: 2 (442 samples per class) ['1', '2']  isnan: False
splits - n_splits: 2 shape: [23, 861]  overlap: False

X      - shape: (884, 136)  type: ndarray  dtype:float32  isnan: 0
y      - shape: (884,)  type: ndarray  dtype:<U1  n_classes: 2 (442 samples per class) ['1', '2']  isnan: False
splits - n_splits: 2 shape: [23, 861]  overlap: False

X      - shape: [884 samples x 1 features x 136 timesteps]  type: ndarray  dtype:float32  isnan: 0
y      - shape: (884,)  type: ndarray  dtype:float32  isnan: 0
splits - n_splits: 2 shape: [23, 861]  overlap: False

X      - shape: (884, 136)  type: ndarray  dtype:float32  isnan: 0
y      - shape: (884,)  type: ndarray  dtype:float32  isnan: 10
splits - n_splits: 2 shape: [23, 861]  overlap: False
/var/folders/yw/1vck7tm93_z1z0bftrw65hbw0000gn/T/ipykernel_97892/3275116464.py:23: UserWarning: y contains nan values
  warnings.warn('y contains nan values')

X      - shape: [884 samples x 1 features x 136 timesteps]  type: ndarray  dtype:float32  isnan: 0
y      - shape: (884,)  type: ndarray  dtype:<U1  n_classes: 2 (442 samples per class) ['1', '2']  isnan: False
splits - n_splits: 3 shape: [[589, 295], [589, 295], [590, 294]]  overlap: [False, False, False]

X      - shape: (884, 136)  type: ndarray  dtype:float32  isnan: 0
y      - shape: (884,)  type: ndarray  dtype:<U1  n_classes: 2 (442 samples per class) ['1', '2']  isnan: False
splits - n_splits: 3 shape: [[589, 295], [589, 295], [590, 294]]  overlap: [False, False, False]

X      - shape: (884, 136)  type: ndarray  dtype:float32  isnan: 0
y      - shape: (884,)  type: ndarray  dtype:<U1  n_classes: 3 (294 samples per class) ['1', '2', 'n']  isnan: False
splits - n_splits: 3 shape: [[589, 295], [589, 295], [590, 294]]  overlap: [False, False, False]


source

get_Monash_regression_list


def get_Monash_regression_list(
    
):

Call self as a function.


source

get_Monash_regression_data


def get_Monash_regression_data(
    dsid, path:str='./data/Monash', on_disk:bool=True, mode:str='c', Xdtype:str='float32', ydtype:NoneType=None,
    split_data:bool=True, force_download:bool=False, verbose:bool=False, timeout:int=4
):

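Retrieve the Monash regression dataset `dsid`. With `split_data=True` it returns `X_train, y_train, X_valid, y_valid`; with `split_data=False` it returns `X, y, splits`. The example below guards its shape checks with `is not None`, so callers should be prepared for `None` results.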

dsid = "Covid3Month"
# split_data=True returns four arrays; split_data=False returns X, y and the splits.
X_train, y_train, X_valid, y_valid = get_Monash_regression_data(dsid, on_disk=False, split_data=True, force_download=False)
X, y, splits = get_Monash_regression_data(dsid, on_disk=True, split_data=False, force_download=False, verbose=True)
# The shape checks are guarded because the results may be None
# (presumably when the data cannot be downloaded — TODO confirm).
if X_train is not None:
    test_eq(X_train.shape, (140, 1, 84))
if X is not None:
    test_eq(X.shape, (201, 1, 84))
Dataset: Covid3Month
X      : (201, 1, 84)
y      : (201,)
splits : [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26), np.int64(27), np.int64(28), np.int64(29), np.int64(30), np.int64(31), np.int64(32), np.int64(33), np.int64(34), np.int64(35), np.int64(36), np.int64(37), np.int64(38), np.int64(39), np.int64(40), np.int64(41), np.int64(42), np.int64(43), np.int64(44), np.int64(45), np.int64(46), np.int64(47), np.int64(48), np.int64(49), np.int64(50), np.int64(51), np.int64(52), np.int64(53), np.int64(54), np.int64(55), np.int64(56), np.int64(57), np.int64(58), np.int64(59), np.int64(60), np.int64(61), np.int64(62), np.int64(63), np.int64(64), np.int64(65), np.int64(66), np.int64(67), np.int64(68), np.int64(69), np.int64(70), np.int64(71), np.int64(72), np.int64(73), np.int64(74), np.int64(75), np.int64(76), np.int64(77), np.int64(78), np.int64(79), np.int64(80), np.int64(81), np.int64(82), np.int64(83), np.int64(84), np.int64(85), np.int64(86), np.int64(87), np.int64(88), np.int64(89), np.int64(90), np.int64(91), np.int64(92), np.int64(93), np.int64(94), np.int64(95), np.int64(96), np.int64(97), np.int64(98), np.int64(99), np.int64(100), np.int64(101), np.int64(102), np.int64(103), np.int64(104), np.int64(105), np.int64(106), np.int64(107), np.int64(108), np.int64(109), np.int64(110), np.int64(111), np.int64(112), np.int64(113), np.int64(114), np.int64(115), np.int64(116), np.int64(117), np.int64(118), np.int64(119), np.int64(120), np.int64(121), np.int64(122), np.int64(123), np.int64(124), np.int64(125), np.int64(126), np.int64(127), np.int64(128), np.int64(129), np.int64(130), np.int64(131), np.int64(132), np.int64(133), np.int64(134), np.int64(135), np.int64(136), np.int64(137), np.int64(138), np.int64(139)] 
[np.int64(140), np.int64(141), np.int64(142), np.int64(143), np.int64(144), np.int64(145), np.int64(146), np.int64(147), np.int64(148), np.int64(149), np.int64(150), np.int64(151), np.int64(152), np.int64(153), np.int64(154), np.int64(155), np.int64(156), np.int64(157), np.int64(158), np.int64(159), np.int64(160), np.int64(161), np.int64(162), np.int64(163), np.int64(164), np.int64(165), np.int64(166), np.int64(167), np.int64(168), np.int64(169), np.int64(170), np.int64(171), np.int64(172), np.int64(173), np.int64(174), np.int64(175), np.int64(176), np.int64(177), np.int64(178), np.int64(179), np.int64(180), np.int64(181), np.int64(182), np.int64(183), np.int64(184), np.int64(185), np.int64(186), np.int64(187), np.int64(188), np.int64(189), np.int64(190), np.int64(191), np.int64(192), np.int64(193), np.int64(194), np.int64(195), np.int64(196), np.int64(197), np.int64(198), np.int64(199), np.int64(200)] 

source

get_forecasting_list


def get_forecasting_list(
    
):

Call self as a function.


source

get_forecasting_time_series


def get_forecasting_time_series(
    dsid, path:str='./data/forecasting/', force_download:bool=False, verbose:bool=True, **kwargs
):

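Download the forecasting dataset `dsid` into `path` and return it as a pandas dataframe. The example below guards against a `None` result for some datasets.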

# Fetch the sunspots series and check its length; the bare `ts` on the last line
# displays the resulting dataframe in the notebook.
ts = get_forecasting_time_series("sunspots", force_download=False)
test_eq(len(ts), 2820)
ts
Dataset: Sunspots
downloading data...
...done. Path = data/forecasting/Sunspots.csv
Sunspots
Month
1749-01-31 58.0
1749-02-28 62.6
1749-03-31 70.0
1749-04-30 55.7
1749-05-31 85.0
... ...
1983-08-31 71.8
1983-09-30 50.3
1983-10-31 55.8
1983-11-30 33.3
1983-12-31 33.4

2820 rows × 1 columns

# Fetch the weather dataset; the checks are skipped when nothing is returned
# (presumably when the download is unavailable — TODO confirm).
ts = get_forecasting_time_series("weather", force_download=False)
if ts is not None:
    test_eq(len(ts), 70091)
    display(ts)
Dataset: Weather
downloading data...
...done. Path = data/forecasting/Weather.csv.zip
p (mbar) T (degC) Tpot (K) Tdew (degC) rh (%) VPmax (mbar) VPact (mbar) VPdef (mbar) sh (g/kg) H2OC (mmol/mol) rho (g/m**3) Wx Wy max Wx max Wy Day sin Day cos Year sin Year cos
0 996.50 -8.05 265.38 -8.78 94.40 3.33 3.14 0.19 1.96 3.15 1307.86 -0.204862 -0.046168 -0.614587 -0.138503 -1.776611e-12 1.000000 0.009332 0.999956
1 996.62 -8.88 264.54 -9.77 93.20 3.12 2.90 0.21 1.81 2.91 1312.25 -0.245971 -0.044701 -0.619848 -0.112645 2.588190e-01 0.965926 0.010049 0.999950
2 996.84 -8.81 264.59 -9.66 93.50 3.13 2.93 0.20 1.83 2.94 1312.18 -0.175527 0.039879 -0.614344 0.139576 5.000000e-01 0.866025 0.010766 0.999942
3 996.99 -9.05 264.34 -10.02 92.60 3.07 2.85 0.23 1.78 2.85 1313.61 -0.050000 -0.086603 -0.190000 -0.329090 7.071068e-01 0.707107 0.011483 0.999934
4 997.46 -9.63 263.72 -10.65 92.20 2.94 2.71 0.23 1.69 2.71 1317.19 -0.368202 0.156292 -0.810044 0.343843 8.660254e-01 0.500000 0.012199 0.999926
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
70086 1002.18 -0.98 272.01 -5.36 72.00 5.69 4.09 1.59 2.54 4.08 1280.70 -0.855154 -0.160038 -1.336792 -0.250174 -9.990482e-01 0.043619 0.006183 0.999981
70087 1001.40 -1.40 271.66 -6.84 66.29 5.51 3.65 1.86 2.27 3.65 1281.87 -0.716196 -0.726267 -1.348134 -1.367090 -9.537170e-01 0.300706 0.006900 0.999976
70088 1001.19 -2.75 270.32 -6.90 72.90 4.99 3.64 1.35 2.26 3.63 1288.02 -0.661501 0.257908 -1.453438 0.566672 -8.433914e-01 0.537300 0.007617 0.999971
70089 1000.65 -2.89 270.22 -7.15 72.30 4.93 3.57 1.37 2.22 3.57 1288.03 -0.280621 -0.209169 -0.545207 -0.406385 -6.755902e-01 0.737277 0.008334 0.999965
70090 1000.11 -3.93 269.23 -8.09 72.60 4.56 3.31 1.25 2.06 3.31 1292.41 -0.516998 -0.215205 -0.923210 -0.384295 -4.617486e-01 0.887011 0.009050 0.999959

70091 rows × 19 columns


source

convert_tsf_to_dataframe


def convert_tsf_to_dataframe(
    full_file_path_and_name, replace_missing_vals_with:str='NaN', value_column_name:str='series_value'
):

Call self as a function.


source

get_Monash_forecasting_data


def get_Monash_forecasting_data(
    dsid, path:str='./data/forecasting/', force_download:bool=False, remove_from_disk:bool=False,
    add_timestamp:bool=True, verbose:bool=True
):

Call self as a function.


source

get_fcst_horizon


def get_fcst_horizon(
    frequency, dsid
):

Call self as a function.


source

preprocess_Monash_df


def preprocess_Monash_df(
    df, frequency
):

Call self as a function.

dsid = 'covid_deaths_dataset'
# force_download=True requests a fresh download even if a local copy exists.
df = get_Monash_forecasting_data(dsid, force_download=True)
# The shape check is skipped when no dataframe is returned.
if df is not None:
    test_eq(df.shape, (56392, 3))

source

download_all_long_term_forecasting_data


def download_all_long_term_forecasting_data(
    target_dir:str='./data/long_forecasting/', force_download:bool=False, remove_zip:bool=False, c_key:str='archive',
    timeout:int=4, verbose:bool=True
):

Call self as a function.


source

unzip_file


def unzip_file(
    file, target_dir
):

Call self as a function.


source

get_long_term_forecasting_data


def get_long_term_forecasting_data(
    dsid, # ID of the dataset to be used for long-term forecasting.
    target_dir:str='./data/long_forecasting/', # Directory where the long-term forecasting data will be saved.
    task:str='M', # 'M' for multivariate, 'S' for univariate and 'MS' for multivariate input with univariate output
    fcst_horizon:NoneType=None, # Number of steps forecasted into the future. If None, the minimum default is applied.
    fcst_history:NoneType=None, # Number of historical steps used as input. If None, the default is applied.
    preprocess:bool=True, # Flag that indicates whether the data is preprocessed before saving.
    force_download:bool=False, # Flag that indicates if the data should be downloaded again even if directory exists.
    remove_zip:bool=False, # Flag that indicates if the zip file should be removed after extracting the data.
    return_df:bool=True, # Flag that indicates whether a dataframe (True) or X and y arrays (False) are returned.
    show_plot:bool=True, # plot the splits
    dtype:type=float32, verbose:bool=True, # Flag to indicate the verbosity.
    **kwargs
):

Downloads (and preprocesses) a pandas dataframe with the requested long-term forecasting dataset.

dsid = "ILI"
# Best-effort demo: any failure (e.g. a bad download) is printed instead of raised,
# so the notebook keeps running. With the default return_df=True a dataframe is returned.
try:
    df = get_long_term_forecasting_data(dsid, target_dir='./data/forecasting/', force_download=False)
    print(f"{dsid:15}: {str(df.shape):15}")
    # Free the dataframe and clean up the download directory.
    # NOTE(review): meaning of the second remove_dir argument is not visible here — confirm.
    del df; gc.collect()
    remove_dir('./data/forecasting/', False)
except Exception as e:
    print(f"{dsid:15}: {str(e):15}")
113.45% [8192/7221 00:00<00:00]
ILI            : File is not a zip file
dsid = "ILI"
# Same best-effort demo with return_df=False: the call returns X, y, splits and stats
# instead of a dataframe; show_plot=False skips plotting the splits.
try:
    X, y, splits, stats = get_long_term_forecasting_data(dsid, target_dir='./data/forecasting/', force_download=False, return_df=False, show_plot=False)
    print(f"{dsid:15} -  X.shape: {str(X.shape):20}  y.shape: {str(y.shape):20}  splits: {str([len(s) for s in splits]):25}  \
stats: {str([s.shape for s in stats]):30}")
    # Free the arrays and clean up the download directory.
    # NOTE(review): meaning of the second remove_dir argument is not visible here — confirm.
    del X, y, splits, stats
    gc.collect()
    remove_dir('./data/forecasting/', False)
except Exception as e:
    # Any failure is printed instead of raised so the notebook keeps running.
    print(f"{dsid:15}: {str(e):15}")
113.45% [8192/7221 00:00<00:00]
ILI            : File is not a zip file