Mixed data

A DataLoader that can take data from multiple dataloaders holding different types of data.


source

MixedDataLoaders


def MixedDataLoaders(
    loaders:VAR_POSITIONAL, # `DataLoader` objects to wrap
    path:str | Path='.', # Path to store export objects
    device:NoneType=None, # Device to put `DataLoaders`
):

Basic wrapper around several DataLoaders.


source

MixedDataLoader


def MixedDataLoader(
    loaders:VAR_POSITIONAL, path:str='.', shuffle:bool=False, device:NoneType=None, bs:NoneType=None
):

Accepts any number of `DataLoader` objects and a device.


source

get_mixed_dls


def get_mixed_dls(
    dls:VAR_POSITIONAL, device:NoneType=None, shuffle_train:NoneType=None, shuffle_valid:NoneType=None,
    kwargs:VAR_KEYWORD
):

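Combines several `DataLoaders` objects into a single one whose `train` and `valid` loaders yield batches containing the inputs of every wrapped loader (see the examples below).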

from tsai.data.tabular import *
# Example 1: two tabular dataloaders built from the same DataFrame.
# Fetch the ADULT_SAMPLE dataset with `untar_data` and read its CSV file.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
# df['salary'] = np.random.rand(len(df)) # uncomment to simulate a cont dependent variable
target = 'salary'
# One set of splits is computed here and passed to both `get_tabular_dls` calls below.
splits = RandomSplitter()(range_of(df))

# First dataloaders: three categorical columns and two continuous columns, bs=512.
cat_names = ['workclass', 'education', 'marital-status']
cont_names = ['age', 'fnlwgt']
dls1 = get_tabular_dls(df, cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=512)
dls1.show_batch()

# Second dataloaders: no categorical columns and one continuous column, bs=128.
# The alternative categorical column list is kept as a trailing comment for reference.
cat_names = None #['occupation', 'relationship', 'race']
cont_names = ['education-num']
dls2 = get_tabular_dls(df, cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=128)
dls2.show_batch()
workclass education marital-status age fnlwgt salary
0 Private Some-college Never-married 22.000000 133832.998222 <50k
1 State-gov HS-grad Separated 46.000000 111163.001785 <50k
2 ? Masters Never-married 50.000000 22427.997140 <50k
3 Self-emp-not-inc HS-grad Married-civ-spouse 43.000000 38876.004820 <50k
4 Private 7th-8th Never-married 49.000000 240868.998836 <50k
5 ? HS-grad Widowed 73.000000 200878.000171 <50k
6 Private HS-grad Married-civ-spouse 38.000000 103323.001572 <50k
7 Private Bachelors Never-married 45.000000 51744.000875 <50k
8 Private HS-grad Divorced 28.000000 430084.000496 <50k
9 Self-emp-not-inc 7th-8th Married-civ-spouse 60.000001 127805.000397 >=50k
education-num_na education-num salary
0 False 9.0 <50k
1 False 9.0 <50k
2 False 13.0 <50k
3 False 10.0 <50k
4 False 5.0 <50k
5 False 10.0 <50k
6 False 2.0 <50k
7 False 8.0 <50k
8 False 6.0 >=50k
9 False 10.0 >=50k
# Wrap both tabular dataloaders in a single mixed one, asking for a batch size of 8.
dls = get_mixed_dls(dls1, dls2, bs=8)
# Draw the first batch from each split.
first(dls.train)
first(dls.valid)
# Round trip: save the whole mixed dataloaders object, delete it, then load it back.
torch.save(dls,'export/mixed_dls.pth')
del dls
# `weights_only=False` is passed because the file holds a full Python object, not just tensors.
dls = torch.load('export/mixed_dls.pth', weights_only=False)
# The reloaded object is still usable: show one training batch per wrapped loader.
dls.train.show_batch()
workclass education marital-status age fnlwgt salary
0 Self-emp-not-inc 7th-8th Married-civ-spouse 45.0 285335.001819 <50k
1 Private HS-grad Married-civ-spouse 32.0 178615.000066 <50k
2 Private Some-college Never-married 25.0 111058.002708 <50k
3 Private HS-grad Married-civ-spouse 57.0 178352.999653 >=50k
4 Private HS-grad Never-married 35.0 136343.002194 <50k
5 Private HS-grad Married-civ-spouse 53.0 104879.001541 >=50k
6 Private HS-grad Never-married 31.0 61558.998090 <50k
7 Private Some-college Never-married 28.0 185126.999910 <50k
education-num_na education-num salary
0 False 10.0 <50k
1 False 9.0 <50k
2 False 9.0 <50k
3 False 10.0 <50k
4 False 9.0 <50k
5 False 9.0 <50k
6 False 11.0 <50k
7 False 7.0 <50k
from tsai.data.validation import TimeSplitter
from tsai.data.core import TSRegression, get_ts_dls
# Example 2: a toy time-series dataset whose values make every sample identifiable.
# X has shape (16, 2, 5); every value of sample i is equal to i.
X = np.repeat(np.repeat(np.arange(16)[:, None, None], 2, 1), 5, 2).astype(float)
# Targets are 0..7 followed by 0..7 again (len(X)//2 == 8).
y = np.concatenate([np.arange(len(X)//2)]*2)
alphabet = np.array(list(string.ascii_lowercase))
# y = alphabet[y]
# These splits are reused by the tabular dataloaders created in the following cell.
splits = TimeSplitter(.5, show_plot=False)(range_of(X))
# No transform on the inputs, `TSRegression` on the targets.
tfms = [None, TSRegression()]
dls1 = get_ts_dls(X, y, splits=splits, tfms=tfms, bs=4)
# Print every training batch: the raw input tensor and its targets.
for xb, yb in iter(dls1.train):
    print(xb.data, yb)
tensor([[[5., 5., 5., 5., 5.],
         [5., 5., 5., 5., 5.]],

        [[7., 7., 7., 7., 7.],
         [7., 7., 7., 7., 7.]],

        [[6., 6., 6., 6., 6.],
         [6., 6., 6., 6., 6.]],

        [[3., 3., 3., 3., 3.],
         [3., 3., 3., 3., 3.]]], device='mps:0') tensor([5., 7., 6., 3.], device='mps:0')
tensor([[[4., 4., 4., 4., 4.],
         [4., 4., 4., 4., 4.]],

        [[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]],

        [[2., 2., 2., 2., 2.],
         [2., 2., 2., 2., 2.]],

        [[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]]], device='mps:0') tensor([4., 1., 2., 0.], device='mps:0')
# Tabular counterpart of the toy time series: row i holds (i, 10*i, 100*i).
data = np.repeat(np.arange(16)[:, None], 3, 1)*np.array([1, 10, 100])
df = pd.DataFrame(data, columns=['cat1', 'cat2', 'cont'])
# The continuous column is cast to float; the two categorical columns stay integer.
df['cont'] = df['cont'].astype(float)
# Same targets as the time-series example, so both datasets can be checked against each other.
df['target'] = y
display(df)
cat_names = ['cat1', 'cat2']
cont_names = ['cont']
target = 'target'
# Only Categorify and FillMissing are applied; Normalize is left commented out.
# `splits` is the one computed for the time-series data.
dls2 = get_tabular_dls(df, procs=[Categorify, FillMissing, #Normalize
                                 ], cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=4)
# Print every training batch: b[0], b[1] and b[2] (categorical codes, continuous values and targets in the output below).
for b in iter(dls2.train):
    print(b[0], b[1], b[2])
cat1 cat2 cont target
0 0 0 0.0 0
1 1 10 100.0 1
2 2 20 200.0 2
3 3 30 300.0 3
4 4 40 400.0 4
5 5 50 500.0 5
6 6 60 600.0 6
7 7 70 700.0 7
8 8 80 800.0 0
9 9 90 900.0 1
10 10 100 1000.0 2
11 11 110 1100.0 3
12 12 120 1200.0 4
13 13 130 1300.0 5
14 14 140 1400.0 6
15 15 150 1500.0 7
tensor([[6, 6],
        [5, 5],
        [4, 4],
        [8, 8]], device='mps:0') tensor([[500.],
        [400.],
        [300.],
        [700.]], device='mps:0') tensor([[5],
        [4],
        [3],
        [7]], device='mps:0', dtype=torch.int8)
tensor([[6, 6],
        [4, 4],
        [2, 2],
        [2, 2]], device='mps:0') tensor([[500.],
        [300.],
        [100.],
        [100.]], device='mps:0') tensor([[5],
        [3],
        [1],
        [1]], device='mps:0', dtype=torch.int8)
# Mix the time-series loaders (dls1) with the tabular loaders (dls2) and check the batches line up.
bs = 8
dls = get_mixed_dls(dls1, dls2, bs=bs)
dl = dls.train
xb, yb = dl.one_batch()
# The input is a pair: one element per wrapped dataloader.
test_eq(len(xb), 2)
# First element: the time-series batch, holding `bs` samples.
test_eq(len(xb[0]), bs)
# Second element: the tabular batch, itself a pair with `bs` rows each.
test_eq(len(xb[1]), 2)
test_eq(len(xb[1][0]), bs)
test_eq(len(xb[1][1]), bs)
# Categorical codes are compared after subtracting 1; the continuous column is compared after dividing by 100.
test_eq(xb[0].data[:, 0, 0].long(), xb[1][0][:, 0] - 1) # categorical data and ts are in sync
test_eq(xb[0].data[:, 0, 0], (xb[1][1]/100).flatten()) # continuous data and ts are in sync
# For the training batch, the sampled indices are compared directly with the targets.
test_eq(tensor(dl.input_idxs), yb.long().cpu())
dl = dls.valid
xb, yb = dl.one_batch()
# For the validation batch, the targets are looked up in `y` through the sampled indices.
test_eq(tensor(y[dl.input_idxs]), yb.long().cpu())
# Rebuild the mixed dataloaders with a smaller batch size and print every training batch:
# the time-series input, the tabular input and the targets.
bs = 4
dls = get_mixed_dls(dls1, dls2, bs=bs)
for xb, yb in iter(dls.train):
    print(xb[0].data, xb[1], yb)
tensor([[[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]],

        [[4., 4., 4., 4., 4.],
         [4., 4., 4., 4., 4.]],

        [[6., 6., 6., 6., 6.],
         [6., 6., 6., 6., 6.]]], device='mps:0') (tensor([[1, 1],
        [2, 2],
        [5, 5],
        [7, 7]], device='mps:0'), tensor([[  0.],
        [100.],
        [400.],
        [600.]], device='mps:0')) tensor([0., 1., 4., 6.], device='mps:0')
tensor([[[2., 2., 2., 2., 2.],
         [2., 2., 2., 2., 2.]],

        [[3., 3., 3., 3., 3.],
         [3., 3., 3., 3., 3.]],

        [[5., 5., 5., 5., 5.],
         [5., 5., 5., 5., 5.]],

        [[7., 7., 7., 7., 7.],
         [7., 7., 7., 7., 7.]]], device='mps:0') (tensor([[3, 3],
        [4, 4],
        [6, 6],
        [8, 8]], device='mps:0'), tensor([[200.],
        [300.],
        [500.],
        [700.]], device='mps:0')) tensor([2., 3., 5., 7.], device='mps:0')