Mixed data

DataLoader that can take data from multiple DataLoaders with different types of data (for example, time series and tabular data)


source

MixedDataLoaders

 MixedDataLoaders (*loaders, path:str|pathlib.Path='.', device=None)

Basic wrapper around several DataLoaders.

         Type                Default  Details
loaders  VAR_POSITIONAL               DataLoader objects to wrap
path     str | pathlib.Path  .        Path to store export objects
device   NoneType            None     Device to put DataLoaders
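
The wrapper itself is thin: it presumably behaves like fastai's DataLoaders container, holding the loaders you pass in and exposing them through the usual attributes. A minimal sketch, assuming train_mixed_dl and valid_mixed_dl are hypothetical MixedDataLoader objects built as shown below; in practice get_mixed_dls assembles this object for you.

mixed_dls = MixedDataLoaders(train_mixed_dl, valid_mixed_dl)  # hypothetical loaders
xb, yb = mixed_dls.train.one_batch()  # assumed: first loader is exposed as .train, second as .valid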

source

MixedDataLoader

 MixedDataLoader (*loaders, path='.', shuffle=False, device=None, bs=None)

Accepts any number of DataLoader objects and a device.
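
get_mixed_dls (below) is the usual way to build these, but a MixedDataLoader can also be assembled directly from existing DataLoaders. A minimal sketch, assuming dls1 and dls2 are the DataLoaders built in the examples further down this page; the names are illustrative only.

train_mixed_dl = MixedDataLoader(dls1.train, dls2.train, shuffle=True, bs=8)
xb, yb = train_mixed_dl.one_batch()  # xb holds one input per wrapped loader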


source

get_mixed_dls

 get_mixed_dls (*dls, device=None, shuffle_train=None, shuffle_valid=None,
                **kwargs)
from tsai.data.tabular import *
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
# df['salary'] = np.random.rand(len(df)) # uncomment to simulate a cont dependent variable
target = 'salary'
splits = RandomSplitter()(range_of(df))

cat_names = ['workclass', 'education', 'marital-status']
cont_names = ['age', 'fnlwgt']
dls1 = get_tabular_dls(df, cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=512)
dls1.show_batch()

cat_names = None #['occupation', 'relationship', 'race']
cont_names = ['education-num']
dls2 = get_tabular_dls(df, cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=128)
dls2.show_batch()
   workclass         education     marital-status      age   fnlwgt         salary
0  Private           Bachelors     Married-civ-spouse  38.0  95335.998904   >=50k
1  Private           HS-grad       Never-married       23.0  292023.001113  <50k
2  Self-emp-not-inc  HS-grad       Never-married       37.0  154641.001507  <50k
3  Private           HS-grad       Divorced            35.0  82621.994134   <50k
4  Local-gov         Bachelors     Never-married       33.0  161942.000197  <50k
5  Self-emp-inc      HS-grad       Married-civ-spouse  65.0  81412.997852   <50k
6  Private           HS-grad       Married-civ-spouse  43.0  122749.002328  <50k
7  Private           Some-college  Married-civ-spouse  35.0  163391.999564  <50k
8  Private           HS-grad       Married-civ-spouse  35.0  214890.999959  <50k
9  Private           Bachelors     Never-married       22.0  140000.999474  <50k

   education-num_na  education-num  salary
0  False             14.0           <50k
1  False             9.0            <50k
2  False             10.0           <50k
3  True              10.0           <50k
4  False             9.0            <50k
5  False             10.0           <50k
6  False             9.0            <50k
7  False             13.0           <50k
8  False             9.0            <50k
9  False             9.0            >=50k
dls = get_mixed_dls(dls1, dls2, bs=8)
first(dls.train)  # draw one batch from each split as a sanity check
first(dls.valid)
torch.save(dls, 'export/mixed_dls.pth')  # mixed DataLoaders can be serialized with torch.save
del dls
dls = torch.load('export/mixed_dls.pth', weights_only=False)  # and restored later
dls.train.show_batch()
   workclass    education     marital-status      age   fnlwgt         salary
0  Private      Some-college  Married-civ-spouse  23.0  38706.999913   >=50k
1  Private      Bachelors     Never-married       43.0  351576.003599  >=50k
2  Local-gov    10th          Married-civ-spouse  42.0  180984.999698  >=50k
3  Local-gov    Bachelors     Never-married       34.0  183800.999868  <50k
4  Private      HS-grad       Married-civ-spouse  39.0  230329.000550  <50k
5  Private      10th          Married-civ-spouse  51.0  41473.998101   <50k
6  Private      HS-grad       Married-civ-spouse  41.0  90021.001512   <50k
7  Private      Assoc-acdm    Never-married       30.0  251825.001788  <50k

   education-num_na  education-num  salary
0  False             12.0           <50k
1  False             9.0            <50k
2  False             13.0           <50k
3  False             10.0           >=50k
4  False             15.0           >=50k
5  False             13.0           <50k
6  False             10.0           <50k
7  False             13.0           >=50k
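
Each batch drawn from the reloaded mixed DataLoaders keeps one input per wrapped loader. A quick inspection sketch (names are just local variables; tabular inputs typically arrive as a (categorical, continuous) pair of tensors):

xb, yb = first(dls.train)
print(len(xb))  # one entry per wrapped DataLoader (2 here)
for x in xb:
    print(type(x), [o.shape for o in x] if isinstance(x, (tuple, list)) else x.shape)
print(yb.shape)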
import string
from tsai.data.validation import TimeSplitter
from tsai.data.core import TSRegression, get_ts_dls
# Synthetic time series: 16 samples of shape (2 variables, 5 steps), where every value
# of sample i equals i, so samples are easy to track across dataloaders.
X = np.repeat(np.repeat(np.arange(16)[:, None, None], 2, 1), 5, 2).astype(float)
y = np.concatenate([np.arange(len(X)//2)]*2)  # targets: [0..7, 0..7]
alphabet = np.array(list(string.ascii_lowercase))
# y = alphabet[y] # uncomment to use a categorical (string) target instead
splits = TimeSplitter(.5, show_plot=False)(range_of(X))
tfms = [None, TSRegression()]
dls1 = get_ts_dls(X, y, splits=splits, tfms=tfms, bs=4)
for xb, yb in iter(dls1.train):
    print(xb.data, yb)
tensor([[[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]],

        [[3., 3., 3., 3., 3.],
         [3., 3., 3., 3., 3.]],

        [[2., 2., 2., 2., 2.],
         [2., 2., 2., 2., 2.]],

        [[5., 5., 5., 5., 5.],
         [5., 5., 5., 5., 5.]]], device='mps:0') tensor([1., 3., 2., 5.], device='mps:0')
tensor([[[6., 6., 6., 6., 6.],
         [6., 6., 6., 6., 6.]],

        [[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[7., 7., 7., 7., 7.],
         [7., 7., 7., 7., 7.]],

        [[4., 4., 4., 4., 4.],
         [4., 4., 4., 4., 4.]]], device='mps:0') tensor([6., 0., 7., 4.], device='mps:0')
# Matching tabular data: row i is (cat1=i, cat2=10*i, cont=100*i), so each row can be
# matched against time series sample i above.
data = np.repeat(np.arange(16)[:, None], 3, 1)*np.array([1, 10, 100])
df = pd.DataFrame(data, columns=['cat1', 'cat2', 'cont'])
df['cont'] = df['cont'].astype(float)
df['target'] = y
display(df)
cat_names = ['cat1', 'cat2']
cont_names = ['cont']
target = 'target'
dls2 = get_tabular_dls(df, procs=[Categorify, FillMissing],  # add Normalize here if needed
                       cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=4)
for b in iter(dls2.train):
    print(b[0], b[1], b[2])
    cat1  cat2    cont  target
0      0     0     0.0       0
1      1    10   100.0       1
2      2    20   200.0       2
3      3    30   300.0       3
4      4    40   400.0       4
5      5    50   500.0       5
6      6    60   600.0       6
7      7    70   700.0       7
8      8    80   800.0       0
9      9    90   900.0       1
10    10   100  1000.0       2
11    11   110  1100.0       3
12    12   120  1200.0       4
13    13   130  1300.0       5
14    14   140  1400.0       6
15    15   150  1500.0       7
tensor([[8, 8],
        [6, 6],
        [2, 2],
        [4, 4]], device='mps:0') tensor([[700.],
        [500.],
        [100.],
        [300.]], device='mps:0') tensor([[7],
        [5],
        [1],
        [3]], device='mps:0', dtype=torch.int8)
tensor([[6, 6],
        [3, 3],
        [1, 1],
        [6, 6]], device='mps:0') tensor([[500.],
        [200.],
        [  0.],
        [500.]], device='mps:0') tensor([[5],
        [2],
        [0],
        [5]], device='mps:0', dtype=torch.int8)
bs = 8
dls = get_mixed_dls(dls1, dls2, bs=bs)
dl = dls.train
xb, yb = dl.one_batch()
test_eq(len(xb), 2)       # one input per wrapped DataLoader
test_eq(len(xb[0]), bs)   # time series batch
test_eq(len(xb[1]), 2)    # tabular input is a (categorical, continuous) tuple
test_eq(len(xb[1][0]), bs)
test_eq(len(xb[1][1]), bs)
test_eq(xb[0].data[:, 0, 0].long(), xb[1][0][:, 0] - 1)  # categorical data and ts are in sync (Categorify codes start at 1)
test_eq(xb[0].data[:, 0, 0], (xb[1][1]/100).flatten())   # continuous data and ts are in sync
test_eq(tensor(dl.input_idxs), yb.long().cpu())          # train targets equal their sample indices by construction
dl = dls.valid
xb, yb = dl.one_batch()
test_eq(tensor(y[dl.input_idxs]), yb.long().cpu())       # valid batch targets match y at the sampled indices
bs = 4
dls = get_mixed_dls(dls1, dls2, bs=bs)
for xb, yb in iter(dls.train):
    print(xb[0].data, xb[1], yb)
tensor([[[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]],

        [[2., 2., 2., 2., 2.],
         [2., 2., 2., 2., 2.]],

        [[4., 4., 4., 4., 4.],
         [4., 4., 4., 4., 4.]],

        [[6., 6., 6., 6., 6.],
         [6., 6., 6., 6., 6.]]], device='mps:0') (tensor([[2, 2],
        [3, 3],
        [5, 5],
        [7, 7]], device='mps:0'), tensor([[100.],
        [200.],
        [400.],
        [600.]], device='mps:0')) tensor([1., 2., 4., 6.], device='mps:0')
tensor([[[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[3., 3., 3., 3., 3.],
         [3., 3., 3., 3., 3.]],

        [[5., 5., 5., 5., 5.],
         [5., 5., 5., 5., 5.]],

        [[7., 7., 7., 7., 7.],
         [7., 7., 7., 7., 7.]]], device='mps:0') (tensor([[1, 1],
        [4, 4],
        [6, 6],
        [8, 8]], device='mps:0'), tensor([[  0.],
        [300.],
        [500.],
        [700.]], device='mps:0')) tensor([0., 3., 5., 7.], device='mps:0')
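
To train on these mixed batches, a model only needs a forward method that unpacks the same (time series, (categorical, continuous)) structure shown above. The toy PyTorch module below is just a sketch of that idea, not tsai's own multimodal API; the shapes and names are assumptions taken from the printed batches (bs=4, 2 variables, 5 steps, 2 categorical and 1 continuous column). tsai also provides models designed for this kind of multimodal input; see the library's tutorial notebooks for the supported approach.

import torch
import torch.nn as nn

class ToyMixedModel(nn.Module):
    "Sketch: consumes (x_ts, (x_cat, x_cont)) batches and returns one output per sample."
    def __init__(self, c_in=2, seq_len=5, n_cat=2, n_cont=1, n_out=1):
        super().__init__()
        self.ts_head  = nn.Sequential(nn.Flatten(), nn.Linear(c_in * seq_len, 16), nn.ReLU())
        self.tab_head = nn.Sequential(nn.Linear(n_cat + n_cont, 16), nn.ReLU())
        self.head     = nn.Linear(32, n_out)
    def forward(self, xb):
        x_ts, (x_cat, x_cont) = xb  # same nesting as the batches printed above
        ts_feat  = self.ts_head(torch.as_tensor(x_ts).float())
        tab_feat = self.tab_head(torch.cat([x_cat.float(), x_cont.float()], dim=1))
        return self.head(torch.cat([ts_feat, tab_feat], dim=1))

# Dummy tensors shaped like the batches above, so the sketch runs on its own
x_ts   = torch.rand(4, 2, 5)
x_cat  = torch.randint(1, 9, (4, 2))
x_cont = torch.rand(4, 1) * 100
model  = ToyMixedModel()
preds  = model((x_ts, (x_cat, x_cont)))  # -> shape (4, 1)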