Mixed data

DataLoader that can take data from multiple dataloaders holding different types of data

MixedDataLoaders

 MixedDataLoaders (*loaders, path:str|pathlib.Path='.', device=None)

Basic wrapper around several DataLoaders.

	Type	Default	Details
loaders	VAR_POSITIONAL		`DataLoader` objects to wrap
path	str \| pathlib.Path	.	Path to store export objects
device	NoneType	None	Device to put `DataLoaders`

source

MixedDataLoader

 MixedDataLoader (*loaders, path='.', shuffle=False, device=None, bs=None)

Accepts any number of `DataLoader`s and a device

source

get_mixed_dls

 get_mixed_dls (*dls, device=None, shuffle_train=None, shuffle_valid=None,
                **kwargs)

from tsai.data.tabular import *

# Example: build two tabular DataLoaders over the same DataFrame and the same
# train/valid splits, each exposing a different subset of the columns.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
# df['salary'] = np.random.rand(len(df)) # uncomment to simulate a continuous dependent variable
target = 'salary'
# A single set of splits is computed once and reused for both DataLoaders below.
splits = RandomSplitter()(range_of(df))

# First DataLoaders: three categorical and two continuous columns, batch size 512.
cat_names = ['workclass', 'education', 'marital-status']
cont_names = ['age', 'fnlwgt']
dls1 = get_tabular_dls(df, cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=512)
dls1.show_batch()

# Second DataLoaders: no categorical columns are passed, only one continuous
# column, and a different batch size (128).
cat_names = None # alternative: ['occupation', 'relationship', 'race']
cont_names = ['education-num']
dls2 = get_tabular_dls(df, cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=128)
dls2.show_batch()

	workclass	education	marital-status	age	fnlwgt	salary
0	Private	Bachelors	Married-civ-spouse	38.0	95335.998904	>=50k
1	Private	HS-grad	Never-married	23.0	292023.001113	<50k
2	Self-emp-not-inc	HS-grad	Never-married	37.0	154641.001507	<50k
3	Private	HS-grad	Divorced	35.0	82621.994134	<50k
4	Local-gov	Bachelors	Never-married	33.0	161942.000197	<50k
5	Self-emp-inc	HS-grad	Married-civ-spouse	65.0	81412.997852	<50k
6	Private	HS-grad	Married-civ-spouse	43.0	122749.002328	<50k
7	Private	Some-college	Married-civ-spouse	35.0	163391.999564	<50k
8	Private	HS-grad	Married-civ-spouse	35.0	214890.999959	<50k
9	Private	Bachelors	Never-married	22.0	140000.999474	<50k

	education-num_na	education-num	salary
0	False	14.0	<50k
1	False	9.0	<50k
2	False	10.0	<50k
3	True	10.0	<50k
4	False	9.0	<50k
5	False	10.0	<50k
6	False	9.0	<50k
7	False	13.0	<50k
8	False	9.0	<50k
9	False	9.0	>=50k

# Combine the two tabular DataLoaders into one mixed DataLoaders object; the
# batch size passed here (8) differs from the ones used to build dls1 and dls2.
dls = get_mixed_dls(dls1, dls2, bs=8)
# Pull the first batch from each split to check that both can be iterated.
first(dls.train)
first(dls.valid)
# Round trip through disk: save the whole object, drop it, and load it back.
# NOTE(review): assumes the 'export' directory already exists — confirm.
torch.save(dls,'export/mixed_dls.pth')
del dls
# weights_only=False makes torch.load unpickle the full Python object instead
# of restricting it to tensors and plain containers; only use on trusted files.
dls = torch.load('export/mixed_dls.pth', weights_only=False)
dls.train.show_batch()

	workclass	education	marital-status	age	fnlwgt	salary
0	Private	Some-college	Married-civ-spouse	23.0	38706.999913	>=50k
1	Private	Bachelors	Never-married	43.0	351576.003599	>=50k
2	Local-gov	10th	Married-civ-spouse	42.0	180984.999698	>=50k
3	Local-gov	Bachelors	Never-married	34.0	183800.999868	<50k
4	Private	HS-grad	Married-civ-spouse	39.0	230329.000550	<50k
5	Private	10th	Married-civ-spouse	51.0	41473.998101	<50k
6	Private	HS-grad	Married-civ-spouse	41.0	90021.001512	<50k
7	Private	Assoc-acdm	Never-married	30.0	251825.001788	<50k

	education-num_na	education-num	salary
0	False	12.0	<50k
1	False	9.0	<50k
2	False	13.0	<50k
3	False	10.0	>=50k
4	False	15.0	>=50k
5	False	13.0	<50k
6	False	10.0	<50k
7	False	13.0	>=50k

from tsai.data.validation import TimeSplitter
from tsai.data.core import TSRegression, get_ts_dls

# Synthetic time-series data: X has shape (16, 2, 5) and sample i is filled
# entirely with the value i, so a sample's identity can be read off any element.
X = np.repeat(np.repeat(np.arange(16)[:, None, None], 2, 1), 5, 2).astype(float)
# Targets are 0..7 repeated twice, so y[i] == i for the first half of the samples.
y = np.concatenate([np.arange(len(X)//2)]*2)
# `alphabet` is only needed by the optional line below, which would turn the
# numeric targets into letters.
alphabet = np.array(list(string.ascii_lowercase))
# y = alphabet[y]
# NOTE(review): presumably splits the indices in time order, half and half — confirm.
splits = TimeSplitter(.5, show_plot=False)(range_of(X))
# No transform on the inputs; TSRegression is applied to the targets.
tfms = [None, TSRegression()]
dls1 = get_ts_dls(X, y, splits=splits, tfms=tfms, bs=4)
# Print every training batch: the input values next to their targets.
for xb, yb in iter(dls1.train):
    print(xb.data, yb)

tensor([[[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]],

        [[3., 3., 3., 3., 3.],
         [3., 3., 3., 3., 3.]],

        [[2., 2., 2., 2., 2.],
         [2., 2., 2., 2., 2.]],

        [[5., 5., 5., 5., 5.],
         [5., 5., 5., 5., 5.]]], device='mps:0') tensor([1., 3., 2., 5.], device='mps:0')
tensor([[[6., 6., 6., 6., 6.],
         [6., 6., 6., 6., 6.]],

        [[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[7., 7., 7., 7., 7.],
         [7., 7., 7., 7., 7.]],

        [[4., 4., 4., 4., 4.],
         [4., 4., 4., 4., 4.]]], device='mps:0') tensor([6., 0., 7., 4.], device='mps:0')

# Synthetic tabular data aligned with the time-series samples: row i holds
# (i, 10*i, 100*i), so each row can be matched back to its sample index.
data = np.repeat(np.arange(16)[:, None], 3, 1)*np.array([1, 10, 100])
df = pd.DataFrame(data, columns=['cat1', 'cat2', 'cont'])
df['cont'] = df['cont'].astype(float)
# Reuses the `y` targets (and, below, the `splits`) already defined for the
# time-series example, so both datasets describe the same samples.
df['target'] = y
display(df)
cat_names = ['cat1', 'cat2']
cont_names = ['cont']
target = 'target'
# Normalize is left out of `procs`, so the continuous column keeps its raw values.
dls2 = get_tabular_dls(df, procs=[Categorify, FillMissing, #Normalize
                                 ], cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=4)
# Each batch is printed as three items: b[0], b[1] and b[2]
# (categorical codes, continuous values and targets in the output shown).
for b in iter(dls2.train):
    print(b[0], b[1], b[2])

	cat1	cat2	cont	target
0	0	0	0.0	0
1	1	10	100.0	1
2	2	20	200.0	2
3	3	30	300.0	3
4	4	40	400.0	4
5	5	50	500.0	5
6	6	60	600.0	6
7	7	70	700.0	7
8	8	80	800.0	0
9	9	90	900.0	1
10	10	100	1000.0	2
11	11	110	1100.0	3
12	12	120	1200.0	4
13	13	130	1300.0	5
14	14	140	1400.0	6
15	15	150	1500.0	7

tensor([[8, 8],
        [6, 6],
        [2, 2],
        [4, 4]], device='mps:0') tensor([[700.],
        [500.],
        [100.],
        [300.]], device='mps:0') tensor([[7],
        [5],
        [1],
        [3]], device='mps:0', dtype=torch.int8)
tensor([[6, 6],
        [3, 3],
        [1, 1],
        [6, 6]], device='mps:0') tensor([[500.],
        [200.],
        [  0.],
        [500.]], device='mps:0') tensor([[5],
        [2],
        [0],
        [5]], device='mps:0', dtype=torch.int8)

# Check that a mixed batch keeps the time-series and tabular samples aligned.
bs = 8
dls = get_mixed_dls(dls1, dls2, bs=bs)
dl = dls.train
xb, yb = dl.one_batch()
# xb has one entry per wrapped DataLoader: xb[0] is the time-series input and
# xb[1] is the pair of tabular inputs (categorical, continuous).
test_eq(len(xb), 2)
test_eq(len(xb[0]), bs)
test_eq(len(xb[1]), 2)
test_eq(len(xb[1][0]), bs)
test_eq(len(xb[1][1]), bs)
# The -1 suggests category codes start at 1 (presumably 0 is reserved) — confirm.
test_eq(xb[0].data[:, 0, 0].long(), xb[1][0][:, 0] - 1) # categorical data and ts are in sync
# The continuous column was built as 100 * sample index, hence the division.
test_eq(xb[0].data[:, 0, 0], (xb[1][1]/100).flatten()) # continuous data and ts are in sync
# In the training split the sample indices are expected to equal the targets.
test_eq(tensor(dl.input_idxs), yb.long().cpu())
dl = dls.valid
xb, yb = dl.one_batch()
# In the validation split the targets are looked up through `y` by sample index.
test_eq(tensor(y[dl.input_idxs]), yb.long().cpu())

# Rebuild the mixed DataLoaders with a smaller batch size and print every
# training batch: the time-series input, the tabular inputs and the targets.
bs = 4
dls = get_mixed_dls(dls1, dls2, bs=bs)
for xb, yb in iter(dls.train):
    print(xb[0].data, xb[1], yb)

tensor([[[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]],

        [[2., 2., 2., 2., 2.],
         [2., 2., 2., 2., 2.]],

        [[4., 4., 4., 4., 4.],
         [4., 4., 4., 4., 4.]],

        [[6., 6., 6., 6., 6.],
         [6., 6., 6., 6., 6.]]], device='mps:0') (tensor([[2, 2],
        [3, 3],
        [5, 5],
        [7, 7]], device='mps:0'), tensor([[100.],
        [200.],
        [400.],
        [600.]], device='mps:0')) tensor([1., 2., 4., 6.], device='mps:0')
tensor([[[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[3., 3., 3., 3., 3.],
         [3., 3., 3., 3., 3.]],

        [[5., 5., 5., 5., 5.],
         [5., 5., 5., 5., 5.]],

        [[7., 7., 7., 7., 7.],
         [7., 7., 7., 7., 7.]]], device='mps:0') (tensor([[1, 1],
        [4, 4],
        [6, 6],
        [8, 8]], device='mps:0'), tensor([[  0.],
        [300.],
        [500.],
        [700.]], device='mps:0')) tensor([0., 3., 5., 7.], device='mps:0')