from tsai.data.tabular import *
Mixed data
DataLoader that can take data from multiple dataloaders with different types of data
MixedDataLoaders
MixedDataLoaders (*loaders, path:str|pathlib.Path='.', device=None)
Basic wrapper around several `DataLoader`s.
| | Type | Default | Details |
|---|---|---|---|
| loaders | VAR_POSITIONAL | | DataLoader objects to wrap |
| path | str \| pathlib.Path | . | Path to store export objects |
| device | NoneType | None | Device to put DataLoaders |
MixedDataLoader
MixedDataLoader (*loaders, path='.', shuffle=False, device=None, bs=None)
Accepts any number of `DataLoader` and a device
get_mixed_dls
get_mixed_dls (*dls, device=None, shuffle_train=None, shuffle_valid=None, **kwargs)
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
# df['salary'] = np.random.rand(len(df)) # uncomment to simulate a cont dependent variable
target = 'salary'
splits = RandomSplitter()(range_of(df))

cat_names = ['workclass', 'education', 'marital-status']
cont_names = ['age', 'fnlwgt']
dls1 = get_tabular_dls(df, cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=512)
dls1.show_batch()

cat_names = None #['occupation', 'relationship', 'race']
cont_names = ['education-num']
dls2 = get_tabular_dls(df, cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=128)
dls2.show_batch()
| | workclass | education | marital-status | age | fnlwgt | salary |
---|---|---|---|---|---|---|
0 | Private | Bachelors | Married-civ-spouse | 38.0 | 95335.998904 | >=50k |
1 | Private | HS-grad | Never-married | 23.0 | 292023.001113 | <50k |
2 | Self-emp-not-inc | HS-grad | Never-married | 37.0 | 154641.001507 | <50k |
3 | Private | HS-grad | Divorced | 35.0 | 82621.994134 | <50k |
4 | Local-gov | Bachelors | Never-married | 33.0 | 161942.000197 | <50k |
5 | Self-emp-inc | HS-grad | Married-civ-spouse | 65.0 | 81412.997852 | <50k |
6 | Private | HS-grad | Married-civ-spouse | 43.0 | 122749.002328 | <50k |
7 | Private | Some-college | Married-civ-spouse | 35.0 | 163391.999564 | <50k |
8 | Private | HS-grad | Married-civ-spouse | 35.0 | 214890.999959 | <50k |
9 | Private | Bachelors | Never-married | 22.0 | 140000.999474 | <50k |
| | education-num_na | education-num | salary |
---|---|---|---|
0 | False | 14.0 | <50k |
1 | False | 9.0 | <50k |
2 | False | 10.0 | <50k |
3 | True | 10.0 | <50k |
4 | False | 9.0 | <50k |
5 | False | 10.0 | <50k |
6 | False | 9.0 | <50k |
7 | False | 13.0 | <50k |
8 | False | 9.0 | <50k |
9 | False | 9.0 | >=50k |
dls = get_mixed_dls(dls1, dls2, bs=8)
first(dls.train)
first(dls.valid)
torch.save(dls, 'export/mixed_dls.pth')
del dls
dls = torch.load('export/mixed_dls.pth', weights_only=False)
dls.train.show_batch()
| | workclass | education | marital-status | age | fnlwgt | salary |
---|---|---|---|---|---|---|
0 | Private | Some-college | Married-civ-spouse | 23.0 | 38706.999913 | >=50k |
1 | Private | Bachelors | Never-married | 43.0 | 351576.003599 | >=50k |
2 | Local-gov | 10th | Married-civ-spouse | 42.0 | 180984.999698 | >=50k |
3 | Local-gov | Bachelors | Never-married | 34.0 | 183800.999868 | <50k |
4 | Private | HS-grad | Married-civ-spouse | 39.0 | 230329.000550 | <50k |
5 | Private | 10th | Married-civ-spouse | 51.0 | 41473.998101 | <50k |
6 | Private | HS-grad | Married-civ-spouse | 41.0 | 90021.001512 | <50k |
7 | Private | Assoc-acdm | Never-married | 30.0 | 251825.001788 | <50k |
| | education-num_na | education-num | salary |
---|---|---|---|
0 | False | 12.0 | <50k |
1 | False | 9.0 | <50k |
2 | False | 13.0 | <50k |
3 | False | 10.0 | >=50k |
4 | False | 15.0 | >=50k |
5 | False | 13.0 | <50k |
6 | False | 10.0 | <50k |
7 | False | 13.0 | >=50k |
from tsai.data.validation import TimeSplitter
from tsai.data.core import TSRegression, get_ts_dls
X = np.repeat(np.repeat(np.arange(16)[:, None, None], 2, 1), 5, 2).astype(float)
y = np.concatenate([np.arange(len(X)//2)]*2)
alphabet = np.array(list(string.ascii_lowercase))
# y = alphabet[y]
splits = TimeSplitter(.5, show_plot=False)(range_of(X))
tfms = [None, TSRegression()]
dls1 = get_ts_dls(X, y, splits=splits, tfms=tfms, bs=4)
for xb, yb in iter(dls1.train):
    print(xb.data, yb)
tensor([[[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]],
[[3., 3., 3., 3., 3.],
[3., 3., 3., 3., 3.]],
[[2., 2., 2., 2., 2.],
[2., 2., 2., 2., 2.]],
[[5., 5., 5., 5., 5.],
[5., 5., 5., 5., 5.]]], device='mps:0') tensor([1., 3., 2., 5.], device='mps:0')
tensor([[[6., 6., 6., 6., 6.],
[6., 6., 6., 6., 6.]],
[[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.]],
[[7., 7., 7., 7., 7.],
[7., 7., 7., 7., 7.]],
[[4., 4., 4., 4., 4.],
[4., 4., 4., 4., 4.]]], device='mps:0') tensor([6., 0., 7., 4.], device='mps:0')
data = np.repeat(np.arange(16)[:, None], 3, 1)*np.array([1, 10, 100])
df = pd.DataFrame(data, columns=['cat1', 'cat2', 'cont'])
df['cont'] = df['cont'].astype(float)
df['target'] = y
display(df)
cat_names = ['cat1', 'cat2']
cont_names = ['cont']
target = 'target'
dls2 = get_tabular_dls(df, procs=[Categorify, FillMissing, #Normalize
                                  ], cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=4)
for b in iter(dls2.train):
    print(b[0], b[1], b[2])
| | cat1 | cat2 | cont | target |
---|---|---|---|---|
0 | 0 | 0 | 0.0 | 0 |
1 | 1 | 10 | 100.0 | 1 |
2 | 2 | 20 | 200.0 | 2 |
3 | 3 | 30 | 300.0 | 3 |
4 | 4 | 40 | 400.0 | 4 |
5 | 5 | 50 | 500.0 | 5 |
6 | 6 | 60 | 600.0 | 6 |
7 | 7 | 70 | 700.0 | 7 |
8 | 8 | 80 | 800.0 | 0 |
9 | 9 | 90 | 900.0 | 1 |
10 | 10 | 100 | 1000.0 | 2 |
11 | 11 | 110 | 1100.0 | 3 |
12 | 12 | 120 | 1200.0 | 4 |
13 | 13 | 130 | 1300.0 | 5 |
14 | 14 | 140 | 1400.0 | 6 |
15 | 15 | 150 | 1500.0 | 7 |
tensor([[8, 8],
[6, 6],
[2, 2],
[4, 4]], device='mps:0') tensor([[700.],
[500.],
[100.],
[300.]], device='mps:0') tensor([[7],
[5],
[1],
[3]], device='mps:0', dtype=torch.int8)
tensor([[6, 6],
[3, 3],
[1, 1],
[6, 6]], device='mps:0') tensor([[500.],
[200.],
[ 0.],
[500.]], device='mps:0') tensor([[5],
[2],
[0],
[5]], device='mps:0', dtype=torch.int8)
bs = 8
dls = get_mixed_dls(dls1, dls2, bs=bs)
dl = dls.train
xb, yb = dl.one_batch()
test_eq(len(xb), 2)
test_eq(len(xb[0]), bs)
test_eq(len(xb[1]), 2)
test_eq(len(xb[1][0]), bs)
test_eq(len(xb[1][1]), bs)
test_eq(xb[0].data[:, 0, 0].long(), xb[1][0][:, 0] - 1) # categorical data and ts are in synch
test_eq(xb[0].data[:, 0, 0], (xb[1][1]/100).flatten()) # continuous data and ts are in synch
test_eq(tensor(dl.input_idxs), yb.long().cpu())
dl = dls.valid
xb, yb = dl.one_batch()
test_eq(tensor(y[dl.input_idxs]), yb.long().cpu())
bs = 4
dls = get_mixed_dls(dls1, dls2, bs=bs)
for xb, yb in iter(dls.train):
    print(xb[0].data, xb[1], yb)
tensor([[[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]],
[[2., 2., 2., 2., 2.],
[2., 2., 2., 2., 2.]],
[[4., 4., 4., 4., 4.],
[4., 4., 4., 4., 4.]],
[[6., 6., 6., 6., 6.],
[6., 6., 6., 6., 6.]]], device='mps:0') (tensor([[2, 2],
[3, 3],
[5, 5],
[7, 7]], device='mps:0'), tensor([[100.],
[200.],
[400.],
[600.]], device='mps:0')) tensor([1., 2., 4., 6.], device='mps:0')
tensor([[[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.]],
[[3., 3., 3., 3., 3.],
[3., 3., 3., 3., 3.]],
[[5., 5., 5., 5., 5.],
[5., 5., 5., 5., 5.]],
[[7., 7., 7., 7., 7.],
[7., 7., 7., 7., 7.]]], device='mps:0') (tensor([[1, 1],
[4, 4],
[6, 6],
[8, 8]], device='mps:0'), tensor([[ 0.],
[300.],
[500.],
[700.]], device='mps:0')) tensor([0., 3., 5., 7.], device='mps:0')