from tsai.data.tabular import *
Mixed data
DataLoader that can take data from multiple dataloaders with different types of data
MixedDataLoaders
MixedDataLoaders (*loaders, path:str|Path='.', device=None)
Basic wrapper around several DataLoaders.
MixedDataLoader
MixedDataLoader (*loaders, path='.', shuffle=False, device=None, bs=None)
Accepts any number of DataLoaders and a device.
get_mixed_dls
get_mixed_dls (*dls, device=None, shuffle_train=None, shuffle_valid=None, **kwargs)
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
# df['salary'] = np.random.rand(len(df)) # uncomment to simulate a cont dependent variable
target = 'salary'
splits = RandomSplitter()(range_of(df))
cat_names = ['workclass', 'education', 'marital-status']
cont_names = ['age', 'fnlwgt']
dls1 = get_tabular_dls(df, cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=512)
dls1.show_batch()
cat_names = None #['occupation', 'relationship', 'race']
cont_names = ['education-num']
dls2 = get_tabular_dls(df, cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=128)
dls2.show_batch()
workclass | education | marital-status | age | fnlwgt | salary | |
---|---|---|---|---|---|---|
0 | Private | Bachelors | Married-civ-spouse | 59.999999 | 131680.999115 | >=50k |
1 | Private | 12th | Never-married | 18.000000 | 311795.000052 | <50k |
2 | Private | HS-grad | Married-civ-spouse | 45.000000 | 350440.002257 | >=50k |
3 | Local-gov | Masters | Never-married | 44.000000 | 101593.001253 | <50k |
4 | ? | Some-college | Never-married | 20.999999 | 41355.995576 | <50k |
5 | Private | Bachelors | Never-married | 30.000000 | 207668.000292 | <50k |
6 | Federal-gov | Bachelors | Never-married | 28.000000 | 281859.998606 | <50k |
7 | ? | Some-college | Never-married | 20.999999 | 180338.999810 | <50k |
8 | Private | Some-college | Never-married | 20.000000 | 174713.999509 | <50k |
9 | Self-emp-not-inc | Bachelors | Married-civ-spouse | 50.000000 | 334273.005863 | <50k |
education-num_na | education-num | salary | |
---|---|---|---|
0 | False | 9.0 | <50k |
1 | False | 9.0 | <50k |
2 | False | 13.0 | >=50k |
3 | False | 9.0 | <50k |
4 | False | 9.0 | <50k |
5 | False | 13.0 | >=50k |
6 | False | 10.0 | <50k |
7 | False | 10.0 | <50k |
8 | False | 13.0 | <50k |
9 | False | 10.0 | <50k |
dls = get_mixed_dls(dls1, dls2, bs=8)
first(dls.train)
first(dls.valid)
torch.save(dls, 'export/mixed_dls.pth')
del dls
dls = torch.load('export/mixed_dls.pth')
dls.train.show_batch()
workclass | education | marital-status | age | fnlwgt | salary | |
---|---|---|---|---|---|---|
0 | State-gov | HS-grad | Never-married | 43.000000 | 23156.998049 | <50k |
1 | Private | 11th | Married-civ-spouse | 32.000000 | 140092.001434 | <50k |
2 | Self-emp-not-inc | HS-grad | Never-married | 43.000000 | 48086.995399 | <50k |
3 | Self-emp-not-inc | Assoc-acdm | Never-married | 34.000000 | 177638.999728 | <50k |
4 | Local-gov | Masters | Married-civ-spouse | 65.000001 | 146453.999176 | <50k |
5 | Private | HS-grad | Married-civ-spouse | 33.000000 | 227281.999333 | <50k |
6 | Private | HS-grad | Never-married | 33.000000 | 194900.999911 | <50k |
7 | Private | HS-grad | Divorced | 23.000000 | 259301.002460 | <50k |
education-num_na | education-num | salary | |
---|---|---|---|
0 | False | 9.0 | <50k |
1 | False | 7.0 | <50k |
2 | False | 9.0 | <50k |
3 | False | 12.0 | <50k |
4 | False | 14.0 | <50k |
5 | True | 10.0 | <50k |
6 | False | 9.0 | <50k |
7 | False | 9.0 | <50k |
xb, yb = first(dls.train)
xb
((tensor([[ 8, 12, 5],
[ 5, 2, 3],
[ 7, 12, 5],
[ 7, 8, 5],
[ 3, 13, 3],
[ 5, 12, 3],
[ 5, 12, 5],
[ 5, 12, 1]]),
tensor([[ 0.3222, -1.5782],
[-0.4850, -0.4696],
[ 0.3222, -1.3418],
[-0.3383, -0.1136],
[ 1.9368, -0.4093],
[-0.4117, 0.3570],
[-0.4117, 0.0500],
[-1.1455, 0.6606]])),
(tensor([[1],
[1],
[1],
[1],
[1],
[2],
[1],
[1]]),
tensor([[-0.4258],
[-1.2097],
[-0.4258],
[ 0.7502],
[ 1.5342],
[-0.0338],
[-0.4258],
[-0.4258]])))
xs, ys = first(dls.train)
xs[0][0].shape, xs[0][1].shape, xs[1][0].shape, xs[1][1].shape
(torch.Size([8, 3]),
torch.Size([8, 2]),
torch.Size([8, 1]),
torch.Size([8, 1]))
from tsai.data.validation import TimeSplitter
from tsai.data.core import TSRegression, get_ts_dls
X = np.repeat(np.repeat(np.arange(8)[:, None, None], 2, 1), 5, 2).astype(float)
X = np.concatenate([X, X])
y = np.concatenate([np.arange(len(X)//2)]*2)
alphabet = np.array(list(string.ascii_lowercase))
# y = alphabet[y]
splits = TimeSplitter(.5, show_plot=False)(range_of(X))
tfms = [None, TSRegression()]
dls1 = get_ts_dls(X, y, splits=splits, tfms=tfms)
dls1.one_batch()
(TSTensor(samples:8, vars:2, len:5, device=cpu, dtype=torch.float32),
tensor([7., 0., 2., 1., 5., 4., 3., 6.]))
data = np.concatenate([np.repeat(np.arange(8)[:, None], 3, 1)*np.array([1, 10, 100])]*2)
df = pd.DataFrame(data, columns=['cat1', 'cat2', 'cont'])
df['cont'] = df['cont'].astype(float)
df['target'] = y
cat_names = ['cat1', 'cat2']
cont_names = ['cont']
target = 'target'
dls2 = get_tabular_dls(df, procs=[Categorify, FillMissing, #Normalize
                                  ], cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=8)
dls2.one_batch()
(tensor([[2, 2],
[5, 5],
[1, 1],
[7, 7],
[3, 3],
[6, 6],
[8, 8],
[4, 4]]),
tensor([[100.],
[400.],
[ 0.],
[600.],
[200.],
[500.],
[700.],
[300.]]),
tensor([[1],
[4],
[0],
[6],
[2],
[5],
[7],
[3]], dtype=torch.int8))
z = zip(_loaders[dls1.train.fake_l.num_workers == 0](dls1.train.fake_l))
for b in z:
    print(b)
    break
((TSTensor(samples:8, vars:2, len:5, device=cpu, dtype=torch.float32), tensor([7., 0., 2., 1., 5., 4., 3., 6.])),)
bs = 8
dls = get_mixed_dls(dls1, dls2, bs=bs)
dl = dls.train
xb, yb = dl.one_batch()
test_eq(len(xb), 2)
test_eq(len(xb[0]), bs)
test_eq(len(xb[1]), 2)
test_eq(len(xb[1][0]), bs)
test_eq(len(xb[1][1]), bs)
test_eq(xb[0].data[:, 0, 0].long(), xb[1][0][:, 0] - 1) # categorical data and ts are in synch
test_eq(xb[0].data[:, 0, 0], (xb[1][1]/100).flatten()) # continuous data and ts are in synch
test_eq(tensor(dl.input_idxs), yb.long().cpu())
dl = dls.valid
xb, yb = dl.one_batch()
test_eq(tensor(y[dl.input_idxs]), yb.long().cpu())