from tsai.data.tabular import *
Mixed data
DataLoader that can take data from multiple dataloaders with different types of data.
MixedDataLoaders
def MixedDataLoaders(
loaders:VAR_POSITIONAL, # `DataLoader` objects to wrap
path:str | Path='.', # Path to store export objects
device:NoneType=None, # Device to put `DataLoaders`
):
Basic wrapper around several DataLoaders.
MixedDataLoader
def MixedDataLoader(
loaders:VAR_POSITIONAL, path:str='.', shuffle:bool=False, device:NoneType=None, bs:NoneType=None
):
Accepts any number of DataLoader objects and a device.
get_mixed_dls
def get_mixed_dls(
dls:VAR_POSITIONAL, device:NoneType=None, shuffle_train:NoneType=None, shuffle_valid:NoneType=None,
kwargs:VAR_KEYWORD
):
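Combine several DataLoaders objects into a single one whose `train` and `valid` loaders yield the data of all of them in sync (see the examples below).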
# Example 1: two tabular DataLoaders built from the same dataframe and the same splits.
# Fetch the Adult sample dataset and load its CSV file into a dataframe.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
# df['salary'] = np.random.rand(len(df)) # uncomment to simulate a cont dependent variable
target = 'salary'
# A single split, shared by both DataLoaders created below.
splits = RandomSplitter()(range_of(df))
# First DataLoaders: three categorical and two continuous columns, batch size 512.
cat_names = ['workclass', 'education', 'marital-status']
cont_names = ['age', 'fnlwgt']
dls1 = get_tabular_dls(df, cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=512)
dls1.show_batch()
# Second DataLoaders: a different column of the same dataframe, no categorical
# variables, and a different batch size (128).
cat_names = None #['occupation', 'relationship', 'race']
cont_names = ['education-num']
dls2 = get_tabular_dls(df, cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=128)
dls2.show_batch()
| | workclass | education | marital-status | age | fnlwgt | salary |
|---|---|---|---|---|---|---|
| 0 | Private | Some-college | Never-married | 22.000000 | 133832.998222 | <50k |
| 1 | State-gov | HS-grad | Separated | 46.000000 | 111163.001785 | <50k |
| 2 | ? | Masters | Never-married | 50.000000 | 22427.997140 | <50k |
| 3 | Self-emp-not-inc | HS-grad | Married-civ-spouse | 43.000000 | 38876.004820 | <50k |
| 4 | Private | 7th-8th | Never-married | 49.000000 | 240868.998836 | <50k |
| 5 | ? | HS-grad | Widowed | 73.000000 | 200878.000171 | <50k |
| 6 | Private | HS-grad | Married-civ-spouse | 38.000000 | 103323.001572 | <50k |
| 7 | Private | Bachelors | Never-married | 45.000000 | 51744.000875 | <50k |
| 8 | Private | HS-grad | Divorced | 28.000000 | 430084.000496 | <50k |
| 9 | Self-emp-not-inc | 7th-8th | Married-civ-spouse | 60.000001 | 127805.000397 | >=50k |
| | education-num_na | education-num | salary |
|---|---|---|---|
| 0 | False | 9.0 | <50k |
| 1 | False | 9.0 | <50k |
| 2 | False | 13.0 | <50k |
| 3 | False | 10.0 | <50k |
| 4 | False | 5.0 | <50k |
| 5 | False | 10.0 | <50k |
| 6 | False | 2.0 | <50k |
| 7 | False | 8.0 | <50k |
| 8 | False | 6.0 | >=50k |
| 9 | False | 10.0 | >=50k |
import os

# Wrap both tabular DataLoaders into a single mixed DataLoaders with one common batch size.
dls = get_mixed_dls(dls1, dls2, bs=8)
# Pull one batch from each split to check that both loaders can be iterated.
first(dls.train)
first(dls.valid)
# Round-trip through disk to check that the mixed DataLoaders can be serialized.
# torch.save does not create missing parent directories, so make sure 'export/' exists.
os.makedirs('export', exist_ok=True)
torch.save(dls, 'export/mixed_dls.pth')
del dls
# weights_only=False is needed to unpickle a full Python object rather than plain tensors.
# Unpickling can execute arbitrary code: only load files that come from a trusted source.
dls = torch.load('export/mixed_dls.pth', weights_only=False)
dls.train.show_batch()
| | workclass | education | marital-status | age | fnlwgt | salary |
|---|---|---|---|---|---|---|
| 0 | Self-emp-not-inc | 7th-8th | Married-civ-spouse | 45.0 | 285335.001819 | <50k |
| 1 | Private | HS-grad | Married-civ-spouse | 32.0 | 178615.000066 | <50k |
| 2 | Private | Some-college | Never-married | 25.0 | 111058.002708 | <50k |
| 3 | Private | HS-grad | Married-civ-spouse | 57.0 | 178352.999653 | >=50k |
| 4 | Private | HS-grad | Never-married | 35.0 | 136343.002194 | <50k |
| 5 | Private | HS-grad | Married-civ-spouse | 53.0 | 104879.001541 | >=50k |
| 6 | Private | HS-grad | Never-married | 31.0 | 61558.998090 | <50k |
| 7 | Private | Some-college | Never-married | 28.0 | 185126.999910 | <50k |
| | education-num_na | education-num | salary |
|---|---|---|---|
| 0 | False | 10.0 | <50k |
| 1 | False | 9.0 | <50k |
| 2 | False | 9.0 | <50k |
| 3 | False | 10.0 | <50k |
| 4 | False | 9.0 | <50k |
| 5 | False | 9.0 | <50k |
| 6 | False | 11.0 | <50k |
| 7 | False | 7.0 | <50k |
from tsai.data.validation import TimeSplitter
from tsai.data.core import TSRegression, get_ts_dls
X = np.repeat(np.repeat(np.arange(16)[:, None, None], 2, 1), 5, 2).astype(float)
# Targets: the values 0..len(X)//2-1 repeated twice, so both halves of the data
# cover the same range of target values.
y = np.concatenate([np.arange(len(X)//2)]*2)
# Optional: uncomment the assignment below to use letters instead of integers as targets.
alphabet = np.array(list(string.ascii_lowercase))
# y = alphabet[y]
# Split without plotting. NOTE(review): presumably .5 holds out the last half of the
# samples for validation - the training batches printed below only contain samples 0-7.
splits = TimeSplitter(.5, show_plot=False)(range_of(X))
# NOTE(review): presumably one entry per element of (X, y): nothing for the inputs,
# TSRegression for the targets - confirm against get_ts_dls.
tfms = [None, TSRegression()]
# Time series DataLoaders with batch size 4.
dls1 = get_ts_dls(X, y, splits=splits, tfms=tfms, bs=4)
for xb, yb in iter(dls1.train):
    print(xb.data, yb)
tensor([[[5., 5., 5., 5., 5.],
[5., 5., 5., 5., 5.]],
[[7., 7., 7., 7., 7.],
[7., 7., 7., 7., 7.]],
[[6., 6., 6., 6., 6.],
[6., 6., 6., 6., 6.]],
[[3., 3., 3., 3., 3.],
[3., 3., 3., 3., 3.]]], device='mps:0') tensor([5., 7., 6., 3.], device='mps:0')
tensor([[[4., 4., 4., 4., 4.],
[4., 4., 4., 4., 4.]],
[[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]],
[[2., 2., 2., 2., 2.],
[2., 2., 2., 2., 2.]],
[[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.]]], device='mps:0') tensor([4., 1., 2., 0.], device='mps:0')
# Build a 16-row table where row i holds (i, 10*i, 100*i), so the value of every
# column identifies the sample it comes from.
data = np.repeat(np.arange(16)[:, None], 3, 1)*np.array([1, 10, 100])
df = pd.DataFrame(data, columns=['cat1', 'cat2', 'cont'])
df['cont'] = df['cont'].astype(float)
# Reuse the targets defined for the time series example so both datasets share the same y.
df['target'] = y
display(df)
cat_names = ['cat1', 'cat2']
cont_names = ['cont']
target = 'target'
# Same splits as the time series DataLoaders. Normalize is left out (commented) so the
# continuous values keep their original scale and can be compared with the time series values.
dls2 = get_tabular_dls(df, procs=[Categorify, FillMissing, #Normalize
], cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=4)
for b in iter(dls2.train):
    print(b[0], b[1], b[2])
| | cat1 | cat2 | cont | target |
|---|---|---|---|---|
| 0 | 0 | 0 | 0.0 | 0 |
| 1 | 1 | 10 | 100.0 | 1 |
| 2 | 2 | 20 | 200.0 | 2 |
| 3 | 3 | 30 | 300.0 | 3 |
| 4 | 4 | 40 | 400.0 | 4 |
| 5 | 5 | 50 | 500.0 | 5 |
| 6 | 6 | 60 | 600.0 | 6 |
| 7 | 7 | 70 | 700.0 | 7 |
| 8 | 8 | 80 | 800.0 | 0 |
| 9 | 9 | 90 | 900.0 | 1 |
| 10 | 10 | 100 | 1000.0 | 2 |
| 11 | 11 | 110 | 1100.0 | 3 |
| 12 | 12 | 120 | 1200.0 | 4 |
| 13 | 13 | 130 | 1300.0 | 5 |
| 14 | 14 | 140 | 1400.0 | 6 |
| 15 | 15 | 150 | 1500.0 | 7 |
tensor([[6, 6],
[5, 5],
[4, 4],
[8, 8]], device='mps:0') tensor([[500.],
[400.],
[300.],
[700.]], device='mps:0') tensor([[5],
[4],
[3],
[7]], device='mps:0', dtype=torch.int8)
tensor([[6, 6],
[4, 4],
[2, 2],
[2, 2]], device='mps:0') tensor([[500.],
[300.],
[100.],
[100.]], device='mps:0') tensor([[5],
[3],
[1],
[1]], device='mps:0', dtype=torch.int8)
bs = 8
# Mix the time series DataLoaders (dls1) with the tabular DataLoaders (dls2) using a
# common batch size.
dls = get_mixed_dls(dls1, dls2, bs=bs)
dl = dls.train
xb, yb = dl.one_batch()
# The inputs are a pair: one element per wrapped dataloader.
test_eq(len(xb), 2)
test_eq(len(xb[0]), bs)
# The tabular element is itself a pair of tensors (categorical, continuous).
test_eq(len(xb[1]), 2)
test_eq(len(xb[1][0]), bs)
test_eq(len(xb[1][1]), bs)
# Categorical codes are offset by 1 with respect to the raw values (see the dls2
# batches printed above), hence the -1.
test_eq(xb[0].data[:, 0, 0].long(), xb[1][0][:, 0] - 1) # categorical data and ts are in sync
# 'cont' was built as 100 * the sample value, hence the division by 100.
test_eq(xb[0].data[:, 0, 0], (xb[1][1]/100).flatten()) # continuous data and ts are in sync
# In the training split the target equals the sample index, so the indices can be
# compared with yb directly.
test_eq(tensor(dl.input_idxs), yb.long().cpu())
dl = dls.valid
xb, yb = dl.one_batch()
# In the validation split targets and indices differ, so look the targets up in y.
test_eq(tensor(y[dl.input_idxs]), yb.long().cpu())

bs = 4
dls = get_mixed_dls(dls1, dls2, bs=bs)
for xb, yb in iter(dls.train):
    print(xb[0].data, xb[1], yb)
tensor([[[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.]],
[[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]],
[[4., 4., 4., 4., 4.],
[4., 4., 4., 4., 4.]],
[[6., 6., 6., 6., 6.],
[6., 6., 6., 6., 6.]]], device='mps:0') (tensor([[1, 1],
[2, 2],
[5, 5],
[7, 7]], device='mps:0'), tensor([[ 0.],
[100.],
[400.],
[600.]], device='mps:0')) tensor([0., 1., 4., 6.], device='mps:0')
tensor([[[2., 2., 2., 2., 2.],
[2., 2., 2., 2., 2.]],
[[3., 3., 3., 3., 3.],
[3., 3., 3., 3., 3.]],
[[5., 5., 5., 5., 5.],
[5., 5., 5., 5., 5.]],
[[7., 7., 7., 7., 7.],
[7., 7., 7., 7., 7.]]], device='mps:0') (tensor([[3, 3],
[4, 4],
[6, 6],
[8, 8]], device='mps:0'), tensor([[200.],
[300.],
[500.],
[700.]], device='mps:0')) tensor([2., 3., 5., 7.], device='mps:0')