This is a custom PyTorch implementation by @yangtzech, based on the TST implementation by Ignacio Oguiza.
Usual values are the ones that appear in the “Attention Is All You Need” and “A Transformer-based Framework for Multivariate Time Series Representation Learning” papers. Some additional parameters are needed for the RNN part.
The default values are the ones selected as a default configuration in the latter.
bias: If False, then the layer does not use bias weights b_ih and b_hh. Default: True
rnn_dropout: If non-zero, introduces a Dropout layer on the outputs of each RNN layer except the last layer, with dropout probability equal to rnn_dropout. Default: 0
bidirectional: If True, becomes a bidirectional RNN. Default: False
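The RNN-specific arguments (bias, rnn_dropout, bidirectional) are passed on to the recurrent backbone. The following is only a sketch of how they would map onto a standard PyTorch recurrent layer (nn.LSTM is assumed here for illustration; the actual backbone depends on whether RNNAttention, LSTMAttention or GRUAttention is used):
import torch
import torch.nn as nn
# Sketch only: rough mapping of the RNN-specific arguments onto a PyTorch recurrent layer
c_in, hidden_size, rnn_layers = 9, 128, 1
rnn = nn.LSTM(input_size=c_in,          # number of input channels/variables
              hidden_size=hidden_size,  # size of the hidden state passed on to the encoder
              num_layers=rnn_layers,
              bias=True,                # bias: use b_ih and b_hh weights
              batch_first=True,         # inputs as [bs, seq_len, c_in]
              dropout=0,                # rnn_dropout: only applied between stacked RNN layers
              bidirectional=False)      # bidirectional: doubles the output feature size
out, _ = rnn(torch.rand(16, 50, c_in))
out.shape  # torch.Size([16, 50, 128])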
t = torch.rand(16, 50, 128)
output, attn = _MultiHeadAttention(d_model=128, n_heads=3, d_k=8, d_v=6)(t, t, t)
output.shape, attn.shape
(torch.Size([16, 50, 128]), torch.Size([16, 3, 50, 50]))
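As a quick sanity check (assuming the attention weights come from a softmax over the key dimension, as in standard scaled dot-product attention), each row of attn should sum to approximately 1:
# attn: [bs, n_heads, q_len, q_len]; rows sum to ~1 if softmax-normalized
torch.allclose(attn.sum(-1), torch.ones(16, 3, 50), atol=1e-5)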
t = torch.rand(16, 50, 128)
output = _TSTEncoderLayer(q_len=50, d_model=128, n_heads=3, d_k=None, d_v=None, d_ff=512, dropout=0.1, activation='gelu')(t)
output.shape
torch.Size([16, 50, 128])
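Each encoder layer preserves the [bs, q_len, d_model] shape, so several of them can simply be stacked (similar to what the encoder_layers argument controls). A small sketch, assuming, as in the cell above, that each layer takes and returns a single tensor:
import torch.nn as nn
# Sketch: stacking 3 encoder layers; the shape is unchanged
encoder = nn.Sequential(*[_TSTEncoderLayer(q_len=50, d_model=128, n_heads=3, d_k=None, d_v=None,
                                            d_ff=512, dropout=0.1, activation='gelu') for _ in range(3)])
encoder(t).shape  # torch.Size([16, 50, 128])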
GRUAttention (c_in:int, c_out:int, seq_len:int, hidden_size=128, rnn_layers=1, bias=True, rnn_dropout=0, bidirectional=False, encoder_layers:int=3, n_heads:int=16, d_k:Optional[int]=None, d_v:Optional[int]=None, d_ff:int=256, encoder_dropout:float=0.1, act:str='gelu', fc_dropout:float=0.0, y_range:Optional[tuple]=None, verbose:bool=False, custom_head=None)
Same as nn.Module, but no need for subclasses to call super().__init__
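A minimal GRUAttention usage sketch (mirroring the RNNAttention examples further below; values are illustrative, not asserted outputs):
xb = torch.rand(16, 9, 50)                        # [bs, c_in, seq_len]
model = GRUAttention(c_in=9, c_out=2, seq_len=50)
model(xb).shape  # torch.Size([16, 2])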
LSTMAttention (c_in:int, c_out:int, seq_len:int, hidden_size=128, rnn_layers=1, bias=True, rnn_dropout=0, bidirectional=False, encoder_layers:int=3, n_heads:int=16, d_k:Optional[int]=None, d_v:Optional[int]=None, d_ff:int=256, encoder_dropout:float=0.1, act:str='gelu', fc_dropout:float=0.0, y_range:Optional[tuple]=None, verbose:bool=False, custom_head=None)
Same as nn.Module, but no need for subclasses to call super().__init__
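LSTMAttention exposes the same interface; a short sketch with a bidirectional backbone (a hypothetical configuration, not one of the defaults):
xb = torch.rand(16, 9, 50)                                          # [bs, c_in, seq_len]
model = LSTMAttention(c_in=9, c_out=2, seq_len=50, bidirectional=True)
model(xb).shape  # torch.Size([16, 2])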
RNNAttention (c_in:int, c_out:int, seq_len:int, hidden_size=128, rnn_layers=1, bias=True, rnn_dropout=0, bidirectional=False, encoder_layers:int=3, n_heads:int=16, d_k:Optional[int]=None, d_v:Optional[int]=None, d_ff:int=256, encoder_dropout:float=0.1, act:str='gelu', fc_dropout:float=0.0, y_range:Optional[tuple]=None, verbose:bool=False, custom_head=None)
Same as nn.Module, but no need for subclasses to call super().__init__
bs = 32
c_in = 9 # aka channels, features, variables, dimensions
c_out = 2
seq_len = 500
xb = torch.randn(bs, c_in, seq_len)
# standardize by channel (by_var) based on the training set
xb = (xb - xb.mean((0, 2), keepdim=True)) / xb.std((0, 2), keepdim=True)
# Settings
hidden_size = 128
rnn_layers = 1
bias = True
rnn_dropout = 0
bidirectional = False
encoder_layers = 3
n_heads = 16
d_k = d_v = None # if None --> d_model // n_heads
d_ff = 256
encoder_dropout = 0.1
act = "gelu"
fc_dropout = 0.1
kwargs = {}
model = RNNAttention(c_in, c_out, seq_len, hidden_size=hidden_size, rnn_layers=rnn_layers, bias=bias, rnn_dropout=rnn_dropout, bidirectional=bidirectional,
encoder_layers=encoder_layers, n_heads=n_heads,
d_k=d_k, d_v=d_v, d_ff=d_ff, encoder_dropout=encoder_dropout, act=act,
fc_dropout=fc_dropout, **kwargs)
test_eq(model.to(xb.device)(xb).shape, [bs, c_out])
print(f'model parameters: {count_parameters(model)}')
model parameters: 541698
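To see how a configuration change affects the model size, the same check can be repeated with a different setting, e.g. a bidirectional backbone (a sketch only; the resulting parameter count is not asserted here):
model_bidir = RNNAttention(c_in, c_out, seq_len, hidden_size=hidden_size, rnn_layers=rnn_layers,
                           bias=bias, rnn_dropout=rnn_dropout, bidirectional=True,
                           encoder_layers=encoder_layers, n_heads=n_heads,
                           d_k=d_k, d_v=d_v, d_ff=d_ff, encoder_dropout=encoder_dropout, act=act,
                           fc_dropout=fc_dropout)
test_eq(model_bidir.to(xb.device)(xb).shape, [bs, c_out])
print(f'model parameters (bidirectional): {count_parameters(model_bidir)}')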
bs = 32
c_in = 9 # aka channels, features, variables, dimensions
c_out = 2
seq_len = 60
xb = torch.randn(bs, c_in, seq_len)
# standardize by channel (by_var) based on the training set
xb = (xb - xb.mean((0, 2), keepdim=True)) / xb.std((0, 2), keepdim=True)
# Settings
hidden_size = 128
rnn_layers = 1
bias = True
rnn_dropout = 0
bidirectional = False
encoder_layers = 3
n_heads = 16
d_k = d_v = None # if None --> d_model // n_heads
d_ff = 256
encoder_dropout = 0.1
act = "gelu"
fc_dropout = 0.1
kwargs = {}
# kwargs = dict(kernel_size=5, padding=2)
model = RNNAttention(c_in, c_out, seq_len, hidden_size=hidden_size, rnn_layers=rnn_layers, bias=bias, rnn_dropout=rnn_dropout, bidirectional=bidirectional,
encoder_layers=encoder_layers, n_heads=n_heads,
d_k=d_k, d_v=d_v, d_ff=d_ff, encoder_dropout=encoder_dropout, act=act,
fc_dropout=fc_dropout, **kwargs)
test_eq(model.to(xb.device)(xb).shape, [bs, c_out])
print(f'model parameters: {count_parameters(model)}')
model parameters: 429058
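For classification, the model output is one unnormalized score per class (assuming y_range is left as None); probabilities can then be obtained with a softmax over the last dimension:
logits = model(xb)               # [bs, c_out] raw class scores
probas = logits.softmax(dim=-1)  # class probabilities (each row sums to 1)
probas.shape  # torch.Size([32, 2])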