I'm currently trying to study the effect of masking attention in a transformer model trained to classify time series data. The model works so far and gives me okay-ish performance, but when I try to mask the attention of all MultiHeadAttention layers in the model, the performance stays exactly the same, which is not what I expected.
My model (based on a Keras tutorial):
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras_nlp.layers import SinePositionEncoding  # SinePositionEncoding comes from KerasNLP in my setup

SEED = 42


def build(params: dict, input_shape: tuple) -> keras.Model:
    # input_dim = 1
    sequence_size = params["sequence_size"]
    n_classes = params["n_classes"]
    encoder_blocks = params["encoder_blocks"]
    n_heads = params["encoder_heads"]
    encolder_mlp = params["mlp_dim"]
    conv_filters = params["conv_filters"]
    encoder_dropout = params["encoder_dropout"]
    mlp_dropout = params["mlp_dropout"]
    learning_rate = params["learning_rate"]

    inputs = keras.Input(shape=input_shape, name="sequence_input")
    mask = keras.Input(shape=(sequence_size, sequence_size), name="mask_input")

    x = inputs + SinePositionEncoding()(inputs)
    for _ in range(encoder_blocks):
        x = transformer_encoder(x, head_size=sequence_size, num_heads=n_heads,
                                con_filters=conv_filters, attention_mask=mask,
                                dropout=encoder_dropout, seed=SEED)
        # x, _ = EncoderLayer(d_model=n_heads*5, num_heads=n_heads, dff=conv_filters, rate=encoder_dropout)(x, mask=mask)

    x = layers.GlobalAveragePooling1D(data_format="channels_last")(x)
    x = layers.Dense(encolder_mlp, activation="relu")(x)
    x = layers.Dropout(mlp_dropout, seed=SEED)(x)
    outputs = layers.Dense(n_classes, activation="softmax")(x)

    model = keras.Model(inputs=[inputs, mask], outputs=outputs)
    model.compile(
        loss="categorical_crossentropy",
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        metrics=["categorical_accuracy", "f1_score"],
        run_eagerly=False,
    )
    return model
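For context, this is roughly how I build and train it. X_train / y_train here are just random stand-ins for my real time series, the sizes and epochs are arbitrary, and the all-True mask is my unmasked baseline run:

# toy data standing in for my real set, just to show the call pattern
n_samples, n_features = 512, 1
sequence_size = params["sequence_size"]

X_train = np.random.rand(n_samples, sequence_size, n_features).astype("float32")
y_train = keras.utils.to_categorical(
    np.random.randint(0, params["n_classes"], n_samples), params["n_classes"])

# all-True mask = baseline where every position may attend to every other position
full_mask = np.ones((n_samples, sequence_size, sequence_size), dtype=bool)

model = build(params, input_shape=(sequence_size, n_features))
model.fit([X_train, full_mask], y_train, validation_split=0.2, epochs=10, batch_size=32)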
with my transformer_encoder defined as:
def transformer_encoder(inputs: np.ndarray, head_size: int, num_heads: int, con_filters: int,
                        attention_mask, dropout=0, seed=42):
    x, att = layers.MultiHeadAttention(
        key_dim=head_size, num_heads=num_heads, dropout=dropout, seed=seed)(
            inputs, inputs,
            attention_mask=tf.ones((sequence_size, sequence_size), dtype=bool),
            return_attention_scores=True, training=True)
    tf.print(att)
    # print(f"output: {x}")
    # x, _ = MultiHeadAttention(d_model=num_heads*5, num_heads=num_heads)(inputs, inputs, inputs, attention_mask)
    print(x)
    x = layers.Dropout(dropout, seed=seed)(x)
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    res = x + inputs

    x = layers.Conv1D(filters=con_filters, kernel_size=1, activation="relu")(res)
    x = layers.Dropout(dropout, seed=seed)(x)
    x = layers.Conv1D(filters=inputs.shape[-1], kernel_size=1)(x)
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    return x + res
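To narrow it down, I also checked a MultiHeadAttention layer in isolation to see whether a (partial) attention_mask changes the returned scores at all. This is just a sketch with made-up sizes, reusing the tf / layers / np imports from above:

# standalone sanity check: compare attention scores with and without a mask
seq_len, n_feat = 16, 4
dummy = tf.random.normal((1, seq_len, n_feat))
mha = layers.MultiHeadAttention(num_heads=2, key_dim=n_feat)

_, scores_unmasked = mha(dummy, dummy, return_attention_scores=True)

# lower-triangular (causal) mask: each position may only attend to itself and earlier positions
causal = tf.cast(tf.linalg.band_part(tf.ones((1, seq_len, seq_len)), -1, 0), tf.bool)
_, scores_masked = mha(dummy, dummy, attention_mask=causal,
                       return_attention_scores=True)

# if the mask is applied, the two score tensors should differ
print(np.allclose(scores_unmasked.numpy(), scores_masked.numpy()))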
So far I have tried passing a mask with every input, and masking all of the attention with tf.zeros((sequence_size, sequence_size), dtype=bool). I also tried changing the shape of the masks, but no luck either; the variants are sketched below.
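To make that concrete, the two variants looked roughly like this (a sketch; X_train / y_train / model are the same stand-ins as in the training snippet above):

# variant 1: feed an all-False mask through the model's mask_input
zero_mask = np.zeros((len(X_train), sequence_size, sequence_size), dtype=bool)
model.fit([X_train, zero_mask], y_train, validation_split=0.2, epochs=10, batch_size=32)

# variant 2: hard-code the mask inside transformer_encoder instead of the tf.ones above
x, att = layers.MultiHeadAttention(
    key_dim=head_size, num_heads=num_heads, dropout=dropout, seed=seed)(
        inputs, inputs,
        attention_mask=tf.zeros((sequence_size, sequence_size), dtype=bool),
        return_attention_scores=True, training=True)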
Does anybody know the answer?