import tensorflow as tf


def multi_target_attention(q, k, v, m, seq_name, num_heads=4):
    # q (query): [B, H]
    # k (key):   [B, T, H]
    # v (value): [B, T, H]
    # m (mask):  [B, T], 1 for valid positions, 0 for padding
    # seq_name is unused here; H must be divisible by num_heads.
    # layer_normal is a layer-normalization helper not defined in this snippet.
    q = tf.expand_dims(q, axis=1)                           # [B, 1, H]
    # split the hidden axis into heads and stack the heads on the batch axis
    q = tf.concat(tf.split(q, num_heads, axis=2), axis=0)   # [B*heads, 1, H/heads]
    k = tf.concat(tf.split(k, num_heads, axis=2), axis=0)   # [B*heads, T, H/heads]
    v = tf.concat(tf.split(v, num_heads, axis=2), axis=0)   # [B*heads, T, H/heads]
    q = layer_normal(q)
    k = layer_normal(k)
    # scaled dot-product scores: [B*heads, 1, T], scaled by 1/sqrt(d_k)
    outputs = tf.matmul(q, k, transpose_b=True)
    d_k = tf.shape(k)[-1]
    outputs = outputs * tf.pow(tf.cast(d_k, outputs.dtype), -0.5)
    # broadcast the padding mask across heads and push masked scores to a large
    # negative value so they contribute ~0 after the softmax
    m = tf.expand_dims(m, axis=1)                           # [B, 1, T]
    masks = tf.tile(m, [num_heads, 1, 1])                   # [B*heads, 1, T]
    masks = tf.equal(masks, tf.ones_like(masks))
    paddings = tf.fill(tf.shape(outputs), tf.constant(-2 ** 32 + 1, dtype=outputs.dtype))
    outputs = tf.where(masks, outputs, paddings)
    outputs = tf.nn.softmax(outputs)                        # attention weights over T
    outputs = tf.matmul(outputs, v)                         # [B*heads, 1, H/heads]
    # merge the heads back into the hidden axis and drop the singleton time axis
    outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # [B, 1, H]
    outputs = tf.squeeze(outputs, axis=1)                   # [B, H]
    return outputs
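

# A minimal usage sketch, not part of the original, assuming TensorFlow 2.x eager
# execution: the layer_normal helper is not shown above, so a parameter-free layer
# norm is stood in for it here (an assumption; the real helper may differ), and the
# shapes B=2, T=5, H=8 with num_heads=4 are hypothetical.
def layer_normal(x, eps=1e-6):
    # stand-in layer normalization over the last axis
    mean, var = tf.nn.moments(x, axes=[-1], keepdims=True)
    return (x - mean) / tf.sqrt(var + eps)


if __name__ == "__main__":
    B, T, H = 2, 5, 8
    q = tf.random.normal([B, H])        # one query vector per example
    k = tf.random.normal([B, T, H])     # T candidate keys per example
    v = tf.random.normal([B, T, H])     # values aligned with the keys
    m = tf.constant([[1., 1., 1., 0., 0.],   # example 0: positions 3-4 are padding
                     [1., 1., 1., 1., 1.]])  # example 1: no padding
    out = multi_target_attention(q, k, v, m, seq_name="demo", num_heads=4)
    print(out.shape)                    # (2, 8)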