diff --git a/SMDL/Day5/lessons/task2/position_encoding.py b/SMDL/Day5/lessons/task2/position_encoding.py
index 488899b..98a3464 100644
--- a/SMDL/Day5/lessons/task2/position_encoding.py
+++ b/SMDL/Day5/lessons/task2/position_encoding.py
@@ -10,7 +10,33 @@ class TokenAndPositionEmbedding(Layer):
     def __init__(self, maxlen, vocab_size, embed_dim):
         super(TokenAndPositionEmbedding, self).__init__()
         self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
-        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)
+        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim,
+                                 weights=[self.init_sinusoid_table(maxlen, embed_dim)])
+
+    def init_sinusoid_table(self, n_positions, embed_dim):
+        # Initializes the sinusoid position encoding table
+        # Formula:
+        #   sin(t / 10000^(2k / embed_dim)) when i = 2k (even)
+        #   cos(t / 10000^(2k / embed_dim)) when i = 2k + 1 (odd)
+        #
+        # t = position being encoded
+        # i = dimension index (from 0 up to embed_dim - 1)
+        # k = quotient when i is divided by 2
+        position_enc = np.array([
+            [t / np.power(10000, 2 * (i // 2) / embed_dim) for i in range(embed_dim)]
+            for t in range(n_positions)])
+
+        # apply sine on even embedding dimensions
+        # [1:, ...] means start from position index 1 (position 0 is reserved for the padding token)
+        # [..., 0::2] means take every 2nd entry starting from the 0th embedding dim (i.e. even dims)
+        position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])
+
+        # apply cosine on odd embedding dimensions
+        # [1:, ...] means start from position index 1 (position 0 is reserved for the padding token)
+        # [..., 1::2] means take every 2nd entry starting from the 1st embedding dim (i.e. odd dims)
+        position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])
+
+        return position_enc

     def call(self, x):
         maxlen = tf.shape(x)[-1]
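As a quick way to sanity-check the new `init_sinusoid_table` initializer outside the layer, the same recipe can be reproduced standalone. This is a minimal sketch, assuming only NumPy; the helper name and the `n_positions`/`embed_dim` values are illustrative and not part of the change:

```python
import numpy as np

def sinusoid_table(n_positions, embed_dim):
    # Same recipe as init_sinusoid_table above: angle = t / 10000^(2*(i//2)/embed_dim),
    # then sine on even dims and cosine on odd dims (row 0 is left as-is for the padding token).
    table = np.array([
        [t / np.power(10000, 2 * (i // 2) / embed_dim) for i in range(embed_dim)]
        for t in range(n_positions)])
    table[1:, 0::2] = np.sin(table[1:, 0::2])
    table[1:, 1::2] = np.cos(table[1:, 1::2])
    return table

table = sinusoid_table(n_positions=50, embed_dim=32)
print(table.shape)                       # (50, 32) - matches the Embedding weight matrix shape
print(np.all(np.abs(table[1:]) <= 1.0))  # True - every non-padding entry is a sine/cosine value
```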
diff --git a/SMDL/Day5/lessons/task2/positional_encoding.md b/SMDL/Day5/lessons/task2/positional_encoding.md
new file mode 100644
index 0000000..367b228
--- /dev/null
+++ b/SMDL/Day5/lessons/task2/positional_encoding.md
@@ -0,0 +1,5 @@
+## Positional Encoding
+
+![formula](sinusoidal_formula.png)
+
+Reference: https://kazemnejad.com/blog/transformer_architecture_positional_encoding
diff --git a/SMDL/Day5/lessons/task2/sinusoidal_formula.png b/SMDL/Day5/lessons/task2/sinusoidal_formula.png
new file mode 100644
index 0000000..219bd8e
Binary files /dev/null and b/SMDL/Day5/lessons/task2/sinusoidal_formula.png differ
diff --git a/SMDL/Day5/lessons/task2/task-info.yaml b/SMDL/Day5/lessons/task2/task-info.yaml
index 6ebcf18..3f77175 100644
--- a/SMDL/Day5/lessons/task2/task-info.yaml
+++ b/SMDL/Day5/lessons/task2/task-info.yaml
@@ -9,3 +9,9 @@ files:
   visible: true
 - name: transformer.py
   visible: true
+- name: sinusoidal_formula.png
+  visible: true
+- name: positional_encoding.md
+  visible: true
+- name: token_n_position_embedding.png
+  visible: true
diff --git a/SMDL/Day5/lessons/task2/task.md b/SMDL/Day5/lessons/task2/task.md
index 7499da0..6fb6db4 100644
--- a/SMDL/Day5/lessons/task2/task.md
+++ b/SMDL/Day5/lessons/task2/task.md
@@ -1,20 +1,48 @@
 ## Transformer

 A Transformer consists of the following components:
-- Positional Embedding:
-    - This is a way to encode sequence information without using a recurrent cell. We essentially pass the position indices (e.g. `[0, 1, 2, ...]`) as an extra input. Think of this like a cross-sectional approach where we pass in `[(0, token1), (1, token2), ....]` instead of `[token1, token 2, ...]`.
-    - An Embedding layer produces the encoding into vectors.
-    - Conceptually, once we have a position encoding (as a vector), we can compute similarity, compare relative positions of a token in different samples, etc.
-    - The position encoding is **added** to the token encoding, to create the combined (position, token) encoding. You can think of it as different "biases" to the token encoding, to indicate where the token is located in the sequence.
-- Multi-headed Self-attention:
-    - Self-attention means the query-key and value are all on the **same** sequence (hence "self"). It allows learning of contextual information (attention) on the source sequence based on the words in the source sequence. In comparison, Vanilla Attention uses a target (i.e. a different) sequence to learn contextual information on the source sequence.
-    - Multi-headed means multiple parallel self-attention blocks are used. Analogous to having more neurons in Dense layer or channels in a Convolution layer to train more weights in parallel.
+- Token & Position Embedding
+- Multi-headed Self-attention
+
+### 1. Token & Position Embedding
+
+This is a way to embed sequence information without using a recurrent cell. We essentially pass the position indices (e.g. `[0, 1, 2, ...]`) as an extra input. Think of this like a cross-sectional approach where we pass in `[(0, token1), (1, token2), ...]` instead of `[token1, token2, ...]`.
+
+An Embedding layer maps the position indices into vectors. It is common to initialize this Embedding layer with a sinusoidal encoding. More details are described in [positional_encoding.md](positional_encoding.md).
+
+The position embedding is **added** to the token embedding, to create the combined (position, token) encoding. You can think of it as different "biases" to the token encoding, indicating where the token is located in the sequence.
+
+![token_n_position_embedding](token_n_position_embedding.png)
+
+### 2. Multi-headed Self-attention
+
+**Self-attention** means the query, key and value all come from the **same** sequence (hence "self"). It allows learning of contextual information (attention) on the source sequence based on the words in the source sequence. In comparison, Vanilla Attention uses a target (i.e. a different) sequence to learn contextual information on the source sequence.
+
+**Multi-headed** means multiple parallel self-attention blocks are used. This is analogous to having more neurons in a Dense layer or more channels in a Convolution layer, to train more weights in parallel.

-    ![internals](multiheaded_self_attention.png)
+![internals](multiheaded_self_attention.png)
+
+## Putting it Together
+
+This is the architecture described in the famous [Attention is All You Need](https://arxiv.org/abs/1706.03762) paper.
+
+Note that while the paper describes an Encoder-Decoder model, Transformers can be used in non-encoder-decoder models as well, such as [text classification](https://keras.io/examples/nlp/text_classification_with_transformer/). They are essentially a replacement for LSTMs and GRUs.
+
+### (1/2): Transformer Encoder
+
+`transformer.py` demonstrates the encoder portion of the Transformer model. The encoder can be used as a feature extractor for a sequence learning task, effectively as a drop-in replacement for RNNs.
+
+The Encoder architecture is depicted on the left-hand side of the diagram below (the Decoder is described in the next section).
+![transformer](https://www.tensorflow.org/images/tutorials/transformer/transformer.png)
+
+### (2/2): Transformer Decoder
+
+The Decoder is made up of building blocks similar to the Encoder's. It has a Transformer Block with two multi-headed self-attention layers:
+- Positional encoding is applied to the current target sequence, which is then fed into one layer of multi-headed self-attention.
+- The output from the first layer is combined with the Encoder output and fed into the second multi-headed self-attention layer.
+- Finally, the output from the Decoder Transformer Block is passed through an MLP classifier to predict the next target token.

-- Transformer Encoder. `transformer.py` demonstrates the encoder portion of the Transformer model. The encoder can be used as a feature extractor for a sequence learning task, effectively as a drop-in replacement for RNNs.
-    ![transformer](https://www.tensorflow.org/images/tutorials/transformer/transformer.png)

-References:
+### References
 - https://keras.io/examples/nlp/text_classification_with_transformer/: Basic Transformer block
 - https://www.tensorflow.org/tutorials/text/transformer: Application of Transformer in Encoder/Decoder.
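To make the task.md description above concrete, here is a minimal, self-contained sketch of the two ideas: adding a position embedding to a token embedding, and running multi-headed self-attention over the result. It uses the built-in `tf.keras.layers.MultiHeadAttention` rather than the lesson's own attention layer, and the shapes and random inputs are illustrative only:

```python
import tensorflow as tf

# Toy batch: 4 sequences of 10 token ids from a vocabulary of 100, embedded into 32 dims.
embed_dim = 32
tokens = tf.random.uniform((4, 10), maxval=100, dtype=tf.int32)
token_emb = tf.keras.layers.Embedding(input_dim=100, output_dim=embed_dim)
pos_emb = tf.keras.layers.Embedding(input_dim=10, output_dim=embed_dim)

# Combined (position, token) encoding: the position vectors are simply added to the token vectors.
positions = tf.range(start=0, limit=10)
x = token_emb(tokens) + pos_emb(positions)

# Self-attention: query, key and value all come from the same sequence `x`.
# "Multi-headed" = several attention blocks run in parallel (here, 2 heads).
mha = tf.keras.layers.MultiHeadAttention(num_heads=2, key_dim=embed_dim)
attended = mha(query=x, value=x, key=x)
print(attended.shape)  # (4, 10, 32) - same shape as the input sequence
```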
diff --git a/SMDL/Day5/lessons/task2/token_n_position_embedding.png b/SMDL/Day5/lessons/task2/token_n_position_embedding.png
new file mode 100644
index 0000000..6442250
Binary files /dev/null and b/SMDL/Day5/lessons/task2/token_n_position_embedding.png differ
diff --git a/SMDL/Day5/lessons/task2/transformer.py b/SMDL/Day5/lessons/task2/transformer.py
index 2ff8d51..3ff61e5 100644
--- a/SMDL/Day5/lessons/task2/transformer.py
+++ b/SMDL/Day5/lessons/task2/transformer.py
@@ -110,7 +110,14 @@ def create_model(sequence_len, vocab_size, embed_dim, num_heads):
     model.summary()
     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

-    model.fit(X, y, epochs=10, batch_size=2)
+    model.fit(X, y, epochs=30, batch_size=2)

-    pred = model.predict(vectorizer(['achieve my goals']))
-    print(pred)
+    test = [['achieve my goals'], ['transformers are fun']]
+    pred = model.predict(vectorizer(test))
+
+    # pred is a 2D column vector, so we flatten it to a 1D vector
+    # np.round rounds the probabilities to the nearest whole number (0 or 1)
+    # the label encoder only accepts integers, so we need .astype(int) to convert the floats to ints
+    pred_classes = le.inverse_transform(np.round(pred.flatten()).astype(int))
+    print(test)
+    print(pred_classes)
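For readers unsure what the new post-processing in transformer.py does, the flatten / round / inverse_transform chain can be exercised on its own. This is an illustrative sketch: the label names are made up, and `le` here is a stand-in for whatever `LabelEncoder` transformer.py fits on its training labels:

```python
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Stand-in for the `le` used in transformer.py (assumed to be a LabelEncoder fitted
# on the two string labels of this binary task; the label names here are hypothetical).
le = LabelEncoder()
le.fit(['negative', 'positive'])

# model.predict on a sigmoid output returns a (batch, 1) column of probabilities.
pred = np.array([[0.91], [0.23]])

# flatten -> 1D, round -> {0., 1.}, astype(int) -> integer class ids for inverse_transform
pred_classes = le.inverse_transform(np.round(pred.flatten()).astype(int))
print(pred_classes)  # ['positive' 'negative']
```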