Attention is all you need: Transformer Notes

Self-Attention
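
For reference, this is the scaled dot-product attention formula from the paper; the code below implements it step by step with randomly initialized weights:

$$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^\top}{\sqrt{d_k}}\right)V$$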


import torch
import torch.nn.functional as F

# Sample text
sentence = "The cat sat on the mat"

# Tokenize the sentence (naive tokenization for simplicity)
tokens = sentence.lower().split()

# Simulate an embedding layer: map each token to a random embedding
# In practice, use pre-trained embeddings or an embedding layer of a model
token_to_embedding = {token: torch.randn(1, 4) for token in set(tokens)}  # 4-dimensional embeddings

# Convert tokens to embeddings
embeddings = torch.cat([token_to_embedding[token] for token in tokens], dim=0)

# Learned weight matrices (randomly initialized for this example)
W_q = torch.randn(4, 4)  # (embedding_size, output_size)
W_k = torch.randn(4, 4)  # (embedding_size, output_size)
W_v = torch.randn(4, 4)  # (embedding_size, output_size)

# Compute query (Q), key (K), and value (V) matrices
Q = torch.matmul(embeddings, W_q)
K = torch.matmul(embeddings, W_k)
V = torch.matmul(embeddings, W_v)

# Compute scaled dot-product attention
d_k = Q.size(-1)  # dimension of the keys
# The line below computes QK^T / sqrt(d_k).
# In the general batched case:
#   Q = [N, L_q, D]  (batch size, query length, dimension of each query vector)
#   K = [N, L_k, D]  --> transposed to [N, D, L_k]
#   QK^T = [N, L_q, L_k]: the similarity between each query and each key
# (Here there is no batch dimension, so Q is [L_q, D] and K is [L_k, D].)
scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
attention_weights = F.softmax(scores, dim=-1)
# The attention output is the softmax weights multiplied by V
self_attention_output = torch.matmul(attention_weights, V)

print("Self-Attention Output:\\n", self_attention_output)

Cross-Attention

import torch
import torch.nn.functional as F

# Sample sentences
sentence_a = "The cat sat on the mat"  # Source
sentence_b = "Le chat était sur le tapis"  # Target

# Tokenize sentences (naive tokenization for simplicity)
tokens_a = sentence_a.lower().split()
tokens_b = sentence_b.lower().split()

# Simulate an embedding layer: map each token to a random embedding
# Mapping for sentence A (source)
token_to_embedding_a = {token: torch.randn(1, 4) for token in set(tokens_a)}
embeddings_a = torch.cat([token_to_embedding_a[token] for token in tokens_a], dim=0)

# Mapping for sentence B (target)
token_to_embedding_b = {token: torch.randn(1, 4) for token in set(tokens_b)}
embeddings_b = torch.cat([token_to_embedding_b[token] for token in tokens_b], dim=0)

# Learned weight matrices (randomly initialized for this example)
# The only difference from self-attention: the query comes from a different sequence,
# while the key and value both come from the same (source) sequence
# For sentence A (source)
W_k_a = torch.randn(4, 4)
W_v_a = torch.randn(4, 4)

# For sentence B (target)
W_q_b = torch.randn(4, 4)

# Compute query (Q) from B, key (K) and value (V) from A
Q_b = torch.matmul(embeddings_b, W_q_b)
K_a = torch.matmul(embeddings_a, W_k_a)
V_a = torch.matmul(embeddings_a, W_v_a)

# Compute scaled dot-product cross-attention
d_k = Q_b.size(-1)  # dimension of the keys
scores = torch.matmul(Q_b, K_a.transpose(-2, -1)) / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
attention_weights = F.softmax(scores, dim=-1)
cross_attention_output = torch.matmul(attention_weights, V_a)

print("Cross-Attention Output:\\n", cross_attention_output)