
NLP, Transformers & ChatGPT

von Carmen F.

Write a pseudocode implementation of a transformer layer.

import torch

import torch.nn as nn

import torch.nn.functional as F

class MultiHeadSelfAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The embedding dimension is split into ``num_heads`` chunks of size
    ``head_dim = embed_size // num_heads``. The same bias-free
    ``head_dim -> head_dim`` projections are applied to every head, attention
    is computed per head, and the concatenated heads are passed through a
    final linear layer.

    Args:
        embed_size: Size of the last dimension of the inputs and the output.
        num_heads: Number of attention heads; must divide ``embed_size``.
    """

    def __init__(self, embed_size, num_heads):
        super().__init__()

        assert embed_size % num_heads == 0, "Embedding size must be divisible by number of heads"

        self.embed_size = embed_size
        self.num_heads = num_heads
        self.head_dim = embed_size // num_heads

        # Per-head projections; the last input dimension is head_dim because
        # forward() reshapes to (..., num_heads, head_dim) before applying them.
        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)

        # Mixes the concatenated heads back into the embedding space.
        self.fc_out = nn.Linear(num_heads * self.head_dim, embed_size)

    def forward(self, values, keys, queries, mask):
        """Compute attention of ``queries`` over ``keys``/``values``.

        Args:
            values: Tensor reshaped here as (N, value_len, embed_size).
            keys: Tensor reshaped here as (N, key_len, embed_size).
            queries: Tensor reshaped here as (N, query_len, embed_size).
            mask: ``None`` or a tensor broadcastable to
                (N, num_heads, query_len, key_len); positions where
                ``mask == 0`` are excluded from attention.

        Returns:
            Tensor of shape (N, query_len, embed_size).
        """
        N = queries.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], queries.shape[1]

        # Split the embedding into self.num_heads different pieces.
        values = values.reshape(N, value_len, self.num_heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.num_heads, self.head_dim)
        queries = queries.reshape(N, query_len, self.num_heads, self.head_dim)

        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)

        # (N, query_len, num_heads, head_dim) x (N, key_len, num_heads, head_dim)
        #   -> (N, num_heads, query_len, key_len)
        energy = torch.einsum("nqhd,nkhd->nhqk", queries, keys)

        # Scale by sqrt(head_dim): each dot product sums over head_dim terms,
        # so that (not the full embed_size) is the dimension to normalize by.
        energy = energy / (self.head_dim ** 0.5)

        if mask is not None:
            # A very large negative score becomes ~0 probability after softmax.
            energy = energy.masked_fill(mask == 0, float("-1e20"))

        # Normalize over the key dimension.
        attention = torch.softmax(energy, dim=3)

        # (N, num_heads, query_len, key_len) x (N, value_len, num_heads, head_dim)
        #   -> (N, query_len, num_heads, head_dim), then concatenate the heads.
        out = torch.einsum("nhql,nlhd->nqhd", attention, values).reshape(
            N, query_len, self.num_heads * self.head_dim
        )

        return self.fc_out(out)

class TransformerBlock(nn.Module):
    """One transformer layer: attention and a feed-forward network.

    Each sub-layer is followed by a residual connection, layer normalization
    and dropout, in that order.

    Args:
        embed_size: Size of the last dimension of the inputs and the output.
        num_heads: Number of attention heads passed to the attention module.
        dropout: Dropout probability applied after each normalization.
        forward_expansion: Width multiplier of the hidden feed-forward layer
            (hidden size is ``forward_expansion * embed_size``).
    """

    def __init__(self, embed_size, num_heads, dropout, forward_expansion):
        super().__init__()

        self.attention = MultiHeadSelfAttention(embed_size, num_heads)

        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        # Position-wise feed-forward network. The non-linearity between the
        # two linear layers is required: without it they collapse into a
        # single affine map.
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        """Apply attention and the feed-forward network to ``query``.

        Args:
            value: Values tensor forwarded to the attention module.
            key: Keys tensor forwarded to the attention module.
            query: Queries tensor; also used as the residual input, so the
                output has the same shape as ``query``.
            mask: Attention mask forwarded unchanged (may be ``None``).

        Returns:
            Tensor with the same shape as ``query``.
        """
        attention = self.attention(value, key, query, mask)

        # Residual connection around attention, then normalize and drop out.
        x = self.dropout(self.norm1(attention + query))

        forward = self.feed_forward(x)

        # Residual connection around the feed-forward network.
        out = self.dropout(self.norm2(forward + x))

        return out

# Example usage: push one random batch through a single transformer block.

embed_size = 256
num_heads = 8
dropout = 0.1
forward_expansion = 4

transformer_block = TransformerBlock(
    embed_size=embed_size,
    num_heads=num_heads,
    dropout=dropout,
    forward_expansion=forward_expansion,
)

# Random input: batch size 64, sequence length 10, embedding size 256.
x = torch.rand(64, 10, embed_size)

# No attention mask in this example.
mask = None

# Self-attention: the same tensor serves as value, key and query.
out = transformer_block(value=x, key=x, query=x, mask=mask)

print(out.shape)  # Expected output shape: (64, 10, 256)

Experiment with a pre-trained Hugging Face transformer model on a text classification task.

import torch

from transformers import BertTokenizer, BertForSequenceClassification

from transformers import Trainer, TrainingArguments

from datasets import load_dataset

# Load dataset (using IMDB for sentiment analysis as an example)
dataset = load_dataset('imdb')

# Load pre-trained tokenizer and model (two labels: negative / positive)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)


# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``'text'`` field of a batch of examples.

    Every sequence is padded to the tokenizer's maximum length and truncated
    if longer, so all examples end up with the same length.
    """
    return tokenizer(examples['text'], padding='max_length', truncation=True)


# Tokenize dataset (batched=True passes lists of examples to the function)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Prepare data for training and evaluation
train_dataset = tokenized_datasets['train'].shuffle(seed=42).select(range(2000))  # Subset for quick training
test_dataset = tokenized_datasets['test'].shuffle(seed=42).select(range(500))  # Subset for quick evaluation

# Define training arguments
# NOTE(review): the original argument list was lost; the values below are
# conservative placeholders for a quick experiment -- adjust as needed.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_steps=50,
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Save the model (and the tokenizer, so the checkpoint can be reloaded as-is)
# NOTE(review): the output path is a placeholder.
model.save_pretrained('./results/final_model')
tokenizer.save_pretrained('./results/final_model')


How is ChatGPT trained?

ChatGPT is trained using a two-step process involving supervised learning and reinforcement learning, specifically a method known as Reinforcement Learning from Human Feedback (RLHF). Here's a breakdown of the process:

1. Pretraining (Self-Supervised Learning)

  • Objective: The model learns to predict the next word in a sentence given the preceding words.

  • Data: A large and diverse dataset composed of text from books, websites, and other written material from the internet.

  • Process:

    • The model, which is built on the transformer architecture, is trained on a massive corpus of text. It learns language patterns, grammar, facts, and reasoning abilities by predicting the next word in a sentence.

    • This phase results in a general understanding of language and various topics but does not specifically fine-tune the model for conversational abilities.

2. Fine-Tuning (Reinforcement Learning from Human Feedback - RLHF)

  • Objective: Improve the model's performance in generating helpful, accurate, and aligned responses.

  • Data:

    • Human trainers create demonstration data, where they engage in conversations and provide examples of good responses.

    • Additionally, human labelers rank different model outputs from a set of prompts.

  • Process:

    • Supervised Fine-Tuning: The model is first fine-tuned on the demonstration data.

    • Reward Model Training: A separate model is trained to predict human preferences based on rankings.

    • Reinforcement Learning: Using the reward model, the original language model is further fine-tuned to generate responses that maximize the predicted reward, improving response quality and alignment with human preferences.

Iterative Improvement

OpenAI iteratively improves the model by:

  • Collecting more human feedback.

  • Refining the reward model.

  • Repeating the reinforcement learning steps to enhance performance and alignment.

This training methodology allows ChatGPT to generate coherent, contextually appropriate, and helpful responses in conversations.


Carmen F.


Zuletzt geändert