NLP, Transformers & ChatGPT

by Carmen F.

Write a pseudocode implementation of a transformer layer.

import torch
import torch.nn as nn


class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embed_size, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        assert embed_size % num_heads == 0, "Embedding size must be divisible by number of heads"
        self.embed_size = embed_size
        self.num_heads = num_heads
        self.head_dim = embed_size // num_heads

        # Per-head linear projections for values, keys, and queries
        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(num_heads * self.head_dim, embed_size)

    def forward(self, values, keys, queries, mask):
        N = queries.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], queries.shape[1]

        # Split the embedding into self.num_heads different pieces
        values = values.reshape(N, value_len, self.num_heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.num_heads, self.head_dim)
        queries = queries.reshape(N, query_len, self.num_heads, self.head_dim)

        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)

        # (N, query_len, num_heads, head_dim) x (N, key_len, num_heads, head_dim)
        # -> (N, num_heads, query_len, key_len)
        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])

        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))

        # Scaled dot-product attention: scale the scores by sqrt(head_dim) before the softmax
        attention = torch.softmax(energy / (self.head_dim ** 0.5), dim=3)

        # (N, num_heads, query_len, key_len) x (N, value_len, num_heads, head_dim)
        # -> (N, query_len, num_heads, head_dim), then concatenate the heads
        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
            N, query_len, self.num_heads * self.head_dim
        )
        return self.fc_out(out)


class TransformerBlock(nn.Module):
    def __init__(self, embed_size, num_heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention = MultiHeadSelfAttention(embed_size, num_heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        # Attention sub-layer with residual connection and layer norm
        attention = self.attention(value, key, query, mask)
        x = self.dropout(self.norm1(attention + query))
        # Position-wise feed-forward sub-layer with residual connection and layer norm
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out


# Example usage
embed_size = 256
num_heads = 8
dropout = 0.1
forward_expansion = 4

transformer_block = TransformerBlock(embed_size, num_heads, dropout, forward_expansion)
x = torch.rand(64, 10, embed_size)  # Batch size: 64, sequence length: 10, embedding size: 256
mask = None
out = transformer_block(x, x, x, mask)
print(out.shape)  # Expected output shape: torch.Size([64, 10, 256])
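
As a small addition (not part of the original answer): the mask argument is broadcast against the (batch, heads, query_len, key_len) attention scores, so a lower-triangular matrix of ones gives causal, decoder-style attention. A minimal sketch reusing the block and input above:

# Causal (decoder-style) masking: 1 = attend, 0 = mask out.
# The (seq_len, seq_len) mask broadcasts over batch and heads.
seq_len = 10
causal_mask = torch.tril(torch.ones(seq_len, seq_len))
out_causal = transformer_block(x, x, x, causal_mask)
print(out_causal.shape)  # torch.Size([64, 10, 256])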


Experiment with a pre-trained Hugging Face transformer model on a text classification task.

import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset

# Load dataset (using IMDB for sentiment analysis as an example)
dataset = load_dataset('imdb')

# Load pre-trained tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

# Tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Prepare data for training and evaluation
train_dataset = tokenized_datasets['train'].shuffle(seed=42).select(range(2000))  # Subset for quick training
test_dataset = tokenized_datasets['test'].shuffle(seed=42).select(range(500))  # Subset for quick evaluation

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)

# Save the model
model.save_pretrained('./sentiment-model')
tokenizer.save_pretrained('./sentiment-model')
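
As a follow-up sketch (assuming the './sentiment-model' directory saved above), the fine-tuned model can be reloaded and applied to a single review; the example sentence is made up for illustration:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Reload the fine-tuned tokenizer and model from the saved directory
tokenizer = AutoTokenizer.from_pretrained('./sentiment-model')
model = AutoModelForSequenceClassification.from_pretrained('./sentiment-model')
model.eval()

# Classify one example review (for IMDB: label 0 = negative, 1 = positive)
text = "A surprisingly touching film with great performances."
inputs = tokenizer(text, return_tensors='pt', truncation=True)
with torch.no_grad():
    logits = model(**inputs).logits
prediction = logits.argmax(dim=-1).item()
print("positive" if prediction == 1 else "negative")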


How is ChatGPT trained?

ChatGPT is trained in two main stages: large-scale pretraining with supervised learning, followed by fine-tuning using Reinforcement Learning from Human Feedback (RLHF). Here's a breakdown of the process:

1. Pretraining (Supervised Learning)

  • Objective: The model learns to predict the next word in a sentence given the preceding words.

  • Data: A large and diverse dataset composed of text from books, websites, and other written material from the internet.

  • Process:

    • The model, built on the transformer architecture, is trained on a massive corpus of text. It learns language patterns, grammar, facts, and reasoning abilities by predicting the next word in a sentence (a toy sketch of this objective is shown after this list).

    • This phase results in a general understanding of language and various topics but does not specifically fine-tune the model for conversational abilities.
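
To make the next-word objective concrete, here is a toy sketch of the next-token prediction loss. The model below is a deliberately tiny placeholder, not the real transformer, and the sizes are arbitrary:

import torch
import torch.nn as nn
import torch.nn.functional as F

# Placeholder stand-in for the real language model: embedding + linear head,
# just enough to produce one vocabulary distribution per position.
vocab_size = 1000
language_model = nn.Sequential(nn.Embedding(vocab_size, 64), nn.Linear(64, vocab_size))

tokens = torch.randint(0, vocab_size, (8, 128))  # batch of 8 sequences, 128 token ids each
logits = language_model(tokens)                  # (8, 128, vocab_size)
loss = F.cross_entropy(
    logits[:, :-1].reshape(-1, vocab_size),      # predictions at positions 0..126
    tokens[:, 1:].reshape(-1),                   # targets: the token that actually comes next
)
loss.backward()                                  # gradients for one optimization step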

2. Fine-Tuning (Reinforcement Learning from Human Feedback - RLHF)

  • Objective: Improve the model's performance in generating helpful, accurate, and aligned responses.

  • Data:

    • Human trainers create demonstration data, where they engage in conversations and provide examples of good responses.

    • Additionally, human labelers rank different model outputs from a set of prompts.

  • Process:

    • Supervised Fine-Tuning: The model is first fine-tuned on the demonstration data.

    • Reward Model Training: A separate model is trained to predict human preferences based on the rankings (a simplified loss sketch follows after this list).

    • Reinforcement Learning: Using the reward model, the original language model is further fine-tuned to generate responses that maximize the predicted reward, improving response quality and alignment with human preferences.
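
As an illustration of the reward-model step, here is a simplified sketch of the pairwise ranking loss commonly used in RLHF. The reward model below is a toy placeholder (mean-pooled embedding plus a linear score head), and the token ids are random stand-ins for tokenized prompt+response pairs:

import torch
import torch.nn as nn
import torch.nn.functional as F

# Toy reward model: maps a token sequence to a single scalar score.
vocab_size = 1000
embed = nn.Embedding(vocab_size, 64)
score_head = nn.Linear(64, 1)

def reward(token_ids):  # token_ids: (batch, seq_len)
    return score_head(embed(token_ids).mean(dim=1)).squeeze(-1)  # (batch,) scores

# For each prompt, the human-preferred response should score higher than the
# rejected one; the loss pushes the score gap in that direction.
chosen_ids = torch.randint(0, vocab_size, (4, 32))    # preferred prompt+response sequences
rejected_ids = torch.randint(0, vocab_size, (4, 32))  # less-preferred sequences
loss = -F.logsigmoid(reward(chosen_ids) - reward(rejected_ids)).mean()
loss.backward()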

Iterative Improvement

OpenAI iteratively improves the model by:

  • Collecting more human feedback.

  • Refining the reward model.

  • Repeating the reinforcement learning steps to enhance performance and alignment.

This training methodology allows ChatGPT to generate coherent, contextually appropriate, and helpful responses in conversations.
