I'm currently following this tutorial to build a basic LLM that generates Shakespeare-like text (the full transformer code is at the end). I've gotten to the end, but when I train it and sample some output, the output just keeps repeating the same thing. Here is my code:
import tiktoken
import torch
import torch.nn as nn
from torch.nn import functional as F
from functions.encode import encode_chars
from functions.character_amount import character_amount
from functions.train_test_split import train_test_split
from functions.decoding import decoding
with open(r'example_shakespeare_text.txt') as file:
    file = file.read()
split = file.split('\n')
max_iters = 25
num_embed = 64
num_heads = 16
num_layers = 8
batch_size = 32
block_size = 128
dropout = 0.2
learning_rate = 1e-3
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
encode = tiktoken.get_encoding('gpt2')
characters = character_amount(encode=encode, split=split)
vocab_size = encode.n_vocab
encoded = encode_chars(split=split, encode=encode)
data = torch.tensor(encoded, dtype=torch.long)
train_data, test_data = train_test_split(data=data)
def array_creation(split):
    # sample a random batch of contiguous blocks and their shifted-by-one targets
    if split == 'train':
        data = train_data
    else:
        data = test_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])
    x = x.to(device)
    y = y.to(device)
    return x, y
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(num_embed, head_size, bias=False)
        self.query = nn.Linear(num_embed, head_size, bias=False)
        self.value = nn.Linear(num_embed, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        # project the input with the layers defined in __init__ so their weights are trained
        k = self.key(x)
        q = self.query(x)
        # scaled dot-product attention with a causal mask
        weight = q @ k.transpose(-2, -1) * C ** -0.5
        weight = weight.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        weight = F.softmax(weight, dim=-1)
        weight = self.dropout(weight)
        v = self.value(x)
        out = weight @ v
        return out
class MultiHead(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.prj = nn.Linear(num_embed, num_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.prj(out))
        return out
class FeedForward(nn.Module):
    def __init__(self, num_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(num_embed, 4 * num_embed),
            nn.ReLU(),
            nn.Linear(4 * num_embed, num_embed),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)
class Block(nn.Module):
    def __init__(self, num_embed, num_heads):
        super().__init__()
        head_size = num_embed // num_heads
        self.sa = MultiHead(num_heads, head_size)
        self.ffwd = FeedForward(num_embed)
        self.layernorm1 = nn.LayerNorm(num_embed)
        self.layernorm2 = nn.LayerNorm(num_embed)

    def forward(self, x):
        x = x + self.sa(self.layernorm1(x))
        x = x + self.ffwd(self.layernorm2(x))
        return x
class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, num_embed)
        self.position_embedding_table = nn.Embedding(block_size, num_embed)
        self.blocks = nn.Sequential(*[Block(num_embed, num_heads=num_heads) for _ in range(num_layers)])
        self.ln_f = nn.LayerNorm(num_embed)
        self.lm_head = nn.Linear(num_embed, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        token_emb = self.token_embedding_table(idx)
        position_embedding = self.position_embedding_table(torch.arange(T, device=device))
        x = token_emb + position_embedding
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None
        return logits, loss
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            # crop the context to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            # sample the next token from the distribution over the final position
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
m = BigramLanguageModel()
model = m.to(device)
generated_list = model.generate(idx=torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=100)[0].tolist()
decoded_list = decoding(generated_list=generated_list, encode=encode)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
iteration = 0
for _ in range(max_iters):
    xy, yb = array_creation('train')
    logits, loss = model(xy, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    iteration += 1
    print(iteration)
print(loss.item())
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decoding(generated_list=model.generate(context, max_new_tokens=100)[0].tolist(), encode=encode))
Here is the output:
A', '! re al, we hear me speak.All:Speak.First Citizen:You are all resolved rather to die to than famish?A', '! re al, we hear me speak.All:Speak.First Citizen:You are all resolved rather to die to than famish?A', '! re al, we hear me speak.All:Speak.First Citizen:You are all resolved rather to die to than famish?A', '! re al, we hear me speak.All:Speak.First Citizen:You are all resolved rather to die to than famish?
It keeps repeating itself, going on even longer than what is shown here.
I've tried increasing the amount of input data, but that didn't help. I've also tried changing the number of iterations and the batch size/block size, but the output still repeats.
Do I need to train it more heavily?
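For reference, one way to tell whether heavier training would actually help is to run many more iterations and report the loss on both splits every so often. Below is a minimal sketch reusing array_creation and the model from above; estimate_loss is a helper added just for this sketch, and the iteration counts are only placeholders, not values from my actual run.

@torch.no_grad()
def estimate_loss(eval_iters=50):
    # average the loss over a few random batches from each split
    model.eval()
    out = {}
    for split_name in ('train', 'test'):
        total = 0.0
        for _ in range(eval_iters):
            xb, yb = array_creation(split_name)
            _, batch_loss = model(xb, yb)
            total += batch_loss.item()
        out[split_name] = total / eval_iters
    model.train()
    return out

longer_iters = 5000    # placeholder: many more steps than the 25 above
eval_interval = 500    # placeholder: how often to report the loss
for step in range(longer_iters):
    if step % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, test loss {losses['test']:.4f}")
    xb, yb = array_creation('train')
    _, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

If the training loss keeps falling but the sampled text still loops, that would suggest the problem isn't just under-training.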
This turned out to be a problem with my decoding function. I'm not sure exactly what was wrong with it, but I wrote my own tokenizer instead of using tiktoken, and that fixed the repetition.
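For anyone who runs into the same thing, a character-level tokenizer along the lines of the one the tutorial builds looks roughly like this (a minimal sketch; the names encode_text and decode_tokens are just illustrative, and my actual implementation may differ):

# build a character-level vocabulary straight from the training text
with open('example_shakespeare_text.txt') as f:
    text = f.read()
chars = sorted(set(text))
vocab_size = len(chars)

# lookup tables between characters and integer ids
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}

def encode_text(s):
    # string -> list of token ids
    return [stoi[c] for c in s]

def decode_tokens(ids):
    # list of token ids -> string
    return ''.join(itos[i] for i in ids)

data = torch.tensor(encode_text(text), dtype=torch.long)

With this, decoding is just a dictionary lookup, so there is no separate decoding helper to get out of sync with the encoder, and vocab_size shrinks from GPT-2's roughly 50k tokens to the number of distinct characters in the file.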