
GloVe Word Vectors


Overview

Methods based on the Neural Network Language Model or Word2Vec both use co-occurrence information as the signal for self-supervised learning. Besides these approaches, another way to estimate word vectors is matrix decomposition:
First, run a statistical pass over the corpus to obtain a word-context matrix that holds global co-occurrence statistics.
Second, apply Singular Value Decomposition (SVD) to this matrix to obtain low-dimensional representations of the words.
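
As a quick illustration of this decomposition route (separate from GloVe itself), the sketch below builds a small word-context count matrix from a hypothetical two-sentence corpus and reduces it with truncated SVD; the toy corpus and the window size are made-up example values, not part of the original post.

import numpy as np

# hypothetical toy corpus of tokenized sentences
corpus = [["i", "like", "nlp"], ["i", "like", "deep", "learning"]]
vocab = sorted({w for sent in corpus for w in sent})
w2i = {w: i for i, w in enumerate(vocab)}

# word-context co-occurrence counts with a symmetric window of size 1
M = np.zeros((len(vocab), len(vocab)))
for sent in corpus:
    for i, w in enumerate(sent):
        for j in range(max(0, i - 1), min(len(sent), i + 2)):
            if j != i:
                M[w2i[w], w2i[sent[j]]] += 1

# truncated SVD: keep the top-k components as low-dimensional word vectors
U, S, Vt = np.linalg.svd(M)
k = 2
word_vectors = U[:, :k] * S[:k]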

The method covered in this post, Global Vectors for Word Representation (GloVe), is likewise built on such global co-occurrence statistics.

Pretraining Task

The core idea of GloVe is to predict entries of the word-context co-occurrence matrix. Formally, $M_{w, c} = \sum_{i}\frac{1}{d_i(w, c)}$, where $d_i(w, c)$ is the distance between word $w$ and context word $c$ at their $i$-th co-occurrence. After computing $M$, word vectors and context vectors are estimated from the relation $v_w^\top v_c + b_w + b_c = \log M_{w, c}$, where $v_w$ is the word vector, $v_c$ is the context vector, and $b_w$, $b_c$ are the corresponding biases. Fitting this relation over all observed word-context pairs yields the vector representations of words and contexts.
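
In practice the relation above is fit by weighted least squares rather than solved exactly. Writing the objective out explicitly (this $f$ is the same weighting that appears as weight_factor in the training code below, with $x_{\max}$ corresponding to m_max):

$$\mathcal{L} = \sum_{(w, c)} f(M_{w, c})\left(v_w^\top v_c + b_w + b_c - \log M_{w, c}\right)^2, \qquad f(x) = \min\!\left(\left(\frac{x}{x_{\max}}\right)^{\alpha},\, 1\right)$$

Frequent pairs are capped at weight 1 and rare pairs are down-weighted, so noisy low-count entries do not dominate the loss.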

Code

Dataset

import torch
from torch.utils.data import Dataset
from collections import defaultdict
from tqdm import tqdm

# BOS_TOKEN and EOS_TOKEN are the special sentence-boundary tokens defined in
# the accompanying utility code
class GloveDataset(Dataset):
    def __init__(self, corpus, vocab, context_size=2):
        # global word-context co-occurrence counts, weighted by 1/distance
        self.cooccur_counts = defaultdict(float)
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]
        for sentence in tqdm(corpus, desc="Dataset Construction"):
            sentence = [self.bos] + sentence + [self.eos]
            for i in range(1, len(sentence) - 1):
                w = sentence[i]
                left_contexts = sentence[max(0, i - context_size):i]
                right_contexts = sentence[i + 1:min(len(sentence), i + context_size) + 1]
                # accumulate co-occurrence weight 1/d, where d is the distance
                # between the center word and the context word
                for k, c in enumerate(left_contexts[::-1]):
                    self.cooccur_counts[(w, c)] += 1 / (k + 1)
                for k, c in enumerate(right_contexts):
                    self.cooccur_counts[(w, c)] += 1 / (k + 1)
        self.data = [(w, c, count) for (w, c), count in self.cooccur_counts.items()]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]

    def collate_fn(self, examples):
        words = torch.tensor([ex[0] for ex in examples])
        contexts = torch.tensor([ex[1] for ex in examples])
        counts = torch.tensor([ex[2] for ex in examples])
        return (words, contexts, counts)
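
A brief usage sketch of the dataset class with a standard PyTorch DataLoader (the toy corpus, the dict-style vocab, and the special-token strings here are illustrative assumptions, not part of the original code):

from torch.utils.data import DataLoader

# hypothetical vocab and corpus already mapped to word ids
BOS_TOKEN, EOS_TOKEN = "<bos>", "<eos>"
vocab = {"<bos>": 0, "<eos>": 1, "i": 2, "like": 3, "nlp": 4}
corpus = [[2, 3, 4], [2, 3, 3, 4]]

dataset = GloveDataset(corpus, vocab, context_size=2)
loader = DataLoader(dataset, batch_size=4, collate_fn=dataset.collate_fn, shuffle=True)
words, contexts, counts = next(iter(loader))   # three 1-D tensors of equal length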

Model

import torch.nn as nn

class GloveModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(GloveModel, self).__init__()
        # word vectors and word biases
        self.w_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.w_biases = nn.Embedding(vocab_size, 1)
        # context vectors and context biases
        self.c_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.c_biases = nn.Embedding(vocab_size, 1)

    def forward_w(self, words):
        w_embeds = self.w_embeddings(words)
        w_biases = self.w_biases(words)
        return w_embeds, w_biases

    def forward_c(self, contexts):
        c_embeds = self.c_embeddings(contexts)
        c_biases = self.c_biases(contexts)
        return c_embeds, c_biases
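
A quick shape check with made-up sizes (not from the original post): the biases come back with a trailing dimension of 1, which is why they are squeezed before being added to the per-pair dot products in the training loop below.

model = GloveModel(vocab_size=5, embedding_dim=8)
words = torch.tensor([2, 3, 4])
w_embeds, w_biases = model.forward_w(words)
print(w_embeds.shape, w_biases.shape)   # torch.Size([3, 8]) torch.Size([3, 1])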

Training

from torch import optim

# load_reuters, get_loader, save_pretrained come from the accompanying utility code;
# the hyperparameter values below are example settings
embedding_dim = 64
context_size = 2
batch_size = 1024
num_epoch = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# weighting-function hyperparameters f(x) = min((x / m_max)^alpha, 1)
m_max = 100
alpha = 0.75

corpus, vocab = load_reuters()
dataset = GloveDataset(corpus, vocab, context_size=context_size)
data_loader = get_loader(dataset, batch_size)

model = GloveModel(len(vocab), embedding_dim)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        words, contexts, counts = [x.to(device) for x in batch]
        word_embeds, word_biases = model.forward_w(words)
        context_embeds, context_biases = model.forward_c(contexts)
        log_counts = torch.log(counts)
        weight_factor = torch.clamp(torch.pow(counts / m_max, alpha), max=1.0)
        optimizer.zero_grad()
        # squared regression error for each (w, c) pair; biases are squeezed
        # from shape (batch, 1) to (batch,) so the sum stays per-pair
        loss = (torch.sum(word_embeds * context_embeds, dim=1)
                + word_biases.squeeze(1) + context_biases.squeeze(1)
                - log_counts) ** 2
        wavg_loss = (weight_factor * loss).mean()
        wavg_loss.backward()
        optimizer.step()
        total_loss += wavg_loss.item()
    print(f"Loss: {total_loss:.2f}")

# use word vector + context vector as the final representation
combined_embeds = model.w_embeddings.weight + model.c_embeddings.weight
save_pretrained(vocab, combined_embeds.data, "glovec.vec")
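
Once training has finished, the combined matrix can be probed directly; below is a minimal sketch of a nearest-neighbour query by cosine similarity (query_id is a hypothetical word id; the token-to-id lookup depends on the vocab utilities and is omitted here).

# normalize rows, then take the top-k most similar word ids
embeds = combined_embeds.data
norm_embeds = embeds / embeds.norm(dim=1, keepdim=True)
query_id = 10                                         # hypothetical word id
sims = norm_embeds @ norm_embeds[query_id]
neighbours = torch.topk(sims, k=6).indices.tolist()   # includes the query itself
print(neighbours)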