Knowledge Graph Completion Tutorial

Introduction

In this tutorial demo, we will use the Graph4NLP library to build a GNN-based knowledge graph completion model. The model consists of

  • graph embedding module (e.g., GGNN)

  • prediction module (e.g., DistMult decoder)

We will use the Graph4NLP APIs to build the model, and evaluate it on the Kinship dataset. The full example can be downloaded from the knowledge graph completion notebook.

Environment setup

Create virtual environment

  • conda create --name g4l python=3.7

  • conda activate g4l

Install graph4nlp library via pip

Ensure that at least PyTorch (>=1.6.0) is installed:

$ python -c "import torch; print(torch.__version__)"
>>> 1.6.0

Find the CUDA version PyTorch was installed with (for GPU users):

$ python -c "import torch; print(torch.version.cuda)"
>>> 10.2

Install the relevant dependencies:

torchtext is needed since Graph4NLP relies on it to implement embeddings. Please pay attention to the PyTorch requirements before installing torchtext with the following script! For detailed version matching please refer here.

pip install torchtext # >=0.7.0

Install Graph4NLP

pip install graph4nlp${CUDA}

where ${CUDA} should be replaced by the specific CUDA version (none (CPU version), "-cu92", "-cu101", "-cu102", "-cu110"). The following table shows the concrete command lines. For CUDA 11.1 users, please refer to Installation via source code.

Platform

Command

CPU

pip install graph4nlp

CUDA 9.2

pip install graph4nlp-cu92

CUDA 10.1

pip install graph4nlp-cu101

CUDA 10.2

pip install graph4nlp-cu102

CUDA 11.0

pip install graph4nlp-cu110

Installation for KGC

  • Run the preprocessing script for WN18RR and Kinship: sh kg_completion/preprocess.sh

  • You can now run the model

Import packages

import argparse
import os
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
from torch.utils.data import DataLoader

from graph4nlp.pytorch.datasets.kinship import KinshipDataset
from graph4nlp.pytorch.modules.utils.config_utils import get_yaml_config

from model import Complex, ConvE, Distmult, GCNComplex, GCNDistMult, GGNNDistMult

# Print NumPy floats with three digits of precision.
np.set_printoptions(precision=3)
# Enable cuDNN's benchmark mode (auto-tuning of its algorithms).
cudnn.benchmark = True

Build Model

class KGC(nn.Module):
    """Wrapper module that builds one knowledge graph completion model by name.

    ``cfg["model"]`` selects the wrapped model (``"conve"``, ``"distmult"``,
    ``"complex"``, ``"ggnn_distmult"``, ``"gcn_distmult"`` or
    ``"gcn_complex"``); ``None`` falls back to ConvE. All scoring, loss and
    initialization calls are forwarded to the wrapped model.

    Args:
        cfg: Configuration mapping. It is expanded into an
            ``argparse.Namespace`` for the wrapped model; this class itself
            reads the ``"model"`` and ``"cuda"`` keys.
        num_entities: Passed to the wrapped model's constructor.
        num_relations: Passed to the wrapped model's constructor.

    Raises:
        ValueError: If ``cfg["model"]`` is not one of the known model names.
    """

    def __init__(self, cfg, num_entities, num_relations):
        super(KGC, self).__init__()
        self.cfg = cfg
        self.num_entities = num_entities
        self.num_relations = num_relations

        model_classes = {
            "conve": ConvE,
            "distmult": Distmult,
            "complex": Complex,
            "ggnn_distmult": GGNNDistMult,
            "gcn_distmult": GCNDistMult,
            "gcn_complex": GCNComplex,
        }
        # A missing model name defaults to ConvE.
        model_name = "conve" if cfg["model"] is None else cfg["model"]
        model_cls = model_classes.get(model_name)
        if model_cls is None:
            raise ValueError("Unknown model type!")

        self.model = model_cls(argparse.Namespace(**cfg), num_entities, num_relations)

    def init(self):
        """Delegate parameter initialization to the wrapped model."""
        return self.model.init()

    def forward(self, e1_tensor, rel_tensor, KG_graph):
        """Return the wrapped model's output for the given entities, relations and graph."""
        return self.model(e1_tensor, rel_tensor, KG_graph)

    def loss(self, pred, e2_multi):
        """Return the wrapped model's loss for ``pred`` against ``e2_multi``."""
        return self.model.loss(pred, e2_multi)

    def inference_forward(self, collate_data, KG_graph):
        """Score one collated batch, moving its tensors to CUDA when ``cfg["cuda"]`` is set.

        Args:
            collate_data: Mapping with ``"e1_tensor"`` and ``"rel_tensor"`` entries.
            KG_graph: Graph object forwarded unchanged to the wrapped model.
        """
        e1_tensor = collate_data["e1_tensor"]
        rel_tensor = collate_data["rel_tensor"]
        if self.cfg["cuda"]:
            e1_tensor = e1_tensor.to("cuda")
            rel_tensor = rel_tensor.to("cuda")
        return self.model(e1_tensor, rel_tensor, KG_graph)

    def post_process(self, logits, e2=None):
        """Return the index of the top-scoring column of ``logits``.

        Args:
            logits: 2-D score tensor. It must contain exactly one row, because
                the result is extracted with ``.item()``.
            e2: Optional tensor whose ``[0, 0]`` element is the ground-truth
                index. When given, its 1-based rank is printed.

        Returns:
            The highest-scoring column index as a Python int.
        """
        _, argsort1 = torch.sort(logits, 1, descending=True)

        # The ground truth is optional: only report its rank when it is known.
        if e2 is not None:
            rank1 = np.where(argsort1.cpu().numpy()[0] == e2[0, 0].item())[0][0]
            print("ground truth e2 rank = {}".format(rank1 + 1))

        return argsort1[:, 0].item()

Define Evaluation for KG Completion

In this part, we follow the implementation of ConvE.

def ranking_and_hits_this(cfg, model, dev_rank_batcher, vocab, name, kg_graph=None):
    """Evaluate ``model`` with filtered ranking metrics and print the results.

    Each example is scored in two directions: ``(e1, rel)`` is ranked against
    the target ``e2``, and ``(e2, rel_reverse)`` against the target ``e1``.
    Scores of all other known answers are zeroed before ranking (the filtered
    setting). Hits@1..10, mean rank and mean reciprocal rank are printed for
    the first direction ("left"), the second ("right") and both combined.

    Args:
        cfg: Configuration mapping; only ``cfg["cuda"]`` is read, to decide
            whether batch tensors are moved to CUDA.
        model: Callable invoked as ``model(entities, relations, kg_graph)``
            that returns a 2-D score tensor with one row per example.
        dev_rank_batcher: Iterable of batches, each a mapping with the keys
            ``"e1_tensor"``, ``"e2_tensor"``, ``"rel_tensor"``,
            ``"rel_eval_tensor"``, ``"e2_multi1"`` and ``"e2_multi2"``.
        vocab: Unused; kept for interface compatibility.
        name: Title printed above the metrics.
        kg_graph: Graph object forwarded unchanged to ``model``.

    Returns:
        The mean reciprocal rank over both directions.
    """
    print("")
    print("-" * 50)
    print(name)
    print("-" * 50)
    print("")

    num_levels = 10  # Hits@1 .. Hits@10
    hits_left = [[] for _ in range(num_levels)]
    hits_right = [[] for _ in range(num_levels)]
    hits = [[] for _ in range(num_levels)]
    ranks = []
    ranks_left = []
    ranks_right = []

    for str2var in dev_rank_batcher:
        e1 = str2var["e1_tensor"]
        e2 = str2var["e2_tensor"]
        rel = str2var["rel_tensor"]
        rel_reverse = str2var["rel_eval_tensor"]
        e2_multi1 = str2var["e2_multi1"].float()
        e2_multi2 = str2var["e2_multi2"].float()
        if cfg["cuda"]:
            e1 = e1.to("cuda")
            e2 = e2.to("cuda")
            rel = rel.to("cuda")
            rel_reverse = rel_reverse.to("cuda")
            e2_multi1 = e2_multi1.to("cuda")
            e2_multi2 = e2_multi2.to("cuda")

        # Evaluation never needs gradients; disabling autograd here avoids
        # building graphs even when the caller forgot to do so.
        with torch.no_grad():
            pred1 = model(e1, rel, kg_graph)
            pred2 = model(e2, rel_reverse, kg_graph)
        pred1, pred2 = pred1.data, pred2.data

        batch_size = e1.shape[0]
        for row in range(batch_size):
            # these filters contain ALL labels
            filter1 = e2_multi1[row].long()
            filter2 = e2_multi2[row].long()

            # save the prediction that is relevant
            target_value1 = pred1[row, e2[row, 0].item()].item()
            target_value2 = pred2[row, e1[row, 0].item()].item()
            # zero all known cases (these are not interesting);
            # this corresponds to the filtered setting
            pred1[row][filter1] = 0.0
            pred2[row][filter2] = 0.0
            # write back the saved values
            pred1[row][e2[row]] = target_value1
            pred2[row][e1[row]] = target_value2

        # sort and rank
        _, argsort1 = torch.sort(pred1, 1, descending=True)
        _, argsort2 = torch.sort(pred2, 1, descending=True)

        argsort1 = argsort1.cpu().numpy()
        argsort2 = argsort2.cpu().numpy()
        for row in range(batch_size):
            # find the (0-based) rank of the target entities
            rank1 = np.where(argsort1[row] == e2[row, 0].item())[0][0]
            rank2 = np.where(argsort2[row] == e1[row, 0].item())[0][0]
            # rank+1, since the lowest rank is rank 1 not rank 0
            ranks.append(rank1 + 1)
            ranks_left.append(rank1 + 1)
            ranks.append(rank2 + 1)
            ranks_right.append(rank2 + 1)

            for hits_level in range(num_levels):
                # A 0-based rank <= k means the target is within the top k+1.
                hit_left = 1.0 if rank1 <= hits_level else 0.0
                hit_right = 1.0 if rank2 <= hits_level else 0.0
                hits[hits_level].append(hit_left)
                hits_left[hits_level].append(hit_left)
                hits[hits_level].append(hit_right)
                hits_right[hits_level].append(hit_right)

    for level in range(num_levels):
        print("Hits left @{0}: {1}".format(level + 1, np.mean(hits_left[level])))
        print("Hits right @{0}: {1}".format(level + 1, np.mean(hits_right[level])))
        print("Hits @{0}: {1}".format(level + 1, np.mean(hits[level])))
    print("Mean rank left: {0}".format(np.mean(ranks_left)))
    print("Mean rank right: {0}".format(np.mean(ranks_right)))
    print("Mean rank: {0}".format(np.mean(ranks)))
    print("Mean reciprocal rank left: {0}".format(np.mean(1.0 / np.array(ranks_left))))
    print("Mean reciprocal rank right: {0}".format(np.mean(1.0 / np.array(ranks_right))))
    print("Mean reciprocal rank: {0}".format(np.mean(1.0 / np.array(ranks))))

    return np.mean(1.0 / np.array(ranks))

Define Main()

Next, let’s build a main() function that sets up the dataset, the dataloaders, the whole KG, the model, the optimizer and the evaluation metrics, and then runs the train/val/test loops.

In particular, users need to set the preprocess field in the config file to True the first time they run the code, so that the whole KG is built.

Users can set the resume field in the config file to True to load a pre-trained model.

def main(cfg, model_path):
    """Build the data pipeline, the whole KG and the model, then train and evaluate.

    Steps:
      1. Create the Kinship dataset and train/val/test dataloaders.
      2. Build the whole knowledge graph from the training batches and save it
         (``cfg["preprocess"]`` truthy), or load the previously saved graph.
      3. Create the ``KGC`` model. With ``cfg["resume"]`` truthy, load its
         weights from ``model_path`` and evaluate on test and val; otherwise
         call ``model.init()``.
      4. Train for ``cfg["epochs"]`` epochs. On every even epoch after the
         first, evaluate on val (saving the model when the MRR improves) and
         on test.

    Args:
        cfg: Configuration mapping. Keys read here: ``dataset``,
            ``batch_size``, ``loader_threads``, ``preprocess``, ``cuda``,
            ``resume``, ``lr``, ``l2``, ``epochs`` and ``label_smoothing``.
            The mapping is also passed to ``KGC``.
        model_path: File the model weights are loaded from (resume) and the
            best weights are saved to.
    """
    dataset = KinshipDataset(
        root_dir="examples/pytorch/kg_completion/data/{}".format(cfg["dataset"]),
        topology_subdir="kgc",
    )

    train_dataloader = DataLoader(
        dataset.train,
        batch_size=cfg["batch_size"],
        shuffle=True,
        num_workers=cfg["loader_threads"],
        collate_fn=dataset.collate_fn,
    )
    val_dataloader = DataLoader(
        dataset.val,
        batch_size=cfg["batch_size"],
        shuffle=False,
        num_workers=cfg["loader_threads"],
        collate_fn=dataset.collate_fn,
    )
    test_dataloader = DataLoader(
        dataset.test,
        batch_size=cfg["batch_size"],
        shuffle=False,
        num_workers=cfg["loader_threads"],
        collate_fn=dataset.collate_fn,
    )

    num_entities = len(dataset.vocab_model.in_word_vocab)
    num_relations = len(dataset.vocab_model.out_word_vocab)

    # The same file is written by the preprocessing branch and read otherwise.
    graph_path = "examples/pytorch/kg_completion/data/{}/processed/kgc/KG_graph.pt".format(
        cfg["dataset"]
    )

    if cfg["preprocess"]:
        from graph4nlp.pytorch.data.data import GraphData

        # Collect (e1, rel, e2) triples from every training batch.
        data = []
        rows = []
        columns = []
        for i, str2var in enumerate(train_dataloader):
            print("batch number:", i)
            for j in range(str2var["e1"].shape[0]):
                for k in range(str2var["e2_multi1"][j].shape[0]):
                    # A zero entry ends the row's list of targets
                    # (presumably padding -- TODO confirm against the dataset).
                    if str2var["e2_multi1"][j][k] != 0:
                        data.append(str2var["rel"][j].tolist()[0])
                        rows.append(str2var["e1"][j].tolist()[0])
                        columns.append(str2var["e2_multi1"][j][k].tolist())
                    else:
                        break

        KG_graph = GraphData()
        KG_graph.add_nodes(num_entities)
        for e1, rel, e2 in zip(rows, data, columns):
            KG_graph.add_edge(e1, e2)
            eid = KG_graph.edge_ids(e1, e2)[0]
            KG_graph.edge_attributes[eid]["token"] = rel

        torch.save(KG_graph, graph_path)
    else:
        KG_graph = torch.load(graph_path)

    # Resolve the device once so the graph, the model and the batches always
    # agree on it.
    use_cuda = bool(cfg["cuda"])
    device = "cuda" if use_cuda else "cpu"
    KG_graph = KG_graph.to(device)

    model = KGC(cfg, num_entities, num_relations)

    if use_cuda:
        model.to("cuda")

    if cfg["resume"]:
        # map_location lets a checkpoint saved on GPU be loaded on a CPU-only run.
        model_params = torch.load(model_path, map_location=device)
        print(model)
        total_param_size = []
        params = [(key, value.size(), value.numel()) for key, value in model_params.items()]
        for key, size, count in params:
            total_param_size.append(count)
            print(key, size, count)
        print(np.array(total_param_size).sum())
        model.load_state_dict(model_params)
        model.eval()
        with torch.no_grad():
            ranking_and_hits_this(
                cfg, model, test_dataloader, dataset.vocab_model, "test_evaluation", kg_graph=KG_graph
            )
            ranking_and_hits_this(
                cfg, model, val_dataloader, dataset.vocab_model, "dev_evaluation", kg_graph=KG_graph
            )
    else:
        model.init()

    best_mrr = 0

    # NOTE: training also runs after resuming, continuing from the loaded weights.
    opt = torch.optim.Adam(model.parameters(), lr=cfg["lr"], weight_decay=cfg["l2"])
    for epoch in range(cfg["epochs"]):
        model.train()
        for str2var in train_dataloader:
            opt.zero_grad()
            e1_tensor = str2var["e1_tensor"]
            rel_tensor = str2var["rel_tensor"]
            e2_multi = str2var["e2_multi1_binary"].float()
            if use_cuda:
                e1_tensor = e1_tensor.to("cuda")
                rel_tensor = rel_tensor.to("cuda")
                e2_multi = e2_multi.to("cuda")
            # label smoothing
            # NOTE(review): this adds 1/N rather than label_smoothing/N; kept
            # unchanged so that results stay the same -- confirm the intent.
            e2_multi = ((1.0 - cfg["label_smoothing"]) * e2_multi) + (1.0 / e2_multi.size(1))

            pred = model(e1_tensor, rel_tensor, KG_graph)
            loss = model.loss(pred, e2_multi)
            loss.backward()
            opt.step()

        model.eval()
        # Evaluate on every even epoch except the very first one.
        if epoch % 2 == 0 and epoch > 0:
            with torch.no_grad():
                dev_mrr = ranking_and_hits_this(
                    cfg,
                    model,
                    val_dataloader,
                    dataset.vocab_model,
                    "dev_evaluation",
                    kg_graph=KG_graph,
                )
                if dev_mrr > best_mrr:
                    best_mrr = dev_mrr
                    print("saving best model to {0}".format(model_path))
                    # Create the target directory so the first save cannot fail.
                    save_dir = os.path.dirname(model_path)
                    if save_dir:
                        os.makedirs(save_dir, exist_ok=True)
                    torch.save(model.state_dict(), model_path)
                ranking_and_hits_this(
                    cfg,
                    model,
                    test_dataloader,
                    dataset.vocab_model,
                    "test_evaluation",
                    kg_graph=KG_graph,
                )

Run the model

# get_args() is not defined in this tutorial; it is expected to return a
# mapping whose "task_config" entry is the path of the YAML task config.
cfg = get_args()
task_args = get_yaml_config(cfg["task_config"])

# Use the GPU only when one is available, so the tutorial also runs with the
# CPU-only installation described above.
task_args["cuda"] = torch.cuda.is_available()

# Saved-model file name: <dataset>_<model>_<direction_option>_<input_drop>_<hidden_drop>.model
model_name = "{2}_{3}_{0}_{1}".format(
    task_args["input_drop"], task_args["hidden_drop"], task_args["model"], task_args["direction_option"]
)
model_path = "examples/pytorch/kg_completion/saved_models/{0}_{1}.model".format(
    task_args["dataset"], model_name
)

# Seed PyTorch's random number generator before building and training the model.
torch.manual_seed(task_args["seed"])
main(task_args, model_path)

Results on kinship

BCELoss+GGNNDistmult

Metrics

uni

bi_fuse

bi_sep

Hits @1

40.4

39.4

38.2

Hits @10

88.3

88.8

88.9

MRR

54.9

54.8

53.4