From e36134c155bd568bb647fbacb2e5845de9057d12 Mon Sep 17 00:00:00 2001 From: zhongyue <37361694+seqRep@users.noreply.github.com> Date: Tue, 9 Jun 2020 17:08:30 +0800 Subject: [PATCH 01/27] full_graph_link_predictor --- .../model_zoo/full_graph_link_predictor.py | 295 ++++++++++++++++++ 1 file changed, 295 insertions(+) create mode 100644 python/dgllife/model/model_zoo/full_graph_link_predictor.py diff --git a/python/dgllife/model/model_zoo/full_graph_link_predictor.py b/python/dgllife/model/model_zoo/full_graph_link_predictor.py new file mode 100644 index 00000000..f87ac47a --- /dev/null +++ b/python/dgllife/model/model_zoo/full_graph_link_predictor.py @@ -0,0 +1,295 @@ +import argparse +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader + +from ogb.linkproppred import DglLinkPropPredDataset, Evaluator +from logger import Logger + +from dgl.nn.pytorch import GraphConv +from dgl.nn.pytorch.conv import SAGEConv + +class GCN(nn.Module): + def __init__(self, + in_feats, + n_hidden, + out_feats, + num_layers, + dropout): + super(GCN, self).__init__() + + self.layers = nn.ModuleList() + # input layer + self.layers.append(GraphConv(in_feats, n_hidden, activation=F.relu)) + # hidden layers + for i in range(num_layers - 2): + self.layers.append(GraphConv(n_hidden, n_hidden, activation=F.relu)) + # output layer + self.layers.append(GraphConv(n_hidden, out_feats, activation=None)) + self.dropout = nn.Dropout(p=dropout) + + def reset_parameters(self): + for layer in self.layers: + layer.reset_parameters() + + def forward(self, g, x): + for i, layer in enumerate(self.layers): + if i != 0: + x = self.dropout(x) + x = layer(g, x) + return x + + +class SAGE(nn.Module): + def __init__(self, + in_feats, + n_hidden, + out_feats, + num_layers, + dropout, + activation=F.relu): + super(SAGE, self).__init__() + self.layers = nn.ModuleList() + + # input layer + self.layers.append(SAGEConv(in_feats, n_hidden, "mean", feat_drop=0., activation=activation)) + # hidden layers + for i in range(num_layers - 2): + self.layers.append(SAGEConv(n_hidden, n_hidden, "mean", feat_drop=dropout, activation=activation)) + # output layer + self.layers.append(SAGEConv(n_hidden, out_feats, "mean", feat_drop=dropout, activation=None)) # activation None + + def reset_parameters(self): + for layer in self.layers: + layer.reset_parameters() + + def forward(self, g, x): + for layer in self.layers: + x = layer(g, x) + return x + + +class LinkPredictor(nn.Module): + def __init__(self, in_channels, hidden_channels, out_channels, num_layers, + dropout): + super(LinkPredictor, self).__init__() + + self.lins = nn.ModuleList() + self.lins.append(nn.Linear(in_channels, hidden_channels)) + for _ in range(num_layers - 2): + self.lins.append(nn.Linear(hidden_channels, hidden_channels)) + self.lins.append(nn.Linear(hidden_channels, out_channels)) + + self.dropout = nn.Dropout(dropout) + + def reset_parameters(self): + for layer in self.lins: + layer.reset_parameters() + + def forward(self, x_i, x_j): + x = x_i * x_j + for lin in self.lins[:-1]: + x = lin(x) + x = F.relu(x) + x = self.dropout(x) + x = self.lins[-1](x) + return torch.sigmoid(x) + + +def train(model, predictor, g, x, splitted_edge, optimizer, batch_size): + model.train() + predictor.train() + + pos_train_edge = splitted_edge['train']['edge'].to(x.device) + + total_loss = total_samples = 0 + for perm in DataLoader( + range(pos_train_edge.size(0)), batch_size, shuffle=True): + + h = model(g, x) + + edge = pos_train_edge[perm].t() 
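+        # pos_train_edge[perm] is a (batch_size, 2) tensor of positive edges;
+        # after the transpose, edge[0] and edge[1] hold the two endpoint node
+        # ids, so h[edge[0]] and h[edge[1]] are the endpoint embeddings.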
+ pos_out = predictor(h[edge[0]], h[edge[1]]) + pos_loss = -torch.log(pos_out + 1e-15).mean() + + # Just do some trivial random sampling. + edge = torch.randint( + 0, x.size(0), edge.size(), dtype=torch.long, device=x.device) + + neg_out = predictor(h[edge[0]], h[edge[1]]) + neg_loss = -torch.log(1 - neg_out + 1e-15).mean() + + loss = pos_loss + neg_loss + optimizer.zero_grad() + loss.backward() + optimizer.step() + + num_samples = pos_out.size(0) + total_loss += loss.item() * num_samples + total_samples += num_samples + + return total_loss / total_samples + + +@torch.no_grad() +def test(model, predictor, g, x, splitted_edge, evaluator, batch_size): + model.eval() + + h = model(g, x) + + pos_train_edge = splitted_edge['train']['edge'].to(x.device) + pos_valid_edge = splitted_edge['valid']['edge'].to(x.device) + neg_valid_edge = splitted_edge['valid']['edge_neg'].to(x.device) + pos_test_edge = splitted_edge['test']['edge'].to(x.device) + neg_test_edge = splitted_edge['test']['edge_neg'].to(x.device) + + # Positive training edges + pos_train_preds = [] + for perm in DataLoader(range(pos_train_edge.size(0)), batch_size=batch_size): + edge = pos_train_edge[perm].t() + pos_train_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()] + pos_train_preds = torch.cat(pos_train_preds, dim=0) + + # Positive validation edges + pos_valid_preds = [] + for perm in DataLoader(range(pos_valid_edge.size(0)), batch_size=batch_size): + edge = pos_valid_edge[perm].t() + pos_valid_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()] + pos_valid_preds = torch.cat(pos_valid_preds, dim=0) + + # Negative validation edges + neg_valid_preds = [] + for perm in DataLoader(range(neg_valid_edge.size(0)), batch_size=batch_size): + edge = neg_valid_edge[perm].t() + neg_valid_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()] + neg_valid_preds = torch.cat(neg_valid_preds, dim=0) + + # Positive test edges + pos_test_preds = [] + for perm in DataLoader(range(pos_test_edge.size(0)), batch_size=batch_size): + edge = pos_test_edge[perm].t() + pos_test_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()] + pos_test_preds = torch.cat(pos_test_preds, dim=0) + + # Negative test edges + neg_test_preds = [] + for perm in DataLoader(range(neg_test_edge.size(0)), batch_size=batch_size): + edge = neg_test_edge[perm].t() + neg_test_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()] + neg_test_preds = torch.cat(neg_test_preds, dim=0) + + results = {} + for K in [10, 50, 100]: + evaluator.K = K + train_hits = evaluator.eval({ + 'y_pred_pos': pos_train_preds, + 'y_pred_neg': neg_valid_preds + })[f'hits@{K}'] + valid_hits = evaluator.eval({ + 'y_pred_pos': pos_valid_preds, + 'y_pred_neg': neg_valid_preds + })[f'hits@{K}'] + test_hits = evaluator.eval({ + 'y_pred_pos': pos_test_preds, + 'y_pred_neg': neg_test_preds + })[f'hits@{K}'] + + results[f'Hits@{K}'] = (train_hits, valid_hits, test_hits) + + return results + + +def main(): + parser = argparse.ArgumentParser(description='OGBL-PPA (Full-Batch)') + parser.add_argument('--device', type=int, default=0) + parser.add_argument('--log_steps', type=int, default=1) + parser.add_argument('--use_node_embedding', action='store_true') + parser.add_argument('--use_sage', action='store_true') + parser.add_argument('--num_layers', type=int, default=3) + parser.add_argument('--hidden_channels', type=int, default=256) + parser.add_argument('--dropout', type=float, default=0.0) + parser.add_argument('--batch_size', type=int, default=64 * 1024) + 
parser.add_argument('--lr', type=float, default=0.01) + parser.add_argument('--epochs', type=int, default=20) + parser.add_argument('--eval_steps', type=int, default=1) + parser.add_argument('--runs', type=int, default=10) + args = parser.parse_args() + print(args) + + device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu' + device = torch.device(device) + + dataset = DglLinkPropPredDataset(name='ogbl-ppa') + # Get DGLGraph + data = dataset[0] + data.readonly(False) + data.add_edges(data.nodes(), data.nodes()) + splitted_edge = dataset.get_edge_split() + + if args.use_node_embedding: + # Todo: prepare node embeddings using node2vec + x = data.ndata['feat'].float() + x = torch.cat([x, torch.load('embedding.pt')], dim=-1) + x = x.to(device) + else: + x = data.ndata['feat'].float().to(device) + + if args.use_sage: + model = SAGE( + x.size(-1), args.hidden_channels, args.hidden_channels, + args.num_layers, args.dropout).to(device) + else: + model = GCN( + x.size(-1), args.hidden_channels, args.hidden_channels, + args.num_layers, args.dropout).to(device) + + predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1, + args.num_layers, args.dropout).to(device) + + evaluator = Evaluator(name='ogbl-ppa') + loggers = { + 'Hits@10': Logger(args.runs, args), + 'Hits@50': Logger(args.runs, args), + 'Hits@100': Logger(args.runs, args), + } + + for run in range(args.runs): + model.reset_parameters() + predictor.reset_parameters() + optimizer = torch.optim.Adam( + list(model.parameters()) + list(predictor.parameters()), + lr=args.lr) + + for epoch in range(1, 1 + args.epochs): + loss = train(model, predictor, data, x, splitted_edge, optimizer, + args.batch_size) + + if epoch % args.eval_steps == 0: + results = test(model, predictor, data, x, splitted_edge, + evaluator, args.batch_size) + for key, result in results.items(): + loggers[key].add_result(run, result) + + if epoch % args.log_steps == 0: + for key, result in results.items(): + train_hits, valid_hits, test_hits = result + print(key) + print(f'Run: {run + 1:02d}, ' + f'Epoch: {epoch:02d}, ' + f'Loss: {loss:.4f}, ' + f'Train: {100 * train_hits:.2f}%, ' + f'Valid: {100 * valid_hits:.2f}%, ' + f'Test: {100 * test_hits:.2f}%') + + for key in loggers.keys(): + print(key) + loggers[key].print_statistics(run) + + for key in loggers.keys(): + print(key) + loggers[key].print_statistics() + + +if __name__ == "__main__": + main() From bfda28a351ea73c6c03639abffc3b19fcb693f34 Mon Sep 17 00:00:00 2001 From: YueZhong <37361694+YueZhong-bio@users.noreply.github.com> Date: Tue, 16 Jun 2020 16:59:18 +0800 Subject: [PATCH 02/27] Create gcn_link_predictor.py --- .../model/model_zoo/gcn_link_predictor.py | 151 ++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 python/dgllife/model/model_zoo/gcn_link_predictor.py diff --git a/python/dgllife/model/model_zoo/gcn_link_predictor.py b/python/dgllife/model/model_zoo/gcn_link_predictor.py new file mode 100644 index 00000000..f78a4ad7 --- /dev/null +++ b/python/dgllife/model/model_zoo/gcn_link_predictor.py @@ -0,0 +1,151 @@ +# -*- coding: utf-8 -*- +# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Link prediction with GCN model for graphs. 
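+#
+# A minimal usage sketch of the two classes below (illustrative sizes only;
+# assume g is a DGLGraph and feats holds its float node features):
+#
+#     model = GCN(in_feats=feats.size(-1), n_hidden=16, out_feats=16,
+#                 num_layers=2, dropout=0.1)
+#     predictor = GCNLinkPredictor(in_channels=16, hidden_channels=16,
+#                                  num_layers=2, dropout=0.1)
+#     h = model(g, feats)                 # updated node representations
+#     src, dst = g.edges()                # candidate links to score
+#     scores = predictor(h[src], h[dst])  # shape (num_edges, 1), values in (0, 1)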
+
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from dgl.nn.pytorch import GraphConv
+
+class GCN(nn.Module):
+    r"""GCN from `Semi-Supervised Classification with Graph Convolutional Networks
+    <https://arxiv.org/abs/1609.02907>`__
+
+    Parameters
+    ----------
+    in_feats : int
+        Number of input node features.
+    n_hidden : int
+        Number of units in the hidden layers.
+    out_feats : int
+        Number of output node features.
+    num_layers : int
+        Number of GCN layers.
+    dropout : float
+        The probability for dropout.
+        By default, no dropout is performed for input layer.
+    """
+
+    def __init__(self,
+                 in_feats,
+                 n_hidden,
+                 out_feats,
+                 num_layers,
+                 dropout):
+        super(GCN, self).__init__()
+
+        self.layers = nn.ModuleList()
+        # input layer
+        self.layers.append(GraphConv(in_feats, n_hidden, activation=F.relu))
+        # hidden layers
+        for i in range(num_layers - 2):
+            self.layers.append(GraphConv(n_hidden, n_hidden, activation=F.relu))
+        # output layer
+        self.layers.append(GraphConv(n_hidden, out_feats, activation=None))
+
+        self.dropout = nn.Dropout(p=dropout)
+
+    def reset_parameters(self):
+        # Reset the parameters of the GCN layers
+        for layer in self.layers:
+            layer.reset_parameters()
+
+    def forward(self, g, feats):
+        """Update node representations.
+
+        Parameters
+        ----------
+        g : DGLGraph
+            DGLGraph for a batch of graphs
+        feats : FloatTensor of shape (N, M1)
+            * N is the total number of nodes in the batch of graphs
+            * M1 is the input node feature size, which equals in_feats in initialization
+
+        Returns
+        -------
+        feats : FloatTensor of shape (N, M2)
+            * N is the total number of nodes in the batch of graphs
+            * M2 is the output node representation size, which equals
+              out_feats in initialization.
+        """
+        for i, layer in enumerate(self.layers):
+            if i != 0:
+                feats = self.dropout(feats)
+            feats = layer(g, feats)
+        return feats
+
+class GCNLinkPredictor(nn.Module):
+    """Link prediction with GCN model for graphs.
+
+    GCN is introduced in `Semi-Supervised Classification with Graph Convolutional Networks
+    <https://arxiv.org/abs/1609.02907>`__. This model is based on GCN and can be used
+    for link prediction on graphs.
+
+    After updating node representations, we feed the product of the two node representations
+    of the predicted edge into the Linear layers for link prediction.
+
+    Parameters
+    ----------
+    in_channels : int
+        Number of channels in the input layer, which equals
+        the output node representation size of the GCN model.
+    hidden_channels : int
+        Number of units in the hidden layers.
+    num_layers : int
+        Number of Linear layers.
+    dropout : float
+        The probability for dropout.
+        By default, no dropout is performed for out layer.
+    """
+    def __init__(self,
+                 in_channels,
+                 hidden_channels,
+                 num_layers,
+                 dropout):
+        super(GCNLinkPredictor, self).__init__()
+
+        self.lins = nn.ModuleList()
+        # input layer
+        self.lins.append(nn.Linear(in_channels, hidden_channels))
+        # hidden layers
+        for _ in range(num_layers - 2):
+            self.lins.append(nn.Linear(hidden_channels, hidden_channels))
+        # out layer
+        self.lins.append(nn.Linear(hidden_channels, 1))
+
+        self.dropout = nn.Dropout(dropout)
+
+    def reset_parameters(self):
+        # Reset the parameters of the Linear layers
+        for layer in self.lins:
+            layer.reset_parameters()
+
+    def forward(self, x_i, x_j):
+        """Link prediction.
+
+        Parameters
+        ----------
+        x_i, x_j : FloatTensor of shape (B,M2)
+            * Representation of the two nodes of the predicted edge.
+            * B is the number of predicted edges in the batch.
+            * M2 is the node feature size.
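+            * Typically rows of the updated node representations, e.g.
+              x_i = h[src] and x_j = h[dst] for candidate edges (src, dst).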
+
+        Returns
+        -------
+        lp : FloatTensor of shape (B,1)
+            * The result of link prediction after sigmoid.
+            * B is the number of predicted edges in the batch.
+        """
+        x = x_i * x_j
+        for lin in self.lins[:-1]:
+            x = lin(x)
+            x = F.relu(x)
+            x = self.dropout(x)
+        x = self.lins[-1](x)
+        lp = torch.sigmoid(x)
+        return lp

From ced4414186638d80049f6f90fba3a0bf60a85a32 Mon Sep 17 00:00:00 2001
From: YueZhong <37361694+YueZhong-bio@users.noreply.github.com>
Date: Tue, 16 Jun 2020 17:01:13 +0800
Subject: [PATCH 03/27] Create sage_link_predictor.py

---
 .../model/model_zoo/sage_link_predictor.py   | 144 ++++++++++++++++++
 1 file changed, 144 insertions(+)
 create mode 100644 python/dgllife/model/model_zoo/sage_link_predictor.py

diff --git a/python/dgllife/model/model_zoo/sage_link_predictor.py b/python/dgllife/model/model_zoo/sage_link_predictor.py
new file mode 100644
index 00000000..22f0eedd
--- /dev/null
+++ b/python/dgllife/model/model_zoo/sage_link_predictor.py
@@ -0,0 +1,144 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Link prediction with GraphSAGE model for graphs.
+
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from dgl.nn.pytorch.conv import SAGEConv
+
+class SAGE(nn.Module):
+    r"""GraphSAGE from `Inductive Representation Learning on Large Graphs
+    <https://arxiv.org/abs/1706.02216>`__
+
+    Parameters
+    ----------
+    in_feats : int
+        Number of input node features.
+    n_hidden : int
+        Number of units in the hidden layers.
+    out_feats : int
+        Number of output node features.
+    num_layers : int
+        Number of SAGE layers.
+    dropout : float
+        The probability for dropout.
+        By default, no dropout is performed for input layer.
+    """
+
+    def __init__(self,
+                 in_feats,
+                 n_hidden,
+                 out_feats,
+                 num_layers,
+                 dropout):
+        super(SAGE, self).__init__()
+
+        self.layers = nn.ModuleList()
+        # input layer
+        self.layers.append(SAGEConv(in_feats, n_hidden, "mean", feat_drop=0., activation=F.relu))
+        # hidden layers
+        for i in range(num_layers - 2):
+            self.layers.append(SAGEConv(n_hidden, n_hidden, "mean", feat_drop=dropout, activation=F.relu))
+        # output layer
+        self.layers.append(SAGEConv(n_hidden, out_feats, "mean", feat_drop=dropout, activation=None)) # activation None
+
+    def reset_parameters(self):
+        # Reset the parameters of the SAGE layers
+        for layer in self.layers:
+            layer.reset_parameters()
+
+    def forward(self, g, feats):
+        """Update node representations.
+
+        Parameters
+        ----------
+        g : DGLGraph
+            DGLGraph for a batch of graphs
+        feats : FloatTensor of shape (N, M1)
+            * N is the total number of nodes in the batch of graphs
+            * M1 is the input node feature size, which equals in_feats in initialization
+
+        Returns
+        -------
+        feats : FloatTensor of shape (N, M2)
+            * N is the total number of nodes in the batch of graphs
+            * M2 is the output node representation size, which equals
+              out_feats in initialization.
+        """
+        for layer in self.layers:
+            feats = layer(g, feats)
+        return feats
+
+class SAGELinkPredictor(nn.Module):
+    """Link prediction with GraphSAGE model for graphs.
+
+    GraphSAGE is introduced in `Inductive Representation Learning on Large Graphs
+    <https://arxiv.org/abs/1706.02216>`__. This model is based on GraphSAGE and can be used
+    for link prediction on graphs.
+
+    After updating node representations, we feed the product of the two node representations
+    of the predicted edge into the Linear layers for link prediction.
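+    Here "product" means the elementwise (Hadamard) product of the
+    two endpoint representations.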
+
+    Parameters
+    ----------
+    in_channels : int
+        Number of channels in the input layer, which equals
+        the output node representation size of the GraphSAGE model.
+    hidden_channels : int
+        Number of units in the hidden layers.
+    num_layers : int
+        Number of Linear layers.
+    dropout : float
+        The probability for dropout.
+        By default, no dropout is performed for out layer.
+    """
+    def __init__(self,
+                 in_channels,
+                 hidden_channels,
+                 num_layers,
+                 dropout):
+        super(SAGELinkPredictor, self).__init__()
+
+        self.lins = nn.ModuleList()
+        # input layer
+        self.lins.append(nn.Linear(in_channels, hidden_channels))
+        # hidden layers
+        for _ in range(num_layers - 2):
+            self.lins.append(nn.Linear(hidden_channels, hidden_channels))
+        # out layer
+        self.lins.append(nn.Linear(hidden_channels, 1))
+
+        self.dropout = nn.Dropout(dropout)
+
+    def reset_parameters(self):
+        # Reset the parameters of the Linear layers
+        for layer in self.lins:
+            layer.reset_parameters()
+
+    def forward(self, x_i, x_j):
+        """Link prediction.
+
+        Parameters
+        ----------
+        x_i, x_j : FloatTensor of shape (B,M2)
+            * Representation of the two nodes of the predicted edge.
+            * B is the number of predicted edges in the batch.
+            * M2 is the node feature size.
+
+        Returns
+        -------
+        lp : FloatTensor of shape (B,1)
+            * The result of link prediction after sigmoid.
+            * B is the number of predicted edges in the batch.
+        """
+        x = x_i * x_j
+        for lin in self.lins[:-1]:
+            x = lin(x)
+            x = F.relu(x)
+            x = self.dropout(x)
+        x = self.lins[-1](x)
+        lp = torch.sigmoid(x)
+        return lp

From 2a5c8adc98a540d6ac264b026b2891f421647034 Mon Sep 17 00:00:00 2001
From: YueZhong <37361694+YueZhong-bio@users.noreply.github.com>
Date: Tue, 16 Jun 2020 17:02:13 +0800
Subject: [PATCH 04/27] Create test_link_prediction.py

---
 tests/model/test_link_prediction.py | 74 +++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 tests/model/test_link_prediction.py

diff --git a/tests/model/test_link_prediction.py b/tests/model/test_link_prediction.py
new file mode 100644
index 00000000..599c324f
--- /dev/null
+++ b/tests/model/test_link_prediction.py
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from dgl import DGLGraph
+
+from dgllife.model.model_zoo.gcn_link_predictor import GCN, GCNLinkPredictor
+from dgllife.model.model_zoo.sage_link_predictor import SAGE, SAGELinkPredictor
+
+def test_graph1():
+    """Graph with node features."""
+    g = DGLGraph([(0, 1), (0, 2), (1, 2)])
+    return g, torch.arange(g.number_of_nodes()).float().reshape(-1, 1)
+
+def test_gcn_link_predictor():
+    if torch.cuda.is_available():
+        device = torch.device('cuda:0')
+    else:
+        device = torch.device('cpu')
+
+    g, node_feats = test_graph1()
+    g, node_feats = g.to(device), node_feats.to(device)
+
+    # Test configured setting
+    gcn_model = GCN(
+        in_feats=node_feats.size(-1),
+        n_hidden=2,
+        out_feats=2,
+        num_layers=2,
+        dropout=0.1).to(device)
+    gcn_model.train()
+    # One representation per node: the test graph has 3 nodes
+    h = gcn_model(g, node_feats)
+    assert h.shape == torch.Size([3, 2])
+
+    gcn_link_predictor = GCNLinkPredictor(
+        in_channels=2,
+        hidden_channels=2,
+        num_layers=2,
+        dropout=0.1).to(device)
+    # Score each of the 3 edges from the representations of its two endpoints
+    src, dst = g.edges()
+    assert gcn_link_predictor(h[src], h[dst]).shape == torch.Size([3, 1])
+
+def test_sage_link_predictor():
+    if torch.cuda.is_available():
+        device = torch.device('cuda:0')
+    else:
+        device = torch.device('cpu')
+
+    g, node_feats = test_graph1()
+    g, node_feats = g.to(device), node_feats.to(device)
+
+    # Test configured setting
+    sage_model = SAGE(
+        in_feats=node_feats.size(-1),
+        n_hidden=2,
+        out_feats=2,
+        num_layers=2,
+        dropout=0.1).to(device)
+    sage_model.train()
+    h = sage_model(g, node_feats)
+    assert h.shape == torch.Size([3, 2])
+
+    sage_link_predictor = SAGELinkPredictor(
+        in_channels=2,
+        hidden_channels=2,
+        num_layers=2,
+        dropout=0.1).to(device)
+    src, dst = g.edges()
+    assert sage_link_predictor(h[src], h[dst]).shape == torch.Size([3, 1])
+
+if __name__ == '__main__':
+    test_gcn_link_predictor()
+    test_sage_link_predictor()

From 8ebcabd26d02f42bc4dd4cc45358aefc31878ed2 Mon Sep 17 00:00:00 2001
From: YueZhong <37361694+YueZhong-bio@users.noreply.github.com>
Date: Tue, 16 Jun 2020 17:04:28 +0800
Subject: [PATCH 05/27] Create full_graph_link_predictor.py

---
 .../ogbl-ppa/full_graph_link_predictor.py    | 300 ++++++++++++++++++
 1 file changed, 300 insertions(+)
 create mode 100644 examples/link_prediction/ogbl-ppa/full_graph_link_predictor.py

diff --git a/examples/link_prediction/ogbl-ppa/full_graph_link_predictor.py b/examples/link_prediction/ogbl-ppa/full_graph_link_predictor.py
new file mode 100644
index 00000000..07b53ea2
--- /dev/null
+++ b/examples/link_prediction/ogbl-ppa/full_graph_link_predictor.py
@@ -0,0 +1,300 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0 + +import argparse +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader + +from ogb.linkproppred import DglLinkPropPredDataset, Evaluator +from logger import Logger + +from dgl.nn.pytorch import GraphConv +from dgl.nn.pytorch.conv import SAGEConv + +class GCN(nn.Module): + def __init__(self, + in_feats, + n_hidden, + out_feats, + num_layers, + dropout): + super(GCN, self).__init__() + + self.layers = nn.ModuleList() + # input layer + self.layers.append(GraphConv(in_feats, n_hidden, activation=F.relu)) + # hidden layers + for i in range(num_layers - 2): + self.layers.append(GraphConv(n_hidden, n_hidden, activation=F.relu)) + # output layer + self.layers.append(GraphConv(n_hidden, out_feats, activation=None)) + self.dropout = nn.Dropout(p=dropout) + + def reset_parameters(self): + for layer in self.layers: + layer.reset_parameters() + + def forward(self, g, x): + for i, layer in enumerate(self.layers): + if i != 0: + x = self.dropout(x) + x = layer(g, x) + return x + + +class SAGE(nn.Module): + def __init__(self, + in_feats, + n_hidden, + out_feats, + num_layers, + dropout, + activation=F.relu): + super(SAGE, self).__init__() + self.layers = nn.ModuleList() + + # input layer + self.layers.append(SAGEConv(in_feats, n_hidden, "mean", feat_drop=0., activation=activation)) + # hidden layers + for i in range(num_layers - 2): + self.layers.append(SAGEConv(n_hidden, n_hidden, "mean", feat_drop=dropout, activation=activation)) + # output layer + self.layers.append(SAGEConv(n_hidden, out_feats, "mean", feat_drop=dropout, activation=None)) # activation None + + def reset_parameters(self): + for layer in self.layers: + layer.reset_parameters() + + def forward(self, g, x): + for layer in self.layers: + x = layer(g, x) + return x + + +class LinkPredictor(nn.Module): + def __init__(self, in_channels, hidden_channels, out_channels, num_layers, + dropout): + super(LinkPredictor, self).__init__() + + self.lins = nn.ModuleList() + self.lins.append(nn.Linear(in_channels, hidden_channels)) + for _ in range(num_layers - 2): + self.lins.append(nn.Linear(hidden_channels, hidden_channels)) + self.lins.append(nn.Linear(hidden_channels, out_channels)) + + self.dropout = nn.Dropout(dropout) + + def reset_parameters(self): + for layer in self.lins: + layer.reset_parameters() + + def forward(self, x_i, x_j): + x = x_i * x_j + for lin in self.lins[:-1]: + x = lin(x) + x = F.relu(x) + x = self.dropout(x) + x = self.lins[-1](x) + return torch.sigmoid(x) + + +def train(model, predictor, g, x, splitted_edge, optimizer, batch_size): + model.train() + predictor.train() + + pos_train_edge = splitted_edge['train']['edge'].to(x.device) + + total_loss = total_samples = 0 + for perm in DataLoader( + range(pos_train_edge.size(0)), batch_size, shuffle=True): + + h = model(g, x) + + edge = pos_train_edge[perm].t() + pos_out = predictor(h[edge[0]], h[edge[1]]) + pos_loss = -torch.log(pos_out + 1e-15).mean() + + # Just do some trivial random sampling. 
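+        # Negative examples: draw endpoint ids uniformly at random, reusing
+        # the positive batch's shape so there is one negative edge per
+        # positive one. Collisions with true edges are possible but rare
+        # on a sparse graph.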
+ edge = torch.randint( + 0, x.size(0), edge.size(), dtype=torch.long, device=x.device) + + neg_out = predictor(h[edge[0]], h[edge[1]]) + neg_loss = -torch.log(1 - neg_out + 1e-15).mean() + + loss = pos_loss + neg_loss + optimizer.zero_grad() + loss.backward() + optimizer.step() + + num_samples = pos_out.size(0) + total_loss += loss.item() * num_samples + total_samples += num_samples + + return total_loss / total_samples + + +@torch.no_grad() +def test(model, predictor, g, x, splitted_edge, evaluator, batch_size): + model.eval() + + h = model(g, x) + + pos_train_edge = splitted_edge['train']['edge'].to(x.device) + pos_valid_edge = splitted_edge['valid']['edge'].to(x.device) + neg_valid_edge = splitted_edge['valid']['edge_neg'].to(x.device) + pos_test_edge = splitted_edge['test']['edge'].to(x.device) + neg_test_edge = splitted_edge['test']['edge_neg'].to(x.device) + + # Positive training edges + pos_train_preds = [] + for perm in DataLoader(range(pos_train_edge.size(0)), batch_size=batch_size): + edge = pos_train_edge[perm].t() + pos_train_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()] + pos_train_preds = torch.cat(pos_train_preds, dim=0) + + # Positive validation edges + pos_valid_preds = [] + for perm in DataLoader(range(pos_valid_edge.size(0)), batch_size=batch_size): + edge = pos_valid_edge[perm].t() + pos_valid_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()] + pos_valid_preds = torch.cat(pos_valid_preds, dim=0) + + # Negative validation edges + neg_valid_preds = [] + for perm in DataLoader(range(neg_valid_edge.size(0)), batch_size=batch_size): + edge = neg_valid_edge[perm].t() + neg_valid_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()] + neg_valid_preds = torch.cat(neg_valid_preds, dim=0) + + # Positive test edges + pos_test_preds = [] + for perm in DataLoader(range(pos_test_edge.size(0)), batch_size=batch_size): + edge = pos_test_edge[perm].t() + pos_test_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()] + pos_test_preds = torch.cat(pos_test_preds, dim=0) + + # Negative test edges + neg_test_preds = [] + for perm in DataLoader(range(neg_test_edge.size(0)), batch_size=batch_size): + edge = neg_test_edge[perm].t() + neg_test_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()] + neg_test_preds = torch.cat(neg_test_preds, dim=0) + + results = {} + for K in [10, 50, 100]: + evaluator.K = K + train_hits = evaluator.eval({ + 'y_pred_pos': pos_train_preds, + 'y_pred_neg': neg_valid_preds + })[f'hits@{K}'] + valid_hits = evaluator.eval({ + 'y_pred_pos': pos_valid_preds, + 'y_pred_neg': neg_valid_preds + })[f'hits@{K}'] + test_hits = evaluator.eval({ + 'y_pred_pos': pos_test_preds, + 'y_pred_neg': neg_test_preds + })[f'hits@{K}'] + + results[f'Hits@{K}'] = (train_hits, valid_hits, test_hits) + + return results + + +def main(): + parser = argparse.ArgumentParser(description='OGBL-PPA (Full-Batch)') + parser.add_argument('--device', type=int, default=0) + parser.add_argument('--log_steps', type=int, default=1) + parser.add_argument('--use_node_embedding', action='store_true') + parser.add_argument('--use_sage', action='store_true') + parser.add_argument('--num_layers', type=int, default=3) + parser.add_argument('--hidden_channels', type=int, default=256) + parser.add_argument('--dropout', type=float, default=0.0) + parser.add_argument('--batch_size', type=int, default=64 * 1024) + parser.add_argument('--lr', type=float, default=0.01) + parser.add_argument('--epochs', type=int, default=20) + parser.add_argument('--eval_steps', type=int, 
default=1) + parser.add_argument('--runs', type=int, default=10) + args = parser.parse_args() + print(args) + + device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu' + device = torch.device(device) + + dataset = DglLinkPropPredDataset(name='ogbl-ppa') + # Get DGLGraph + data = dataset[0] + data.readonly(False) + data.add_edges(data.nodes(), data.nodes()) + splitted_edge = dataset.get_edge_split() + + if args.use_node_embedding: + # Todo: prepare node embeddings using node2vec + x = data.ndata['feat'].float() + x = torch.cat([x, torch.load('embedding.pt')], dim=-1) + x = x.to(device) + else: + x = data.ndata['feat'].float().to(device) + + if args.use_sage: + model = SAGE( + x.size(-1), args.hidden_channels, args.hidden_channels, + args.num_layers, args.dropout).to(device) + else: + model = GCN( + x.size(-1), args.hidden_channels, args.hidden_channels, + args.num_layers, args.dropout).to(device) + + predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1, + args.num_layers, args.dropout).to(device) + + evaluator = Evaluator(name='ogbl-ppa') + loggers = { + 'Hits@10': Logger(args.runs, args), + 'Hits@50': Logger(args.runs, args), + 'Hits@100': Logger(args.runs, args), + } + + for run in range(args.runs): + model.reset_parameters() + predictor.reset_parameters() + optimizer = torch.optim.Adam( + list(model.parameters()) + list(predictor.parameters()), + lr=args.lr) + + for epoch in range(1, 1 + args.epochs): + loss = train(model, predictor, data, x, splitted_edge, optimizer, + args.batch_size) + + if epoch % args.eval_steps == 0: + results = test(model, predictor, data, x, splitted_edge, + evaluator, args.batch_size) + for key, result in results.items(): + loggers[key].add_result(run, result) + + if epoch % args.log_steps == 0: + for key, result in results.items(): + train_hits, valid_hits, test_hits = result + print(key) + print(f'Run: {run + 1:02d}, ' + f'Epoch: {epoch:02d}, ' + f'Loss: {loss:.4f}, ' + f'Train: {100 * train_hits:.2f}%, ' + f'Valid: {100 * valid_hits:.2f}%, ' + f'Test: {100 * test_hits:.2f}%') + + for key in loggers.keys(): + print(key) + loggers[key].print_statistics(run) + + for key in loggers.keys(): + print(key) + loggers[key].print_statistics() + + +if __name__ == "__main__": + main() From da93c7e47b38e2bab090a547d6d391d61e0b793b Mon Sep 17 00:00:00 2001 From: YueZhong <37361694+YueZhong-bio@users.noreply.github.com> Date: Tue, 16 Jun 2020 17:06:13 +0800 Subject: [PATCH 06/27] Create logger.py --- examples/link_prediction/ogbl-ppa/logger.py | 44 +++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 examples/link_prediction/ogbl-ppa/logger.py diff --git a/examples/link_prediction/ogbl-ppa/logger.py b/examples/link_prediction/ogbl-ppa/logger.py new file mode 100644 index 00000000..b6e617ba --- /dev/null +++ b/examples/link_prediction/ogbl-ppa/logger.py @@ -0,0 +1,44 @@ +import torch + + +class Logger(object): + def __init__(self, runs, info=None): + self.info = info + self.results = [[] for _ in range(runs)] + + def add_result(self, run, result): + assert len(result) == 3 + assert run >= 0 and run < len(self.results) + self.results[run].append(result) + + def print_statistics(self, run=None): + if run is not None: + result = 100 * torch.tensor(self.results[run]) + argmax = result[:, 1].argmax().item() + print(f'Run {run + 1:02d}:') + print(f'Highest Train: {result[:, 0].max():.2f}') + print(f'Highest Valid: {result[:, 1].max():.2f}') + print(f' Final Train: {result[argmax, 0]:.2f}') + print(f' Final Test: {result[argmax, 
2]:.2f}') + else: + result = 100 * torch.tensor(self.results) + + best_results = [] + for r in result: + train1 = r[:, 0].max().item() + valid = r[:, 1].max().item() + train2 = r[r[:, 1].argmax(), 0].item() + test = r[r[:, 1].argmax(), 2].item() + best_results.append((train1, valid, train2, test)) + + best_result = torch.tensor(best_results) + + print(f'All runs:') + r = best_result[:, 0] + print(f'Highest Train: {r.mean():.2f} ± {r.std():.2f}') + r = best_result[:, 1] + print(f'Highest Valid: {r.mean():.2f} ± {r.std():.2f}') + r = best_result[:, 2] + print(f' Final Train: {r.mean():.2f} ± {r.std():.2f}') + r = best_result[:, 3] + print(f' Final Test: {r.mean():.2f} ± {r.std():.2f}') From 6bb813c4b88251c8f90d19da3bfda83a77d8c051 Mon Sep 17 00:00:00 2001 From: YueZhong <37361694+YueZhong-bio@users.noreply.github.com> Date: Tue, 16 Jun 2020 18:12:40 +0800 Subject: [PATCH 07/27] Add files via upload --- examples/link_prediction/README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 examples/link_prediction/README.md diff --git a/examples/link_prediction/README.md b/examples/link_prediction/README.md new file mode 100644 index 00000000..a51271b9 --- /dev/null +++ b/examples/link_prediction/README.md @@ -0,0 +1,19 @@ +# Link Prediction +Link prediction is a task to estimate the probability of links between nodes in a graph. + +GNN-based link prediction typically consists of the following steps: +1. Construct graphs on biological networks +2. Prepare initial node (and edge) features for graphs +3. Use GNNs to update node representations of graphs +4. Compute the link representation from the product of its two updated nodes +5. Pass the link representations to a MLP for training and perform final link prediction + +## Datasets +- **ogbl-ppa**: is an undirected, unweighted graph. Nodes represent proteins from 58 different species, and edges indicate biologically meaningful associations between proteins, e.g., physical interactions, co-expression, homology or genomic neighborhood. Each node contains a 58-dimensional one-hot feature vector that indicates the species that the corresponding protein comes from.[1] + +## References + +[1] Hu W, Fey M, Zitnik M, et al. Open graph benchmark: Datasets for machine learning on graphs[J]. arXiv preprint arXiv:2005.00687, 2020. 
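+
+## A minimal sketch of the pipeline
+
+The steps above can be illustrated in a few lines of DGL/PyTorch. This is a
+toy sketch, not the ogbl-ppa example itself: the graph, feature sizes and
+candidate links are arbitrary placeholders.
+
+```python
+import torch
+import torch.nn.functional as F
+from dgl import DGLGraph
+from dgl.nn.pytorch import GraphConv
+
+# 1-2. A toy graph with 4 nodes, a few edges and random 8-d node features
+g = DGLGraph([(0, 1), (1, 2), (2, 3)])
+g.add_edges(g.nodes(), g.nodes())  # add self-loops, as in the ogbl-ppa example
+feats = torch.randn(g.number_of_nodes(), 8)
+
+# 3. A single graph convolution updates the node representations
+conv = GraphConv(8, 16, activation=F.relu)
+h = conv(g, feats)
+
+# 4. Link representation: elementwise (Hadamard) product of the two endpoints
+src, dst = torch.tensor([0, 1]), torch.tensor([2, 3])  # candidate links
+link_rep = h[src] * h[dst]
+
+# 5. An MLP scores each candidate; sigmoid turns scores into probabilities
+mlp = torch.nn.Sequential(
+    torch.nn.Linear(16, 16), torch.nn.ReLU(), torch.nn.Linear(16, 1))
+prob = torch.sigmoid(mlp(link_rep))  # shape (2, 1)
+```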
+ + + From 032f8757ec337cb76fdfbcdc63fbb49219c94213 Mon Sep 17 00:00:00 2001 From: YueZhong <37361694+YueZhong-bio@users.noreply.github.com> Date: Tue, 16 Jun 2020 19:27:19 +0800 Subject: [PATCH 08/27] Add files via upload --- examples/link_prediction/ogbl-ppa/README.md | 45 +++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 examples/link_prediction/ogbl-ppa/README.md diff --git a/examples/link_prediction/ogbl-ppa/README.md b/examples/link_prediction/ogbl-ppa/README.md new file mode 100644 index 00000000..6d543ec8 --- /dev/null +++ b/examples/link_prediction/ogbl-ppa/README.md @@ -0,0 +1,45 @@ +# Link Prediction for ogbl-ppa + + +## Models + +- **Graph Convolutional Networks (GCN)** [1]: Semi-Supervised Classification with Graph Convolutional Networks +- **Graph SAmple and aggreGatE (GraphSAGE)** [2]: Inductive Representation Learning on Large Graphs + +## Dependencies + +- **OGB v1.1.1** +- **DGL v0.4.3** + +## Usage + +Use `full_graph_link_predictor.py` with arguments +``` +--device, Device to use (default=0) +--log_steps, (default=1) +--use_node_embedding, Whether to use node embedding (action='store_true') +--use_sage, Whether to use GraphSAGE model (action='store_true') +--num_layers, (default=3) +--hidden_channels, (default=256) +--dropout, (default=0.0) +--batch_size, (default=64 * 1024) +--lr, Learning rate (default=0.01) +--epochs, (default=20) +--eval_steps, (default=1) +--runs, (default=1) +``` + +## Performance + +Using the default parameters, the performance of the two models on the ogbl-ppa dataset(Hits=100): + +| Method | %Training@Hits | %Validation@Hits | %Test@Hits | +| ------- | ---------------- | -------- | ------- | +| GCN | 12.87±5.07 | 12.39±4.85| 11.65±4.56 | +| GraphSAGE| 9.58±0.99| 9.44±0.96| 9.86±1.21| + +| Method | Average Time/epoch | +| ------- | -------------------------- | +| GCN | 1:23:12.86 | +| GraphSAGE| 1:28:49:46| + From b06fb19d4ef470fa92fc0a0a66942fe99852b5de Mon Sep 17 00:00:00 2001 From: YueZhong <37361694+YueZhong-bio@users.noreply.github.com> Date: Tue, 16 Jun 2020 19:32:30 +0800 Subject: [PATCH 09/27] Update README.md --- examples/link_prediction/ogbl-ppa/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/link_prediction/ogbl-ppa/README.md b/examples/link_prediction/ogbl-ppa/README.md index 6d543ec8..29977c58 100644 --- a/examples/link_prediction/ogbl-ppa/README.md +++ b/examples/link_prediction/ogbl-ppa/README.md @@ -38,6 +38,7 @@ Using the default parameters, the performance of the two models on the ogbl-ppa | GCN | 12.87±5.07 | 12.39±4.85| 11.65±4.56 | | GraphSAGE| 9.58±0.99| 9.44±0.96| 9.86±1.21| + | Method | Average Time/epoch | | ------- | -------------------------- | | GCN | 1:23:12.86 | From 79309728f8ecc0952554676bf36a01dfcc191e0d Mon Sep 17 00:00:00 2001 From: YueZhong <37361694+YueZhong-bio@users.noreply.github.com> Date: Tue, 16 Jun 2020 19:36:58 +0800 Subject: [PATCH 10/27] Update README.md --- examples/link_prediction/ogbl-ppa/README.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/link_prediction/ogbl-ppa/README.md b/examples/link_prediction/ogbl-ppa/README.md index 29977c58..2003bc75 100644 --- a/examples/link_prediction/ogbl-ppa/README.md +++ b/examples/link_prediction/ogbl-ppa/README.md @@ -3,8 +3,8 @@ ## Models -- **Graph Convolutional Networks (GCN)** [1]: Semi-Supervised Classification with Graph Convolutional Networks -- **Graph SAmple and aggreGatE (GraphSAGE)** [2]: Inductive Representation Learning on Large Graphs +- **Graph Convolutional 
Networks (GCN)** [1] +- **Graph SAmple and aggreGatE (GraphSAGE)** [2] ## Dependencies @@ -39,8 +39,13 @@ Using the default parameters, the performance of the two models on the ogbl-ppa | GraphSAGE| 9.58±0.99| 9.44±0.96| 9.86±1.21| + | Method | Average Time/epoch | | ------- | -------------------------- | | GCN | 1:23:12.86 | | GraphSAGE| 1:28:49:46| +## References + +[1] Kipf T N, Welling M. Semi-Supervised Classification with Graph Convolutional Networks[J]. 2016. +[2] Hamilton W L, Ying R, Leskovec J. Inductive Representation Learning on Large Graphs[J]. 2017. From 1d310df5bf426e1c79c0937c577b06fabadf4c71 Mon Sep 17 00:00:00 2001 From: YueZhong <37361694+YueZhong-bio@users.noreply.github.com> Date: Tue, 16 Jun 2020 19:37:59 +0800 Subject: [PATCH 11/27] Update README.md --- examples/link_prediction/ogbl-ppa/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/link_prediction/ogbl-ppa/README.md b/examples/link_prediction/ogbl-ppa/README.md index 2003bc75..10edfe1a 100644 --- a/examples/link_prediction/ogbl-ppa/README.md +++ b/examples/link_prediction/ogbl-ppa/README.md @@ -48,4 +48,5 @@ Using the default parameters, the performance of the two models on the ogbl-ppa ## References [1] Kipf T N, Welling M. Semi-Supervised Classification with Graph Convolutional Networks[J]. 2016. + [2] Hamilton W L, Ying R, Leskovec J. Inductive Representation Learning on Large Graphs[J]. 2017. From 3bec551a5f116151fc7df846f08a760678172433 Mon Sep 17 00:00:00 2001 From: YueZhong <37361694+YueZhong-bio@users.noreply.github.com> Date: Tue, 16 Jun 2020 19:53:26 +0800 Subject: [PATCH 12/27] Delete full_graph_link_predictor.py --- .../model_zoo/full_graph_link_predictor.py | 295 ------------------ 1 file changed, 295 deletions(-) delete mode 100644 python/dgllife/model/model_zoo/full_graph_link_predictor.py diff --git a/python/dgllife/model/model_zoo/full_graph_link_predictor.py b/python/dgllife/model/model_zoo/full_graph_link_predictor.py deleted file mode 100644 index f87ac47a..00000000 --- a/python/dgllife/model/model_zoo/full_graph_link_predictor.py +++ /dev/null @@ -1,295 +0,0 @@ -import argparse -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.utils.data import DataLoader - -from ogb.linkproppred import DglLinkPropPredDataset, Evaluator -from logger import Logger - -from dgl.nn.pytorch import GraphConv -from dgl.nn.pytorch.conv import SAGEConv - -class GCN(nn.Module): - def __init__(self, - in_feats, - n_hidden, - out_feats, - num_layers, - dropout): - super(GCN, self).__init__() - - self.layers = nn.ModuleList() - # input layer - self.layers.append(GraphConv(in_feats, n_hidden, activation=F.relu)) - # hidden layers - for i in range(num_layers - 2): - self.layers.append(GraphConv(n_hidden, n_hidden, activation=F.relu)) - # output layer - self.layers.append(GraphConv(n_hidden, out_feats, activation=None)) - self.dropout = nn.Dropout(p=dropout) - - def reset_parameters(self): - for layer in self.layers: - layer.reset_parameters() - - def forward(self, g, x): - for i, layer in enumerate(self.layers): - if i != 0: - x = self.dropout(x) - x = layer(g, x) - return x - - -class SAGE(nn.Module): - def __init__(self, - in_feats, - n_hidden, - out_feats, - num_layers, - dropout, - activation=F.relu): - super(SAGE, self).__init__() - self.layers = nn.ModuleList() - - # input layer - self.layers.append(SAGEConv(in_feats, n_hidden, "mean", feat_drop=0., activation=activation)) - # hidden layers - for i in range(num_layers - 2): - self.layers.append(SAGEConv(n_hidden, 
n_hidden, "mean", feat_drop=dropout, activation=activation)) - # output layer - self.layers.append(SAGEConv(n_hidden, out_feats, "mean", feat_drop=dropout, activation=None)) # activation None - - def reset_parameters(self): - for layer in self.layers: - layer.reset_parameters() - - def forward(self, g, x): - for layer in self.layers: - x = layer(g, x) - return x - - -class LinkPredictor(nn.Module): - def __init__(self, in_channels, hidden_channels, out_channels, num_layers, - dropout): - super(LinkPredictor, self).__init__() - - self.lins = nn.ModuleList() - self.lins.append(nn.Linear(in_channels, hidden_channels)) - for _ in range(num_layers - 2): - self.lins.append(nn.Linear(hidden_channels, hidden_channels)) - self.lins.append(nn.Linear(hidden_channels, out_channels)) - - self.dropout = nn.Dropout(dropout) - - def reset_parameters(self): - for layer in self.lins: - layer.reset_parameters() - - def forward(self, x_i, x_j): - x = x_i * x_j - for lin in self.lins[:-1]: - x = lin(x) - x = F.relu(x) - x = self.dropout(x) - x = self.lins[-1](x) - return torch.sigmoid(x) - - -def train(model, predictor, g, x, splitted_edge, optimizer, batch_size): - model.train() - predictor.train() - - pos_train_edge = splitted_edge['train']['edge'].to(x.device) - - total_loss = total_samples = 0 - for perm in DataLoader( - range(pos_train_edge.size(0)), batch_size, shuffle=True): - - h = model(g, x) - - edge = pos_train_edge[perm].t() - pos_out = predictor(h[edge[0]], h[edge[1]]) - pos_loss = -torch.log(pos_out + 1e-15).mean() - - # Just do some trivial random sampling. - edge = torch.randint( - 0, x.size(0), edge.size(), dtype=torch.long, device=x.device) - - neg_out = predictor(h[edge[0]], h[edge[1]]) - neg_loss = -torch.log(1 - neg_out + 1e-15).mean() - - loss = pos_loss + neg_loss - optimizer.zero_grad() - loss.backward() - optimizer.step() - - num_samples = pos_out.size(0) - total_loss += loss.item() * num_samples - total_samples += num_samples - - return total_loss / total_samples - - -@torch.no_grad() -def test(model, predictor, g, x, splitted_edge, evaluator, batch_size): - model.eval() - - h = model(g, x) - - pos_train_edge = splitted_edge['train']['edge'].to(x.device) - pos_valid_edge = splitted_edge['valid']['edge'].to(x.device) - neg_valid_edge = splitted_edge['valid']['edge_neg'].to(x.device) - pos_test_edge = splitted_edge['test']['edge'].to(x.device) - neg_test_edge = splitted_edge['test']['edge_neg'].to(x.device) - - # Positive training edges - pos_train_preds = [] - for perm in DataLoader(range(pos_train_edge.size(0)), batch_size=batch_size): - edge = pos_train_edge[perm].t() - pos_train_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()] - pos_train_preds = torch.cat(pos_train_preds, dim=0) - - # Positive validation edges - pos_valid_preds = [] - for perm in DataLoader(range(pos_valid_edge.size(0)), batch_size=batch_size): - edge = pos_valid_edge[perm].t() - pos_valid_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()] - pos_valid_preds = torch.cat(pos_valid_preds, dim=0) - - # Negative validation edges - neg_valid_preds = [] - for perm in DataLoader(range(neg_valid_edge.size(0)), batch_size=batch_size): - edge = neg_valid_edge[perm].t() - neg_valid_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()] - neg_valid_preds = torch.cat(neg_valid_preds, dim=0) - - # Positive test edges - pos_test_preds = [] - for perm in DataLoader(range(pos_test_edge.size(0)), batch_size=batch_size): - edge = pos_test_edge[perm].t() - pos_test_preds += [predictor(h[edge[0]], 
h[edge[1]]).squeeze().cpu()] - pos_test_preds = torch.cat(pos_test_preds, dim=0) - - # Negative test edges - neg_test_preds = [] - for perm in DataLoader(range(neg_test_edge.size(0)), batch_size=batch_size): - edge = neg_test_edge[perm].t() - neg_test_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()] - neg_test_preds = torch.cat(neg_test_preds, dim=0) - - results = {} - for K in [10, 50, 100]: - evaluator.K = K - train_hits = evaluator.eval({ - 'y_pred_pos': pos_train_preds, - 'y_pred_neg': neg_valid_preds - })[f'hits@{K}'] - valid_hits = evaluator.eval({ - 'y_pred_pos': pos_valid_preds, - 'y_pred_neg': neg_valid_preds - })[f'hits@{K}'] - test_hits = evaluator.eval({ - 'y_pred_pos': pos_test_preds, - 'y_pred_neg': neg_test_preds - })[f'hits@{K}'] - - results[f'Hits@{K}'] = (train_hits, valid_hits, test_hits) - - return results - - -def main(): - parser = argparse.ArgumentParser(description='OGBL-PPA (Full-Batch)') - parser.add_argument('--device', type=int, default=0) - parser.add_argument('--log_steps', type=int, default=1) - parser.add_argument('--use_node_embedding', action='store_true') - parser.add_argument('--use_sage', action='store_true') - parser.add_argument('--num_layers', type=int, default=3) - parser.add_argument('--hidden_channels', type=int, default=256) - parser.add_argument('--dropout', type=float, default=0.0) - parser.add_argument('--batch_size', type=int, default=64 * 1024) - parser.add_argument('--lr', type=float, default=0.01) - parser.add_argument('--epochs', type=int, default=20) - parser.add_argument('--eval_steps', type=int, default=1) - parser.add_argument('--runs', type=int, default=10) - args = parser.parse_args() - print(args) - - device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu' - device = torch.device(device) - - dataset = DglLinkPropPredDataset(name='ogbl-ppa') - # Get DGLGraph - data = dataset[0] - data.readonly(False) - data.add_edges(data.nodes(), data.nodes()) - splitted_edge = dataset.get_edge_split() - - if args.use_node_embedding: - # Todo: prepare node embeddings using node2vec - x = data.ndata['feat'].float() - x = torch.cat([x, torch.load('embedding.pt')], dim=-1) - x = x.to(device) - else: - x = data.ndata['feat'].float().to(device) - - if args.use_sage: - model = SAGE( - x.size(-1), args.hidden_channels, args.hidden_channels, - args.num_layers, args.dropout).to(device) - else: - model = GCN( - x.size(-1), args.hidden_channels, args.hidden_channels, - args.num_layers, args.dropout).to(device) - - predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1, - args.num_layers, args.dropout).to(device) - - evaluator = Evaluator(name='ogbl-ppa') - loggers = { - 'Hits@10': Logger(args.runs, args), - 'Hits@50': Logger(args.runs, args), - 'Hits@100': Logger(args.runs, args), - } - - for run in range(args.runs): - model.reset_parameters() - predictor.reset_parameters() - optimizer = torch.optim.Adam( - list(model.parameters()) + list(predictor.parameters()), - lr=args.lr) - - for epoch in range(1, 1 + args.epochs): - loss = train(model, predictor, data, x, splitted_edge, optimizer, - args.batch_size) - - if epoch % args.eval_steps == 0: - results = test(model, predictor, data, x, splitted_edge, - evaluator, args.batch_size) - for key, result in results.items(): - loggers[key].add_result(run, result) - - if epoch % args.log_steps == 0: - for key, result in results.items(): - train_hits, valid_hits, test_hits = result - print(key) - print(f'Run: {run + 1:02d}, ' - f'Epoch: {epoch:02d}, ' - f'Loss: {loss:.4f}, ' - 
f'Train: {100 * train_hits:.2f}%, ' - f'Valid: {100 * valid_hits:.2f}%, ' - f'Test: {100 * test_hits:.2f}%') - - for key in loggers.keys(): - print(key) - loggers[key].print_statistics(run) - - for key in loggers.keys(): - print(key) - loggers[key].print_statistics() - - -if __name__ == "__main__": - main() From d005beb64bbd1eaea0769c2e8891149c277adcd5 Mon Sep 17 00:00:00 2001 From: mufeili Date: Tue, 23 Jun 2020 02:33:11 +0800 Subject: [PATCH 13/27] Update --- docs/source/api/model.gnn.rst | 5 + docs/source/api/model.zoo.rst | 5 + examples/link_prediction/README.md | 11 +- examples/link_prediction/ogbl-ppa/README.md | 57 +++--- .../ogbl-ppa/full_graph_link_predictor.py | 189 ++++++------------ examples/link_prediction/ogbl-ppa/logger.py | 1 - python/dgllife/model/gnn/__init__.py | 1 + python/dgllife/model/gnn/gcn.py | 6 +- python/dgllife/model/gnn/graphsage.py | 97 +++++++++ python/dgllife/model/model_zoo/__init__.py | 3 + .../model_zoo/hadamard_link_predictor.py | 92 +++++++++ tests/model/test_gnn.py | 27 +++ tests/model/test_link_prediction.py | 68 +------ 13 files changed, 340 insertions(+), 222 deletions(-) create mode 100644 python/dgllife/model/gnn/graphsage.py create mode 100644 python/dgllife/model/model_zoo/hadamard_link_predictor.py diff --git a/docs/source/api/model.gnn.rst b/docs/source/api/model.gnn.rst index e85e3e52..00213bee 100644 --- a/docs/source/api/model.gnn.rst +++ b/docs/source/api/model.gnn.rst @@ -49,6 +49,11 @@ GIN .. automodule:: dgllife.model.gnn.gin :members: +GraphSAGE +--------- +.. automodule:: dgllife.model.gnn.graphsage + :members: + WLN --- .. automodule:: dgllife.model.gnn.wln diff --git a/docs/source/api/model.zoo.rst b/docs/source/api/model.zoo.rst index da0a171f..1189d8b9 100644 --- a/docs/source/api/model.zoo.rst +++ b/docs/source/api/model.zoo.rst @@ -16,6 +16,11 @@ MLP Predictor .. automodule:: dgllife.model.model_zoo.mlp_predictor :members: +Hadamard Link Predictor +``````````````````````` +.. automodule:: dgllife.model.model_zoo.hadamard_link_predictor + :members: + Molecular Property Prediction ----------------------------- diff --git a/examples/link_prediction/README.md b/examples/link_prediction/README.md index a51271b9..395cd818 100644 --- a/examples/link_prediction/README.md +++ b/examples/link_prediction/README.md @@ -9,11 +9,12 @@ GNN-based link prediction typically consists of the following steps: 5. Pass the link representations to a MLP for training and perform final link prediction ## Datasets -- **ogbl-ppa**: is an undirected, unweighted graph. Nodes represent proteins from 58 different species, and edges indicate biologically meaningful associations between proteins, e.g., physical interactions, co-expression, homology or genomic neighborhood. Each node contains a 58-dimensional one-hot feature vector that indicates the species that the corresponding protein comes from.[1] +- **ogbl-ppa**: is an undirected, unweighted graph. Nodes represent proteins from 58 different species, +and edges indicate biologically meaningful associations between proteins, e.g., physical interactions, +co-expression, homology or genomic neighborhood. Each node contains a 58-dimensional one-hot feature +vector that indicates the species that the corresponding protein comes from.[1] ## References -[1] Hu W, Fey M, Zitnik M, et al. Open graph benchmark: Datasets for machine learning on graphs[J]. arXiv preprint arXiv:2005.00687, 2020. - - - +[1] Hu W, Fey M, Zitnik M, et al. Open graph benchmark: Datasets for machine learning on graphs[J]. 
+arXiv preprint arXiv:2005.00687, 2020. diff --git a/examples/link_prediction/ogbl-ppa/README.md b/examples/link_prediction/ogbl-ppa/README.md index 10edfe1a..9c267810 100644 --- a/examples/link_prediction/ogbl-ppa/README.md +++ b/examples/link_prediction/ogbl-ppa/README.md @@ -1,49 +1,58 @@ # Link Prediction for ogbl-ppa +For a detailed description of the dataset, see [the OGB website](https://ogb.stanford.edu/docs/linkprop/). ## Models - **Graph Convolutional Networks (GCN)** [1] -- **Graph SAmple and aggreGatE (GraphSAGE)** [2] +- **GraphSAGE** [2] ## Dependencies -- **OGB v1.1.1** +- **OGB v1.1.1**, which can be installed with ```pip install ogb``` - **DGL v0.4.3** ## Usage -Use `full_graph_link_predictor.py` with arguments +To run with default options, simply do + +```bash +python full_graph_link_predictor.py +``` + +By default, we use CPU for computation as the graph is too large for a GPU with normal size. + +The optional arguments are as follows: + ``` ---device, Device to use (default=0) ---log_steps, (default=1) ---use_node_embedding, Whether to use node embedding (action='store_true') ---use_sage, Whether to use GraphSAGE model (action='store_true') ---num_layers, (default=3) ---hidden_channels, (default=256) +--use_gpu, use gpu for computation +--use_sage, use GraphSAGE rather than GCN +--num_layers, number of GNN layers to use as well as linear layers for final link prediction (default=3) +--hidden_feats, size for hidden representations (default=256) --dropout, (default=0.0) ---batch_size, (default=64 * 1024) ---lr, Learning rate (default=0.01) ---epochs, (default=20) ---eval_steps, (default=1) ---runs, (default=1) +--batch_size, batch size to use for link prediction (default=64 * 1024) +--lr, learning rate (default=0.01) +--epochs, number of epochs for training (default=20) +--eval_steps, evaluate hits@100 every {eval_steps} epochs (default=1) +--runs, number of random experiments to perform (default=1) ``` ## Performance -Using the default parameters, the performance of the two models on the ogbl-ppa dataset(Hits=100): - -| Method | %Training@Hits | %Validation@Hits | %Test@Hits | -| ------- | ---------------- | -------- | ------- | -| GCN | 12.87±5.07 | 12.39±4.85| 11.65±4.56 | -| GraphSAGE| 9.58±0.99| 9.44±0.96| 9.86±1.21| +For model evaluation, we consider hits@100 -- ranking each true link against 3,000,000 randomly-sampled +negative edges, and counting the ratio of positive edges that are ranked at 100-th place or above. +Using the default parameters, the performance of 10 random runs is as follows. 
+| Method | Train hits@100 | Validation hits@100 | Test hits@100 | +| --------- | -------------- | ------------------- | ------------- | +| GCN | 12.87 ± 5.07 | 12.39 ± 4.85 | 11.65 ± 4.56 | +| GraphSAGE | 9.58 ± 0.99 | 9.44 ± 0.96 | 9.86 ± 1.21 | -| Method | Average Time/epoch | -| ------- | -------------------------- | -| GCN | 1:23:12.86 | -| GraphSAGE| 1:28:49:46| +| Method | Average Time (hour) / epoch | +| --------- | --------------------------- | +| GCN | 1.38 | +| GraphSAGE | 1.47 | ## References diff --git a/examples/link_prediction/ogbl-ppa/full_graph_link_predictor.py b/examples/link_prediction/ogbl-ppa/full_graph_link_predictor.py index 07b53ea2..dc1d75d9 100644 --- a/examples/link_prediction/ogbl-ppa/full_graph_link_predictor.py +++ b/examples/link_prediction/ogbl-ppa/full_graph_link_predictor.py @@ -5,102 +5,13 @@ import argparse import torch -import torch.nn as nn import torch.nn.functional as F -from torch.utils.data import DataLoader +from dgllife.model import GCN, GraphSAGE, HadamardLinkPredictor from ogb.linkproppred import DglLinkPropPredDataset, Evaluator -from logger import Logger - -from dgl.nn.pytorch import GraphConv -from dgl.nn.pytorch.conv import SAGEConv - -class GCN(nn.Module): - def __init__(self, - in_feats, - n_hidden, - out_feats, - num_layers, - dropout): - super(GCN, self).__init__() - - self.layers = nn.ModuleList() - # input layer - self.layers.append(GraphConv(in_feats, n_hidden, activation=F.relu)) - # hidden layers - for i in range(num_layers - 2): - self.layers.append(GraphConv(n_hidden, n_hidden, activation=F.relu)) - # output layer - self.layers.append(GraphConv(n_hidden, out_feats, activation=None)) - self.dropout = nn.Dropout(p=dropout) - - def reset_parameters(self): - for layer in self.layers: - layer.reset_parameters() - - def forward(self, g, x): - for i, layer in enumerate(self.layers): - if i != 0: - x = self.dropout(x) - x = layer(g, x) - return x - - -class SAGE(nn.Module): - def __init__(self, - in_feats, - n_hidden, - out_feats, - num_layers, - dropout, - activation=F.relu): - super(SAGE, self).__init__() - self.layers = nn.ModuleList() - - # input layer - self.layers.append(SAGEConv(in_feats, n_hidden, "mean", feat_drop=0., activation=activation)) - # hidden layers - for i in range(num_layers - 2): - self.layers.append(SAGEConv(n_hidden, n_hidden, "mean", feat_drop=dropout, activation=activation)) - # output layer - self.layers.append(SAGEConv(n_hidden, out_feats, "mean", feat_drop=dropout, activation=None)) # activation None - - def reset_parameters(self): - for layer in self.layers: - layer.reset_parameters() - - def forward(self, g, x): - for layer in self.layers: - x = layer(g, x) - return x - - -class LinkPredictor(nn.Module): - def __init__(self, in_channels, hidden_channels, out_channels, num_layers, - dropout): - super(LinkPredictor, self).__init__() - - self.lins = nn.ModuleList() - self.lins.append(nn.Linear(in_channels, hidden_channels)) - for _ in range(num_layers - 2): - self.lins.append(nn.Linear(hidden_channels, hidden_channels)) - self.lins.append(nn.Linear(hidden_channels, out_channels)) - - self.dropout = nn.Dropout(dropout) - - def reset_parameters(self): - for layer in self.lins: - layer.reset_parameters() - - def forward(self, x_i, x_j): - x = x_i * x_j - for lin in self.lins[:-1]: - x = lin(x) - x = F.relu(x) - x = self.dropout(x) - x = self.lins[-1](x) - return torch.sigmoid(x) +from torch.utils.data import DataLoader +from logger import Logger def train(model, predictor, g, x, splitted_edge, optimizer, 
batch_size): model.train() @@ -115,14 +26,14 @@ def train(model, predictor, g, x, splitted_edge, optimizer, batch_size): h = model(g, x) edge = pos_train_edge[perm].t() - pos_out = predictor(h[edge[0]], h[edge[1]]) + pos_out = torch.sigmoid(predictor(h[edge[0]], h[edge[1]])) pos_loss = -torch.log(pos_out + 1e-15).mean() # Just do some trivial random sampling. edge = torch.randint( 0, x.size(0), edge.size(), dtype=torch.long, device=x.device) - neg_out = predictor(h[edge[0]], h[edge[1]]) + neg_out = torch.sigmoid(predictor(h[edge[0]], h[edge[1]])) neg_loss = -torch.log(1 - neg_out + 1e-15).mean() loss = pos_loss + neg_loss @@ -153,35 +64,40 @@ def test(model, predictor, g, x, splitted_edge, evaluator, batch_size): pos_train_preds = [] for perm in DataLoader(range(pos_train_edge.size(0)), batch_size=batch_size): edge = pos_train_edge[perm].t() - pos_train_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()] + pos_train_preds += [torch.sigmoid( + predictor(h[edge[0]], h[edge[1]])).squeeze().cpu()] pos_train_preds = torch.cat(pos_train_preds, dim=0) # Positive validation edges pos_valid_preds = [] for perm in DataLoader(range(pos_valid_edge.size(0)), batch_size=batch_size): edge = pos_valid_edge[perm].t() - pos_valid_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()] + pos_valid_preds += [torch.sigmoid( + predictor(h[edge[0]], h[edge[1]])).squeeze().cpu()] pos_valid_preds = torch.cat(pos_valid_preds, dim=0) # Negative validation edges neg_valid_preds = [] for perm in DataLoader(range(neg_valid_edge.size(0)), batch_size=batch_size): edge = neg_valid_edge[perm].t() - neg_valid_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()] + neg_valid_preds += [torch.sigmoid( + predictor(h[edge[0]], h[edge[1]])).squeeze().cpu()] neg_valid_preds = torch.cat(neg_valid_preds, dim=0) # Positive test edges pos_test_preds = [] for perm in DataLoader(range(pos_test_edge.size(0)), batch_size=batch_size): edge = pos_test_edge[perm].t() - pos_test_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()] + pos_test_preds += [torch.sigmoid( + predictor(h[edge[0]], h[edge[1]])).squeeze().cpu()] pos_test_preds = torch.cat(pos_test_preds, dim=0) # Negative test edges neg_test_preds = [] for perm in DataLoader(range(neg_test_edge.size(0)), batch_size=batch_size): edge = neg_test_edge[perm].t() - neg_test_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()] + neg_test_preds += [torch.sigmoid( + predictor(h[edge[0]], h[edge[1]])).squeeze().cpu()] neg_test_preds = torch.cat(neg_test_preds, dim=0) results = {} @@ -207,23 +123,36 @@ def test(model, predictor, g, x, splitted_edge, evaluator, batch_size): def main(): parser = argparse.ArgumentParser(description='OGBL-PPA (Full-Batch)') - parser.add_argument('--device', type=int, default=0) - parser.add_argument('--log_steps', type=int, default=1) - parser.add_argument('--use_node_embedding', action='store_true') - parser.add_argument('--use_sage', action='store_true') - parser.add_argument('--num_layers', type=int, default=3) - parser.add_argument('--hidden_channels', type=int, default=256) - parser.add_argument('--dropout', type=float, default=0.0) - parser.add_argument('--batch_size', type=int, default=64 * 1024) - parser.add_argument('--lr', type=float, default=0.01) - parser.add_argument('--epochs', type=int, default=20) - parser.add_argument('--eval_steps', type=int, default=1) - parser.add_argument('--runs', type=int, default=10) + parser.add_argument('--use_gpu', action='store_true', + help='Use gpu for computation (default: False)') 
+ parser.add_argument('--log_steps', type=int, default=1, + help='Print training progress every {log_steps} epochs (default: 1)') + parser.add_argument('--use_sage', action='store_true', + help='Use GraphSAGE rather than GCN (default: False)') + parser.add_argument('--num_layers', type=int, default=3, + help='Number of GNN layers to use as well as ' + 'linear layers to use for final link prediction (default: 3)') + parser.add_argument('--hidden_feats', type=int, default=256, + help='Size for hidden representations (default: 256)') + parser.add_argument('--dropout', type=float, default=0.0, + help='Dropout (default: 0.0)') + parser.add_argument('--batch_size', type=int, default=64 * 1024, + help='Batch size to use for link prediction (default: 64 * 1024)') + parser.add_argument('--lr', type=float, default=0.01, + help='Learning rate (default: 0.01)') + parser.add_argument('--epochs', type=int, default=20, + help='Number of epochs for training (default: 20)') + parser.add_argument('--eval_steps', type=int, default=1, + help='Evaluate hits@100 every {eval_steps} epochs (default: 1)') + parser.add_argument('--runs', type=int, default=10, + help='Number of random experiments to perform (default: 10)') args = parser.parse_args() print(args) - device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu' - device = torch.device(device) + if args.use_gpu and torch.cuda.is_available(): + device = torch.device('cuda:0') + else: + device = torch.device('cpu') dataset = DglLinkPropPredDataset(name='ogbl-ppa') # Get DGLGraph @@ -231,26 +160,26 @@ def main(): data.readonly(False) data.add_edges(data.nodes(), data.nodes()) splitted_edge = dataset.get_edge_split() - - if args.use_node_embedding: - # Todo: prepare node embeddings using node2vec - x = data.ndata['feat'].float() - x = torch.cat([x, torch.load('embedding.pt')], dim=-1) - x = x.to(device) - else: - x = data.ndata['feat'].float().to(device) + x = data.ndata['feat'].float().to(device) if args.use_sage: - model = SAGE( - x.size(-1), args.hidden_channels, args.hidden_channels, - args.num_layers, args.dropout).to(device) + model = GraphSAGE(in_feats=x.size(-1), + hidden_feats=[args.hidden_feats for _ in range(args.num_layers)], + activation=[F.relu for _ in range(args.num_layers - 1)] + [None], + dropout=[0] + [args.dropout for _ in range(args.num_layers - 1)]).to(device) else: - model = GCN( - x.size(-1), args.hidden_channels, args.hidden_channels, - args.num_layers, args.dropout).to(device) - - predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1, - args.num_layers, args.dropout).to(device) + model = GCN(in_feats=x.size(-1), + hidden_feats=[args.hidden_feats for _ in range(args.num_layers)], + activation=[F.relu for _ in range(args.num_layers - 1)] + [None], + residual=[False for _ in range(args.num_layers)], + batchnorm=[False for _ in range(args.num_layers)], + dropout=[args.dropout for _ in range(args.num_layers - 1)] + [0]).to(device) + + predictor = HadamardLinkPredictor(in_feats=args.hidden_feats, + hidden_feats=args.hidden_feats, + num_layers=args.num_layers, + n_tasks=1, + dropout=args.dropout).to(device) evaluator = Evaluator(name='ogbl-ppa') loggers = { diff --git a/examples/link_prediction/ogbl-ppa/logger.py b/examples/link_prediction/ogbl-ppa/logger.py index b6e617ba..17cad7f9 100644 --- a/examples/link_prediction/ogbl-ppa/logger.py +++ b/examples/link_prediction/ogbl-ppa/logger.py @@ -1,6 +1,5 @@ import torch - class Logger(object): def __init__(self, runs, info=None): self.info = info diff --git 
a/python/dgllife/model/gnn/__init__.py b/python/dgllife/model/gnn/__init__.py
index 8fa22e7a..8e15d0f0 100644
--- a/python/dgllife/model/gnn/__init__.py
+++ b/python/dgllife/model/gnn/__init__.py
@@ -9,6 +9,7 @@
 from .gat import *
 from .gcn import *
 from .gin import *
+from .graphsage import *
 from .mgcn import *
 from .mpnn import *
 from .schnet import *

diff --git a/python/dgllife/model/gnn/gcn.py b/python/dgllife/model/gnn/gcn.py
index aa99ffd8..935d00eb 100644
--- a/python/dgllife/model/gnn/gcn.py
+++ b/python/dgllife/model/gnn/gcn.py
@@ -98,9 +98,9 @@ class GCN(nn.Module):
         ``len(hidden_feats)`` equals the number of GCN layers. By default, we use
         ``[64, 64]``.
     activation : list of activation functions or None
-        If None, no activation will be applied. If not None, ``activation[i]`` gives the
-        activation function to be used for the i-th GCN layer. ``len(activation)`` equals
-        the number of GCN layers. By default, ReLU is applied for all GCN layers.
+        If not None, ``activation[i]`` gives the activation function to be used for
+        the i-th GCN layer. ``len(activation)`` equals the number of GCN layers.
+        By default, ReLU is applied for all GCN layers.
     residual : list of bool
         ``residual[i]`` decides if residual connection is to be used for the i-th GCN layer.
         ``len(residual)`` equals the number of GCN layers. By default, residual connection

diff --git a/python/dgllife/model/gnn/graphsage.py b/python/dgllife/model/gnn/graphsage.py
new file mode 100644
index 00000000..c02a426d
--- /dev/null
+++ b/python/dgllife/model/gnn/graphsage.py
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# GraphSAGE
+# pylint: disable= no-member, arguments-differ, invalid-name
+
+import torch.nn as nn
+import torch.nn.functional as F
+
+from dgl.nn.pytorch import SAGEConv
+
+__all__ = ['GraphSAGE']
+
+# pylint: disable=W0221, C0103
+class GraphSAGE(nn.Module):
+    r"""GraphSAGE from `Inductive Representation Learning on Large Graphs
+    <https://arxiv.org/abs/1706.02216>`__
+
+    Parameters
+    ----------
+    in_feats : int
+        Number of input node features.
+    hidden_feats : list of int
+        ``hidden_feats[i]`` gives the size of node representations after the i-th GraphSAGE layer.
+        ``len(hidden_feats)`` equals the number of GraphSAGE layers. By default, we use
+        ``[64, 64]``.
+    activation : list of activation functions or None
+        If not None, ``activation[i]`` gives the activation function to be used for
+        the i-th GraphSAGE layer. ``len(activation)`` equals the number of GraphSAGE layers.
+        By default, ReLU is applied for all GraphSAGE layers.
+    dropout : list of float or None
+        ``dropout[i]`` decides the dropout probability on the input features of the i-th
+        GraphSAGE layer. ``len(dropout)`` equals the number of GraphSAGE layers. By default,
+        no dropout is performed for all layers.
+    aggregator_type : list of str
+        ``aggregator_type[i]`` decides the aggregator type for the i-th GraphSAGE layer, which
+        can be one of ``'mean'``, ``'gcn'``, ``'pool'``, ``'lstm'``. By default, we use
+        ``'mean'`` for all layers.
+    """
+    def __init__(self,
+                 in_feats,
+                 hidden_feats=None,
+                 activation=None,
+                 dropout=None,
+                 aggregator_type=None):
+        super(GraphSAGE, self).__init__()
+
+        if hidden_feats is None:
+            hidden_feats = [64, 64]
+
+        n_layers = len(hidden_feats)
+        if activation is None:
+            activation = [F.relu for _ in range(n_layers)]
+        if dropout is None:
+            dropout = [0.
for _ in range(n_layers)]
+        if aggregator_type is None:
+            aggregator_type = ['mean' for _ in range(n_layers)]
+        lengths = [len(hidden_feats), len(activation), len(dropout), len(aggregator_type)]
+        assert len(set(lengths)) == 1, 'Expect the lengths of hidden_feats, activation, ' \
+                                       'dropout and aggregator_type to be the same, ' \
+                                       'got {}'.format(lengths)
+
+        self.hidden_feats = hidden_feats
+        self.gnn_layers = nn.ModuleList()
+        for i in range(n_layers):
+            self.gnn_layers.append(SAGEConv(in_feats, hidden_feats[i], aggregator_type[i],
+                                            dropout[i], activation=activation[i]))
+            in_feats = hidden_feats[i]
+
+    def reset_parameters(self):
+        """Reinitialize model parameters."""
+        for gnn in self.gnn_layers:
+            gnn.reset_parameters()
+
+    def forward(self, g, feats):
+        """Update node representations.
+
+        Parameters
+        ----------
+        g : DGLGraph
+            DGLGraph for a batch of graphs
+        feats : FloatTensor of shape (N, M1)
+            * N is the total number of nodes in the batch of graphs
+            * M1 is the input node feature size, which equals in_feats in initialization
+
+        Returns
+        -------
+        feats : FloatTensor of shape (N, M2)
+            * N is the total number of nodes in the batch of graphs
+            * M2 is the output node representation size, which equals
+              hidden_feats[-1] in initialization.
+        """
+        for gnn in self.gnn_layers:
+            feats = gnn(g, feats)
+        return feats

diff --git a/python/dgllife/model/model_zoo/__init__.py b/python/dgllife/model/model_zoo/__init__.py
index acff3ca3..eb87d021 100644
--- a/python/dgllife/model/model_zoo/__init__.py
+++ b/python/dgllife/model/model_zoo/__init__.py
@@ -26,3 +26,6 @@
 
 # Protein-Ligand Binding
 from .acnn import *
+
+# Link prediction
+from .hadamard_link_predictor import *

diff --git a/python/dgllife/model/model_zoo/hadamard_link_predictor.py b/python/dgllife/model/model_zoo/hadamard_link_predictor.py
new file mode 100644
index 00000000..24a701d2
--- /dev/null
+++ b/python/dgllife/model/model_zoo/hadamard_link_predictor.py
@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Predictor for link prediction by taking elementwise multiplication of node representations
+
+import torch.nn as nn
+import torch.nn.functional as F
+
+__all__ = ['HadamardLinkPredictor']
+
+class HadamardLinkPredictor(nn.Module):
+    """Link prediction by taking the elementwise multiplication of two node representations
+
+    The elementwise multiplication is also called Hadamard product.
+
+    Parameters
+    ----------
+    in_feats : int
+        Number of input node features
+    hidden_feats : int
+        Number of hidden features. Default to 256.
+    num_layers : int
+        Number of linear layers used in total, which should be
+        at least 2, counting the input and output layers. Default to 3.
+    n_tasks : int
+        Number of output tasks. Default to 1.
+    dropout : float
+        Dropout before each linear layer except for the first one.
+        Default to 0., i.e. no dropout is performed.
+    activation : callable
+        Activation function to apply after the output of each linear layer
+        except for the last one. Default to ReLU.
+ """ + def __init__(self, + in_feats, + hidden_feats=256, + num_layers=3, + n_tasks=1, + dropout=0., + activation=F.relu): + super(HadamardLinkPredictor, self).__init__() + + assert num_layers >= 2, 'Expect num_layers to be at least 2, got {:d}'.format(num_layers) + + self.layers = nn.ModuleList() + # input layer + self.layers.append(nn.Linear(in_feats, hidden_feats)) + # hidden layers + for _ in range(num_layers - 2): + self.layers.append(nn.Linear(hidden_feats, hidden_feats)) + # output layer + self.layers.append(nn.Linear(hidden_feats, n_tasks)) + self.dropout = nn.Dropout(dropout) + self.activation = activation + + def reset_parameters(self): + # Reset the parameters of the Linear layers + for layer in self.layers: + layer.reset_parameters() + + def forward(self, left_node_feats, right_node_feats): + """Link Prediction + + Perform link prediction for P pairs of nodes. Note + that this model is symmetric and we don't have + separate parameters for the two arguments. + + Parameters + ---------- + left_node_feats : float32 tensor of shape (P, D1) + Representations for the first node in P pairs. + D1 for the number of input node features. + right_node_feats : float32 tensor of shape (P, D1) + Representations for the second node in P pairs. + D1 for the number of input node features. + + Returns + ------- + float32 tensor of shape (P, D2) + Pre-softmax/sigmoid logits, D2 equals n_tasks. + """ + pair_feats = left_node_feats * right_node_feats + for layer in self.layers[:-1]: + pair_feats = layer(pair_feats) + if self.activation is not None: + pair_feats = self.activation(pair_feats) + pair_feats = self.dropout(pair_feats) + out = self.layers[-1](pair_feats) + + return out diff --git a/tests/model/test_gnn.py b/tests/model/test_gnn.py index fa429336..ba51bbf0 100644 --- a/tests/model/test_gnn.py +++ b/tests/model/test_gnn.py @@ -322,6 +322,32 @@ def test_wln(): assert gnn(g, node_feats, edge_feats).shape == torch.Size([3, 3]) assert gnn(bg, batch_node_feats, batch_edge_feats).shape == torch.Size([8, 3]) +def test_graphsage(): + if torch.cuda.is_available(): + device = torch.device('cuda:0') + else: + device = torch.device('cpu') + + g, node_feats = test_graph1() + g, node_feats = g.to(device), node_feats.to(device) + bg, batch_node_feats = test_graph2() + bg, batch_node_feats = bg.to(device), batch_node_feats.to(device) + + # Test default setting + gnn = GraphSAGE(in_feats=1).to(device) + gnn.reset_parameters() + assert gnn(g, node_feats).shape == torch.Size([3, 64]) + assert gnn(bg, batch_node_feats).shape == torch.Size([8, 64]) + + # Test configured setting + gnn = GraphSAGE(in_feats=1, + hidden_feats=[1, 1], + activation=[F.relu, F.relu], + dropout=[0.2, 0.2], + aggregator_type=['gcn', 'gcn']).to(device) + assert gnn(g, node_feats).shape == torch.Size([3, 1]) + assert gnn(bg, batch_node_feats).shape == torch.Size([8, 1]) + if __name__ == '__main__': test_attentivefp() test_gat() @@ -332,3 +358,4 @@ def test_wln(): test_schnet() test_weave() test_wln() + test_graphsage() diff --git a/tests/model/test_link_prediction.py b/tests/model/test_link_prediction.py index 599c324f..f11d41bb 100644 --- a/tests/model/test_link_prediction.py +++ b/tests/model/test_link_prediction.py @@ -3,72 +3,22 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
# SPDX-License-Identifier: Apache-2.0 -import dgl import torch -import torch.nn.functional as F -from dgl import DGLGraph +from dgllife.model import HadamardLinkPredictor -from dgllife.model.model_zoo.gcn_link_predictor import * -from dgllife.model.model_zoo.sage_link_predictor import * - -def test_graph1(): - """Graph with node features.""" - g = DGLGraph([(0, 1), (0, 2), (1, 2)]) - return g, torch.arange(g.number_of_nodes()).float().reshape(-1, 1) - -def test_gcn_link_predictor(): - if torch.cuda.is_available(): - device = torch.device('cuda:0') - else: - device = torch.device('cpu') - - g, node_feats = test_graph1() - g, node_feats = g.to(device), node_feats.to(device) - - # Test configured setting - gcn_model = GCN( - in_feats=node_feats.size(-1), - n_hidden=2, - out_feats=2, - num_layers=2, - dropout=0.1).to(device) - gcn_model.train() - assert gcn_model(g, node_feats).shape == torch.Size([1, 2]) - - gcn_link_predictor = GCNLinkPredictor( - in_channels=2, - hidden_channels=2, - num_layers=2, - dropout=0.1).to(device) - assert gcn_link_predictor(g, node_feats).shape == torch.Size([1, 1]) - -def test_sage_link_predictor(): +def test_hadamard_link_predictor(): if torch.cuda.is_available(): device = torch.device('cuda:0') else: device = torch.device('cpu') - g, node_feats = test_graph1() - g, node_feats = g.to(device), node_feats.to(device) - - # Test configured setting - sage_model = SAGE( - in_feats=node_feats.size(-1), - n_hidden=2, - out_feats=2, - num_layers=2, - dropout=0.1).to(device) - sage_model.train() - assert sage_model(g, node_feats).shape == torch.Size([1, 2]) - - sage_link_predictor = SAGELinkPredictor( - in_channels=2, - hidden_channels=2, - num_layers=2, - dropout=0.1).to(device) - assert sage_link_predictor(g, node_feats).shape == torch.Size([1, 1]) + num_pairs = 4 + in_feats = 2 + model = HadamardLinkPredictor(in_feats=in_feats, hidden_feats=3, num_layers=3).to(device) + left_node_feats = torch.randn(num_pairs, in_feats).to(device) + right_node_feats = torch.randn(num_pairs, in_feats).to(device) + assert model(left_node_feats, right_node_feats).shape == torch.Size([num_pairs, 1]) if __name__ == '__main__': - test_gcn_link_predictor() - test_sage_link_predictor() + test_hadamard_link_predictor() From b86f0cfffb8d536b2931cc334de3a65c68c3573c Mon Sep 17 00:00:00 2001 From: mufeili Date: Tue, 23 Jun 2020 02:41:46 +0800 Subject: [PATCH 14/27] Update --- examples/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/README.md b/examples/README.md index c7635a47..5d3698f6 100644 --- a/examples/README.md +++ b/examples/README.md @@ -11,6 +11,8 @@ We provide various examples across 3 applications -- property prediction, genera - [Alchemy with DGL](../python/dgllife/data/alchemy.py) - PubChem Aromaticity [[paper]](https://pubs.acs.org/doi/10.1021/acs.jmedchem.9b00959) - [PubChem Aromaticity with DGL](../python/dgllife/data/pubchem_aromaticity.py) +- OGB [[paper]](https://arxiv.org/abs/2005.00687) + - [ogbl-ppa](link_prediction/ogbl-ppa) ## Property Prediction From c3c0438e36ddd66e45642fe40d2be92eaba9e021 Mon Sep 17 00:00:00 2001 From: mufeili Date: Tue, 23 Jun 2020 11:51:48 +0800 Subject: [PATCH 15/27] Fix --- python/dgllife/model/gnn/gcn.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/dgllife/model/gnn/gcn.py b/python/dgllife/model/gnn/gcn.py index 935d00eb..a434d1d6 100644 --- a/python/dgllife/model/gnn/gcn.py +++ b/python/dgllife/model/gnn/gcn.py @@ -55,8 +55,10 @@ def __init__(self, in_feats, out_feats, activation=None, def 
reset_parameters(self): """Reinitialize model parameters.""" self.graph_conv.reset_parameters() - self.res_connection.reset_parameters() - self.bn_layer.reset_parameters() + if self.residual: + self.res_connection.reset_parameters() + if self.bn: + self.bn_layer.reset_parameters() def forward(self, g, feats): """Update node representations. From a2bde0fb725a5b523c70e042993b6e84d4c645d5 Mon Sep 17 00:00:00 2001 From: Mufei Li Date: Tue, 23 Jun 2020 12:18:41 +0800 Subject: [PATCH 16/27] Update (#5) * Remove outdated code * Update --- examples/link_prediction/ogbl-ppa/logger.py | 5 + .../model/model_zoo/gcn_link_predictor.py | 151 ------------------ .../model/model_zoo/sage_link_predictor.py | 144 ----------------- 3 files changed, 5 insertions(+), 295 deletions(-) delete mode 100644 python/dgllife/model/model_zoo/gcn_link_predictor.py delete mode 100644 python/dgllife/model/model_zoo/sage_link_predictor.py diff --git a/examples/link_prediction/ogbl-ppa/logger.py b/examples/link_prediction/ogbl-ppa/logger.py index 17cad7f9..743ac3b8 100644 --- a/examples/link_prediction/ogbl-ppa/logger.py +++ b/examples/link_prediction/ogbl-ppa/logger.py @@ -1,3 +1,8 @@ +# -*- coding: utf-8 -*- +# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + import torch class Logger(object): diff --git a/python/dgllife/model/model_zoo/gcn_link_predictor.py b/python/dgllife/model/model_zoo/gcn_link_predictor.py deleted file mode 100644 index f78a4ad7..00000000 --- a/python/dgllife/model/model_zoo/gcn_link_predictor.py +++ /dev/null @@ -1,151 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Link prediction with GCN model for graphs. - - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from dgl.nn.pytorch import GraphConv - -class GCN(nn.Module): - r"""GCN from `Semi-Supervised Classification with Graph Convolutional Networks - `__ - - Parameters - ---------- - in_feats : int - Number of input node features. - n_hidden : int - Number of units in the hidden layers. - out_feats : int - Number of out node features. - num_layers : int - Number of GCN layers. - dropout : float - The probability for dropout. - By default, no dropout is performed for input layer. - """ - - def __init__(self, - in_feats, - n_hidden, - out_feats, - num_layers, - dropout): - super(GCN, self).__init__() - - self.layers = nn.ModuleList() - # input layer - self.layers.append(GraphConv(in_feats, n_hidden, activation=F.relu)) - # hidden layers - for i in range(num_layers - 2): - self.layers.append(GraphConv(n_hidden, n_hidden, activation=F.relu)) - # output layer - self.layers.append(GraphConv(n_hidden, out_feats, activation=None)) - - self.dropout = nn.Dropout(p=dropout) - - def reset_parameters(self): - # Reset the parameters of the GCN layers - for layer in self.layers: - layer.reset_parameters() - - def forward(self, g, feats): - """Update node representations. - - Parameters - ---------- - g : DGLGraph - DGLGraph for a batch of graphs - feats : FloatTensor of shape (N, M1) - * N is the total number of nodes in the batch of graphs - * M1 is the input node feature size, which equals in_feats in initialization - - Returns - ------- - feats : FloatTensor of shape (N, M2) - * N is the total number of nodes in the batch of graphs - * M2 is the output node representation size, which equals - out_feats in initialization. 
- """ - for i, layer in enumerate(self.layers): - if i != 0: - feats = self.dropout(feats) - feats = layer(g, feats) - return feats - -class GCNLinkPredictor(nn.Module): - """Link prediction with GCN model for graphs. - - GCN is introduced in `Semi-Supervised Classification with Graph Convolutional Networks - `__. This model is based on GCN and can be used - for link prediction on graphs. - - After updating node representations, we feed the product of the two node representations - of the predicted edge into the Linear layers for link prediction. - - Parameters - ---------- - in_channels : int - Number of channels in the input layer, which equals - the output node representation size of the GCN model. - hidden_channels : int - Number of units in the hidden layers. - num_layers : int - Number of Linear layers. - dropout : float - The probability for dropout. - By default, no dropout is performed for out layer. - """ - def __init__(self, - in_channels, - hidden_channels, - num_layers, - dropout): - super(GCNLinkPredictor, self).__init__() - - self.lins = nn.ModuleList() - # input layer - self.lins.append(nn.Linear(in_channels, hidden_channels)) - # hidden layers - for _ in range(num_layers - 2): - self.lins.append(nn.Linear(hidden_channels, hidden_channels)) - # out layer - self.lins.append(nn.Linear(hidden_channels, 1)) - - self.dropout = nn.Dropout(dropout) - - def reset_parameters(self): - # Reset the parameters of the Linear layers - for layer in self.lins: - layer.reset_parameters() - - def forward(self, x_i, x_j): - """Link prediction. - - Parameters - ---------- - x_i, x_j : FloatTensor of shape (B,M2) - * Representation of the two nodes of the predicted edge. - * B is the number of predicted edges in the batch. - * M2 is the node feature size. - - Returns - ------- - lp : FloatTensor of shape (B,1) - * The result of link prediction after sigmoid. - * B is the number of predicted edges in the batch. - """ - x = x_i * x_j - for lin in self.lins[:-1]: - x = lin(x) - x = F.relu(x) - x = self.dropout(x) - x = self.lins[-1](x) - lp = torch.sigmoid(x) - return lp diff --git a/python/dgllife/model/model_zoo/sage_link_predictor.py b/python/dgllife/model/model_zoo/sage_link_predictor.py deleted file mode 100644 index 22f0eedd..00000000 --- a/python/dgllife/model/model_zoo/sage_link_predictor.py +++ /dev/null @@ -1,144 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Link prediction with GraphSAGE model for graphs. - - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from dgl.nn.pytorch.conv import SAGEConv - -class SAGE(nn.Module): - r"""GraphSAGE from `Inductive Representation Learning on Large Graphs - `__ - - Parameters - ---------- - in_feats : int - Number of input node features. - n_hidden : int - Number of units in the hidden layers. - out_feats : int - Number of out node features. - num_layers : int - Number of SAGE layers. - dropout : float - The probability for dropout. - By default, no dropout is performed for input layer. 
- """ - - def __init__(self, - in_feats, - n_hidden, - out_feat, - num_layers, - dropout): - super(SAGE, self).__init__() - - self.layers = nn.ModuleList() - # input layer - self.layers.append(SAGEConv(in_feats, n_hidden, "mean", feat_drop=0., activation=F.relu)) - # hidden layers - for i in range(num_layers - 2): - self.layers.append(SAGEConv(n_hidden, n_hidden, "mean", feat_drop=dropout, activation=F.relu)) - # output layer - self.layers.append(SAGEConv(n_hidden, out_feat, "mean", feat_drop=dropout, activation=None)) # activation None - - def reset_parameters(self): - # Reset the parameters of the SAGE layers - for layer in self.layers: - layer.reset_parameters() - - def forward(self, g, feats): - """Update node representations. - - Parameters - ---------- - g : DGLGraph - DGLGraph for a batch of graphs - feats : FloatTensor of shape (N, M1) - * N is the total number of nodes in the batch of graphs - * M1 is the input node feature size, which equals in_feats in initialization - - Returns - ------- - feats : FloatTensor of shape (N, M2) - * N is the total number of nodes in the batch of graphs - * M2 is the output node representation size, which equals - out_feats in initialization. - """ - for layer in self.layers: - feats = layer(g, feats) - return feats - -class SAGELinkPredictor(nn.Module): - """Link prediction with GraphSAGE model for graphs. - - GraphSAGE is introduced in `Inductive Representation Learning on Large Graphs - `__. This model is based on GraphSAGE and can be used - for link prediction on graphs. - - After updating node representations, we feed the product of the two node representations - of the predicted edge into the Linear layers for link prediction. - - Parameters - ---------- - in_channels : int - Number of channels in the input layer, which equals - the output node representation size of the GraphSAGE model. - hidden_channels : int - Number of units in the hidden layers. - num_layers : int - Number of Linear layers. - dropout : float - The probability for dropout. - By default, no dropout is performed for out layer. - """ - def __init__(self, in_channels, hidden_channels, out_channels, num_layers, - dropout): - super(SAGELinkPredictor, self).__init__() - - self.lins = nn.ModuleList() - # input layer - self.lins.append(nn.Linear(in_channels, hidden_channels)) - # hidden layers - for _ in range(num_layers - 2): - self.lins.append(nn.Linear(hidden_channels, hidden_channels)) - # out layer - self.lins.append(nn.Linear(hidden_channels, 1)) - - self.dropout = nn.Dropout(dropout) - - def reset_parameters(self): - # Reset the parameters of the Linear layers - for layer in self.lins: - layer.reset_parameters() - - def forward(self, x_i, x_j): - """Link prediction. - - Parameters - ---------- - x_i, x_j : FloatTensor of shape (B,M2) - * Representation of the two nodes of the predicted edge. - * B is the number of predicted edges in the batch. - * M2 is the node feature size. - - Returns - ------- - lp : FloatTensor of shape (B,1) - * The result of link prediction after sigmoid. - * B is the number of predicted edges in the batch. 
- """ - x = x_i * x_j - for lin in self.lins[:-1]: - x = lin(x) - x = F.relu(x) - x = self.dropout(x) - x = self.lins[-1](x) - lp = torch.sigmoid(x) - return lp From 5435ffd6c77905f5f2877b460d29bdd9569163d7 Mon Sep 17 00:00:00 2001 From: Mufei Li Date: Tue, 23 Jun 2020 14:13:17 +0800 Subject: [PATCH 17/27] Fix (#6) * Remove outdated code * Update * Fix --- tests/model/test_binding_affinity.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/model/test_binding_affinity.py b/tests/model/test_binding_affinity.py index 6a8ed045..9ea7e7a8 100644 --- a/tests/model/test_binding_affinity.py +++ b/tests/model/test_binding_affinity.py @@ -50,11 +50,11 @@ def test_acnn(): model = ACNN() model.to(device) - g1.to(device) + g1 = g1.to(device) assert model(g1).shape == torch.Size([1, 1]) bg = dgl.batch_hetero([g1, g1]) - bg.to(device) + bg = bg.to(device) assert model(bg).shape == torch.Size([2, 1]) model = ACNN(hidden_sizes=[1, 2], @@ -63,11 +63,11 @@ def test_acnn(): features_to_use=torch.tensor([6., 8.]), radial=[[12.0], [0.0, 2.0], [4.0]]) model.to(device) - g1.to(device) + g1 = g1.to(device) assert model(g1).shape == torch.Size([1, 1]) bg = dgl.batch_hetero([g1, g1]) - bg.to(device) + bg = bg.to(device) assert model(bg).shape == torch.Size([2, 1]) if __name__ == '__main__': From 958d5b33d75799b05f916b7c9738cc92654259c4 Mon Sep 17 00:00:00 2001 From: Mufei Li Date: Wed, 24 Jun 2020 10:35:26 +0800 Subject: [PATCH 18/27] Try CI (#7) * Remove outdated code * Update * Fix * Hotfix (#30) * Reset parameters * Update * Update * Update * Update * Fix * Fix * Test --- .../csv_data_configuration/README.md | 3 ++- .../csv_data_configuration/classification.py | 12 ++++++++---- .../csv_data_configuration/regression.py | 12 ++++++++---- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/examples/property_prediction/csv_data_configuration/README.md b/examples/property_prediction/csv_data_configuration/README.md index 22e56450..1214660a 100644 --- a/examples/property_prediction/csv_data_configuration/README.md +++ b/examples/property_prediction/csv_data_configuration/README.md @@ -33,7 +33,8 @@ we assume all columns are molecular properties except for the SMILES column. - **Split Ratio**: `-sr a,b,c` can be used to specify the proportion of the dataset to be used for training, validation and test. By default we use `0.8,0.1,0.1`. - **Evaluation Metric**: `-me metric` can be used to specify the evaluation metric. -By default we use `r2` for Pearson correlation coefficient. +By default we use `r2` for Pearson correlation coefficient. Alternatively, you can use `mae` for mean absolute error, +and `rmse` for root mean square error. - **Num Epochs**: `-n number` can be used to specify the maximum number of epochs for training. By default we set this to 1000 as early stopping will be performed based on validation metric. - **Print Every**: `-pe number` decides that the training progress will be printed every `number` minibatches. 
By default
diff --git a/examples/property_prediction/csv_data_configuration/classification.py b/examples/property_prediction/csv_data_configuration/classification.py
index 0661751b..38216c3a 100644
--- a/examples/property_prediction/csv_data_configuration/classification.py
+++ b/examples/property_prediction/csv_data_configuration/classification.py
@@ -105,16 +105,20 @@ def objective(hyperparams):
         configure = deepcopy(args)
         configure.update(hyperparams)
         configure, val_metric = main(configure, train_set, val_set, test_set)
-        results.append((configure, hyperparams, val_metric))
 
         if args['metric'] in ['roc_auc_score']:
-            return -1 * val_metric
+            # Maximizing ROCAUC is equivalent to minimizing its negative
+            val_metric_to_minimize = -1 * val_metric
         else:
-            return val_metric
+            val_metric_to_minimize = val_metric
+
+        results.append((configure, hyperparams, val_metric_to_minimize))
+
+        return val_metric_to_minimize
 
     fmin(objective, candidate_hypers, algo=tpe.suggest, max_evals=args['num_evals'])
     results.sort(key=lambda tup: tup[2])
-    best_config, best_hyper, best_val_metric = results[-1]
+    best_config, best_hyper, best_val_metric = results[0]
     shutil.move(best_config['trial_path'], args['result_path'] + '/best')
 
     with open(args['result_path'] + '/best_config.txt', 'w') as f:

diff --git a/examples/property_prediction/csv_data_configuration/regression.py b/examples/property_prediction/csv_data_configuration/regression.py
index 962f22cb..1af14d6a 100644
--- a/examples/property_prediction/csv_data_configuration/regression.py
+++ b/examples/property_prediction/csv_data_configuration/regression.py
@@ -105,16 +105,20 @@ def objective(hyperparams):
         configure = deepcopy(args)
         configure.update(hyperparams)
         configure, val_metric = main(configure, train_set, val_set, test_set)
-        results.append((configure, hyperparams, val_metric))
 
         if args['metric'] in ['r2']:
-            return -1 * val_metric
+            # Maximizing R2 is equivalent to minimizing its negative
+            val_metric_to_minimize = -1 * val_metric
         else:
-            return val_metric
+            val_metric_to_minimize = val_metric
+
+        results.append((configure, hyperparams, val_metric_to_minimize))
+
+        return val_metric_to_minimize
 
     fmin(objective, candidate_hypers, algo=tpe.suggest, max_evals=args['num_evals'])
     results.sort(key=lambda tup: tup[2])
-    best_config, best_hyper, best_val_metric = results[-1]
+    best_config, best_hyper, best_val_metric = results[0]
     shutil.move(best_config['trial_path'], args['result_path'] + '/best')
 
     with open(args['result_path'] + '/best_config.txt', 'w') as f:

From b490b1210fd6138dd76845d8a420220bcff8f6b6 Mon Sep 17 00:00:00 2001
From: Mufei Li
Date: Wed, 24 Jun 2020 16:25:16 +0800
Subject: [PATCH 19/27] CI (#8)

* Remove outdated code

* Update

* Fix

* Test

* CI

From 03eb4b118660320e347b8d4f3401c48af01e2c76 Mon Sep 17 00:00:00 2001
From: YueZhong <37361694+YueZhong-bio@users.noreply.github.com>
Date: Sat, 25 Jul 2020 19:42:18 +0800
Subject: [PATCH 20/27] Update full_graph_link_predictor.py

Add node2vec as node embedding
---
 .../ogbl-ppa/full_graph_link_predictor.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/examples/link_prediction/ogbl-ppa/full_graph_link_predictor.py b/examples/link_prediction/ogbl-ppa/full_graph_link_predictor.py
index dc1d75d9..3f9385b4 100644
--- a/examples/link_prediction/ogbl-ppa/full_graph_link_predictor.py
+++ b/examples/link_prediction/ogbl-ppa/full_graph_link_predictor.py
@@ -129,6 +129,8 @@ def main():
                         help='Print training progress every {log_steps} epochs (default: 1)')
     parser.add_argument('--use_sage',
action='store_true',
                         help='Use GraphSAGE rather than GCN (default: False)')
+    parser.add_argument('--use_sage', action='store_true',
+                        help='Prepare node embeddings using node2vec (default: False)')
     parser.add_argument('--num_layers', type=int, default=3,
                         help='Number of GNN layers to use as well as '
                              'linear layers to use for final link prediction (default: 3)')
@@ -160,7 +162,12 @@ def main():
     data.readonly(False)
     data.add_edges(data.nodes(), data.nodes())
     splitted_edge = dataset.get_edge_split()
-    x = data.ndata['feat'].float().to(device)
+
+    if args.use_node_embedding:
+        x = torch.load('embedding.pt')
+        x = x.to(device)
+    else:
+        x = data.ndata['feat'].float().to(device)
 
     if args.use_sage:
         model = GraphSAGE(in_feats=x.size(-1),

From 53459adcd2e966fab1d368e6404f545a624b7d28 Mon Sep 17 00:00:00 2001
From: YueZhong <37361694+YueZhong-bio@users.noreply.github.com>
Date: Sat, 25 Jul 2020 19:47:54 +0800
Subject: [PATCH 21/27] Update README.md

---
 examples/link_prediction/ogbl-ppa/README.md | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/examples/link_prediction/ogbl-ppa/README.md b/examples/link_prediction/ogbl-ppa/README.md
index 9c267810..c5822585 100644
--- a/examples/link_prediction/ogbl-ppa/README.md
+++ b/examples/link_prediction/ogbl-ppa/README.md
@@ -27,6 +27,7 @@ The optional arguments are as follows:
 ```
 --use_gpu, use gpu for computation
 --use_sage, use GraphSAGE rather than GCN
+--use_node_embedding, prepare node embeddings using node2vec
 --num_layers, number of GNN layers to use as well as linear layers for final link prediction (default=3)
 --hidden_feats, size for hidden representations (default=256)
 --dropout, (default=0.0)
@@ -44,10 +45,11 @@ negative edges, and counting the ratio of positive edges that are ranked at the 100th
 Using the default parameters, the performance of 10 random runs is as follows.
-| Method | Train hits@100 | Validation hits@100 | Test hits@100 |
-| --------- | -------------- | ------------------- | ------------- |
-| GCN | 12.87 ± 5.07 | 12.39 ± 4.85 | 11.65 ± 4.56 |
-| GraphSAGE | 9.58 ± 0.99 | 9.44 ± 0.96 | 9.86 ± 1.21 |
+| Method | Train hits@100 | Validation hits@100 | Test hits@100 |
+| ----------- | -------------- | ------------------- | ------------- |
+| GCN | 23.95 ± 2.80 | 22.60 ± 2.59 | 21.30 ± 3.41 |
+| GraphSAGE | 9.58 ± 0.99 | 9.44 ± 0.96 | 9.86 ± 1.21 |
+| Node2vec+GCN | 27.98 ± 2.63 | 26.45 ± 2.49 | 25.81 ± 2.58 |
 
 | Method | Average Time (hour) / epoch |
 | --------- | --------------------------- |

From a518ea261000640e2bc3d9e7f990cda5d1ded7d8 Mon Sep 17 00:00:00 2001
From: YueZhong <37361694+YueZhong-bio@users.noreply.github.com>
Date: Sat, 25 Jul 2020 19:50:47 +0800
Subject: [PATCH 22/27] Update full_graph_link_predictor.py

Gradient clipping
---
 examples/link_prediction/ogbl-ppa/full_graph_link_predictor.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/examples/link_prediction/ogbl-ppa/full_graph_link_predictor.py b/examples/link_prediction/ogbl-ppa/full_graph_link_predictor.py
index 3f9385b4..318bd73f 100644
--- a/examples/link_prediction/ogbl-ppa/full_graph_link_predictor.py
+++ b/examples/link_prediction/ogbl-ppa/full_graph_link_predictor.py
@@ -39,6 +39,9 @@ def train(model, predictor, g, x, splitted_edge, optimizer, batch_size):
     loss = pos_loss + neg_loss
     optimizer.zero_grad()
     loss.backward()
+    # Gradient clipping to stabilize training
+    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+    torch.nn.utils.clip_grad_norm_(predictor.parameters(), 1.0)
     optimizer.step()

From 0a0fbf0e00ac941d0f3a01e5a7a2bad36f804d38 Mon Sep 17 00:00:00 2001
From: YueZhong <37361694+YueZhong-bio@users.noreply.github.com>
Date: Sat, 25 Jul 2020 21:49:41 +0800
Subject: [PATCH 23/27] Update full_graph_link_predictor.py

---
 examples/link_prediction/ogbl-ppa/full_graph_link_predictor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/link_prediction/ogbl-ppa/full_graph_link_predictor.py b/examples/link_prediction/ogbl-ppa/full_graph_link_predictor.py
index 318bd73f..44edce9e 100644
--- a/examples/link_prediction/ogbl-ppa/full_graph_link_predictor.py
+++ b/examples/link_prediction/ogbl-ppa/full_graph_link_predictor.py
@@ -132,7 +132,7 @@ def main():
                         help='Print training progress every {log_steps} epochs (default: 1)')
     parser.add_argument('--use_sage', action='store_true',
                         help='Use GraphSAGE rather than GCN (default: False)')
-    parser.add_argument('--use_sage', action='store_true',
+    parser.add_argument('--use_node_embedding', action='store_true',
                         help='Prepare node embeddings using node2vec (default: False)')
     parser.add_argument('--num_layers', type=int, default=3,
                         help='Number of GNN layers to use as well as '

From ac3c39441937527a3b70fca96c7a68ce9c1681b5 Mon Sep 17 00:00:00 2001
From: YueZhong <37361694+YueZhong-bio@users.noreply.github.com>
Date: Thu, 13 Aug 2020 13:39:31 +0800
Subject: [PATCH 24/27] Update README.md

---
 examples/link_prediction/ogbl-ppa/README.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/examples/link_prediction/ogbl-ppa/README.md b/examples/link_prediction/ogbl-ppa/README.md
index 4d3c5eb5..b2d4f2df 100644
--- a/examples/link_prediction/ogbl-ppa/README.md
+++ b/examples/link_prediction/ogbl-ppa/README.md
@@ -48,13 +48,14 @@ Using the default parameters, the performance of 10 random runs is as follows.
 | Method | Train hits@100 | Validation hits@100 | Test hits@100 |
 | ----------- | -------------- | ------------------- | ------------- |
 | GCN | 23.95 ± 2.80 | 22.60 ± 2.59 | 21.30 ± 3.41 |
-| GraphSAGE | 9.58 ± 0.99 | 9.44 ± 0.96 | 9.86 ± 1.21 |
+| GraphSAGE | 13.88 ± 1.73 | 13.06 ± 1.51 | 11.90 ± 1.34 |
 | Node2vec+GCN | 27.98 ± 2.63 | 26.45 ± 2.49 | 25.81 ± 2.58 |
 
-| Method | Average Time (hour) / epoch |
-| --------- | --------------------------- |
-| GCN | 1.38 |
-| GraphSAGE | 1.47 |
+| Method | Average Time (hour) / epoch |
+| ----------- | --------------------------- |
+| GCN | 1.25 |
+| GraphSAGE | 1.28 |
+| Node2vec+GCN | 1.29 |
 
 ## References

From cf92a7eca57d3b7009f0e5cf6655ef719ba4bf9f Mon Sep 17 00:00:00 2001
From: YueZhong <37361694+YueZhong-bio@users.noreply.github.com>
Date: Thu, 13 Aug 2020 13:59:21 +0800
Subject: [PATCH 25/27] Update README.md

---
 examples/link_prediction/ogbl-ppa/README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/examples/link_prediction/ogbl-ppa/README.md b/examples/link_prediction/ogbl-ppa/README.md
index b2d4f2df..d9fb3bde 100644
--- a/examples/link_prediction/ogbl-ppa/README.md
+++ b/examples/link_prediction/ogbl-ppa/README.md
@@ -38,6 +38,11 @@ The optional arguments are as follows:
 --runs, number of random experiments to perform (default=1)
 ```
+
+Full-batch GCN training based on Node2Vec features.
+To generate Node2Vec features, please run ```python node2vec.py```. The node embeddings will be saved in ```embedding.pt```.
+
+
 ## Performance

From d39018561c04b685dbe1ab7b16034d404bda7269 Mon Sep 17 00:00:00 2001
From: YueZhong <37361694+YueZhong-bio@users.noreply.github.com>
Date: Thu, 13 Aug 2020 14:02:18 +0800
Subject: [PATCH 26/27] Add files via upload

---
 examples/link_prediction/ogbl-ppa/node2vec.py | 203 ++++++++++++++++++
 1 file changed, 203 insertions(+)
 create mode 100644 examples/link_prediction/ogbl-ppa/node2vec.py

diff --git a/examples/link_prediction/ogbl-ppa/node2vec.py b/examples/link_prediction/ogbl-ppa/node2vec.py
new file mode 100644
index 00000000..f83bf75e
--- /dev/null
+++ b/examples/link_prediction/ogbl-ppa/node2vec.py
@@ -0,0 +1,203 @@
+import argparse
+import dgl
+
+import torch
+from torch.nn import Embedding
+from torch.utils.data import DataLoader
+from torch_sparse import SparseTensor
+from sklearn.linear_model import LogisticRegression
+
+from ogb.linkproppred import DglLinkPropPredDataset
+
+def save_embedding(model):
+    torch.save(model.embedding.weight.data.cpu(), 'embedding.pt')
+
+EPS = 1e-15
+
+
+class Node2Vec(torch.nn.Module):
+    r"""The Node2Vec model from the
+    `"node2vec: Scalable Feature Learning for Networks"
+    <https://arxiv.org/abs/1607.00653>`_ paper where random walks of
+    length :obj:`walk_length` are sampled in a given graph, and node embeddings
+    are learned via negative sampling optimization.
+    Args:
+        data: The graph.
+        edge_index (LongTensor): The edge indices.
+        embedding_dim (int): The size of each embedding vector.
+        walk_length (int): The walk length.
+        context_size (int): The actual context size which is considered for
+            positive samples. This parameter increases the effective sampling
+            rate by reusing samples across different source nodes.
+        walks_per_node (int, optional): The number of walks to sample for each
+            node. (default: :obj:`1`)
+        p (float, optional): Likelihood of immediately revisiting a node in the
+            walk.
(default: :obj:`1`) + q (float, optional): Control parameter to interpolate between + breadth-first strategy and depth-first strategy (default: :obj:`1`) + num_negative_samples (int, optional): The number of negative samples to + use for each positive sample. (default: :obj:`1`) + num_nodes (int, optional): The number of nodes. (default: :obj:`None`) + sparse (bool, optional): If set to :obj:`True`, gradients w.r.t. to the + weight matrix will be sparse. (default: :obj:`False`) + """ + def __init__(self, data,edge_index, embedding_dim, walk_length, context_size, + walks_per_node=1, p=1, q=1, num_negative_samples=1, + num_nodes=None, sparse=False): + super(Node2Vec, self).__init__() + + self.data = data + N = num_nodes + row, col = edge_index + self.adj = SparseTensor(row=row, col=col, sparse_sizes=(N, N)) + self.adj = self.adj.to('cpu') + + assert walk_length >= context_size + + self.embedding_dim = embedding_dim + self.walk_length = walk_length - 1 + self.context_size = context_size + self.walks_per_node = walks_per_node + self.p = p + self.q = q + self.num_negative_samples = num_negative_samples + + self.embedding = Embedding(N, embedding_dim, sparse=sparse) + + self.reset_parameters() + + def reset_parameters(self): + self.embedding.reset_parameters() + + def forward(self, batch=None): + """Returns the embeddings for the nodes in :obj:`batch`.""" + emb = self.embedding.weight + return emb if batch is None else emb[batch] + + def loader(self, **kwargs): + return DataLoader(range(self.adj.sparse_size(0)), + collate_fn=self.sample, **kwargs) + + def pos_sample(self, batch): + batch = batch.repeat(self.walks_per_node) + seed = torch.cat([torch.LongTensor(batch)] * 1) + rw = (dgl.sampling.random_walk(dgl.graph(self.data.edges()), seed, length=self.walk_length))[0] + + walks = [] + num_walks_per_rw = 1 + self.walk_length + 1 - self.context_size + for j in range(num_walks_per_rw): + walks.append(rw[:, j:j + self.context_size]) + + return torch.cat(walks, dim=0) + + def neg_sample(self, batch): + batch = batch.repeat(self.walks_per_node * self.num_negative_samples) + + rw = torch.randint(self.adj.sparse_size(0), + (batch.size(0), self.walk_length)) + rw = torch.cat([batch.view(-1, 1), rw], dim=-1) + + walks = [] + num_walks_per_rw = 1 + self.walk_length + 1 - self.context_size + for j in range(num_walks_per_rw): + walks.append(rw[:, j:j + self.context_size]) + return torch.cat(walks, dim=0) + + + def sample(self, batch): + if not isinstance(batch, torch.Tensor): + batch = torch.tensor(batch) + return self.pos_sample(batch), self.neg_sample(batch) + + def loss(self, pos_rw, neg_rw): + r"""Computes the loss given positive and negative random walks.""" + + # Positive loss. + start, rest = pos_rw[:, 0], pos_rw[:, 1:].contiguous() + + h_start = self.embedding(start).view(pos_rw.size(0), 1, + self.embedding_dim) + h_rest = self.embedding(rest.view(-1)).view(pos_rw.size(0), -1, + self.embedding_dim) + + out = (h_start * h_rest).sum(dim=-1).view(-1) + pos_loss = -torch.log(torch.sigmoid(out) + EPS).mean() + + # Negative loss. 
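+        # Mirror the positive term with corrupted pairs: neg_sample draws the
+        # "context" nodes uniformly at random, so their scores are pushed
+        # towards zero by the -log(1 - sigmoid(out)) objective below.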
+ start, rest = neg_rw[:, 0], neg_rw[:, 1:].contiguous() + + h_start = self.embedding(start).view(neg_rw.size(0), 1, + self.embedding_dim) + h_rest = self.embedding(rest.view(-1)).view(neg_rw.size(0), -1, + self.embedding_dim) + + out = (h_start * h_rest).sum(dim=-1).view(-1) + neg_loss = -torch.log(1 - torch.sigmoid(out) + EPS).mean() + + return pos_loss + neg_loss + + def test(self, train_z, train_y, test_z, test_y, solver='lbfgs', + multi_class='auto', *args, **kwargs): + r"""Evaluates latent space quality via a logistic regression downstream + task.""" + clf = LogisticRegression(solver=solver, multi_class=multi_class, *args, + **kwargs).fit(train_z.detach().cpu().numpy(), + train_y.detach().cpu().numpy()) + return clf.score(test_z.detach().cpu().numpy(), + test_y.detach().cpu().numpy()) + + def __repr__(self): + return '{}({}, {})'.format(self.__class__.__name__, + self.embedding.weight.size(0), + self.embedding.weight.size(1)) + +def main(): + parser = argparse.ArgumentParser(description='OGBL-PPA (Node2Vec)') + parser.add_argument('--device', type=int, default=0) + parser.add_argument('--embedding_dim', type=int, default=128) + parser.add_argument('--walk_length', type=int, default=40) + parser.add_argument('--context_size', type=int, default=20) + parser.add_argument('--walks_per_node', type=int, default=10) + parser.add_argument('--batch_size', type=int, default=256) + parser.add_argument('--lr', type=float, default=0.01) + parser.add_argument('--epochs', type=int, default=2) + parser.add_argument('--log_steps', type=int, default=1) + args = parser.parse_args() + + device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu' + device = torch.device(device) + + dataset = DglLinkPropPredDataset(name='ogbl-ppa') + data = dataset[0] + edge_index=torch.stack((data.edges()[0],data.edges()[1]),dim=0) + + model = Node2Vec(data, edge_index, args.embedding_dim, args.walk_length, + args.context_size, args.walks_per_node,num_nodes=data.number_of_nodes(), + sparse=True).to(device) + + loader = model.loader(batch_size=args.batch_size, shuffle=True, + num_workers=4) + optimizer = torch.optim.SparseAdam(model.parameters(), lr=args.lr) + + model.train() + for epoch in range(1, args.epochs + 1): + for i, (pos_rw, neg_rw) in enumerate(loader): + + optimizer.zero_grad() + loss = model.loss(pos_rw.to(device), neg_rw.to(device)) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + + optimizer.step() + + if (i + 1) % args.log_steps == 0: + print(f'Epoch: {epoch:02d}, Step: {i+1:03d}/{len(loader)}, ' + f'Loss: {loss:.4f}') + + if (i + 1) % 100 == 0: # Save model every 100 steps. + save_embedding(model) + save_embedding(model) + + +if __name__ == "__main__": + main() \ No newline at end of file From cb31eb6db18beb57c15f02b73876add39fb08f3b Mon Sep 17 00:00:00 2001 From: YueZhong <37361694+YueZhong-bio@users.noreply.github.com> Date: Thu, 13 Aug 2020 14:18:00 +0800 Subject: [PATCH 27/27] Update README.md --- examples/link_prediction/ogbl-ppa/README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/examples/link_prediction/ogbl-ppa/README.md b/examples/link_prediction/ogbl-ppa/README.md index d9fb3bde..2bce4cb5 100644 --- a/examples/link_prediction/ogbl-ppa/README.md +++ b/examples/link_prediction/ogbl-ppa/README.md @@ -42,6 +42,19 @@ The optional arguments are as follows: Full-batch GCN training based on Node2Vec features. To generate Node2Vec features, please run ```python node2vec.py```. 
The node embeddings will be saved in ```embedding.pt```.
+The optional arguments are as follows:
+
+```
+--embedding_dim, the size of each embedding vector (default=128)
+--walk_length, the walk length (default=40)
+--context_size, the actual context size which is considered for positive samples (default=20)
+--walks_per_node, the number of walks to sample for each node (default=10)
+--batch_size, batch size to use for sampling (default=256)
+--lr, learning rate (default=0.01)
+--epochs, number of epochs for training (default=2)
+--log_steps, print training progress every {log_steps} steps (default=1)
+```
+
 ## Performance
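For reference, the hits@100 numbers reported throughout this series can be computed from raw edge scores with the OGB `Evaluator` alone. The following is a minimal sketch, assuming `pos_preds` and `neg_preds` are 1-D score tensors like those assembled in `full_graph_link_predictor.py`; the random tensors below are hypothetical stand-ins:

```python
import torch
from ogb.linkproppred import Evaluator

# Hypothetical scores: one entry per positive/negative edge
pos_preds = torch.rand(1000)
neg_preds = torch.rand(3000000)

evaluator = Evaluator(name='ogbl-ppa')
evaluator.K = 100  # rank each positive edge against the sampled negatives
result = evaluator.eval({'y_pred_pos': pos_preds, 'y_pred_neg': neg_preds})
# Ratio of positive edges scored higher than the 100th highest negative score
print(result['hits@100'])
```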