Contents
1. Querying dataset attributes in PyG (Question 1: number of classes and number of features; Questions 2 and 3)
2. Using the OGB package (Question 4: How many features are in the ogbn-arxiv graph?)
① Using the torch.nn.ModuleList() and torch.nn.LogSoftmax() classes
② Using PyG's GCNConv() and BatchNorm() classes
④ torch.flatten(input, start_dim=0, end_dim=-1): flattens dimensions start_dim through end_dim into a single dimension and returns the result (a genuinely 1-D tensor only when every dimension is flattened)
⑤ Understanding the train function, def train(model, data, train_idx, optimizer, loss_fn):
⑦ Using out.argmax(...) inside test(...): see the code comments
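The Question 1–3 snippets below assume a dataset name `name` and a PyG dataset object `pyg_dataset` were created beforehand; a minimal sketch of that setup using PyG's TUDataset loader (the root path is an assumption):

- import os
- from torch_geometric.datasets import TUDataset
- 
- root = './enzymes'   # assumed download/cache directory
- name = 'ENZYMES'
- pyg_dataset = TUDataset(root, name)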
- # Question 1: What is the number of classes and number of features in the ENZYMES dataset?
- def get_num_classes(pyg_dataset):
-     # TODO: Implement a function that takes a PyG dataset object
-     # and returns the number of classes for that dataset.
- 
-     num_classes = 0
- 
-     ############# Your code here ############
-     ## (~1 line of code)
-     ## Note
-     ## 1. Colab autocomplete functionality might be useful.
- 
-     #########################################
-     num_classes = pyg_dataset.num_classes
-     return num_classes
-
- def get_num_features(pyg_dataset):
-     # TODO: Implement a function that takes a PyG dataset object
-     # and returns the number of features for that dataset.
- 
-     num_features = 0
- 
-     ############# Your code here ############
-     ## (~1 line of code)
-     ## Note
-     ## 1. Colab autocomplete functionality might be useful.
- 
-     #########################################
-     num_features = pyg_dataset.num_features
-     return num_features
-
- if 'IS_GRADESCOPE_ENV' not in os.environ:
-     num_classes = get_num_classes(pyg_dataset)
-     num_features = get_num_features(pyg_dataset)
-     print("{} dataset has {} classes".format(name, num_classes))
-     print("{} dataset has {} features".format(name, num_features))
- # Question 2: What is the label of the graph with index 100 in the ENZYMES dataset?
- def get_graph_class(pyg_dataset, idx):
-     # TODO: Implement a function that takes a PyG dataset object,
-     # an index of a graph within the dataset, and returns the class/label
-     # of the graph (as an integer).
- 
-     label = -1
- 
-     ############# Your code here ############
-     ## (~1 line of code)
- 
-     #########################################
-     label = pyg_dataset[idx].y.item()
-     return label
-
- # Here pyg_dataset is a dataset for graph classification
- if 'IS_GRADESCOPE_ENV' not in os.environ:
-     graph_0 = pyg_dataset[0]
-     print(graph_0)
-     idx = 100
-     label = get_graph_class(pyg_dataset, idx)
-     print('Graph with index {} has label {}'.format(idx, label))
- # Question 3: How many edges does the graph with index 200 have?
- 
- print(pyg_dataset[200].num_edges)  # raw count: each undirected edge is stored twice
- def get_graph_num_edges(pyg_dataset, idx):
-     # TODO: Implement a function that takes a PyG dataset object,
-     # the index of a graph in the dataset, and returns the number of
-     # edges in the graph (as an integer). You should not count an edge
-     # twice if the graph is undirected. For example, in an undirected
-     # graph G, if two nodes v and u are connected by an edge, this edge
-     # should only be counted once.
- 
-     num_edges = 0
- 
-     ############# Your code here ############
-     ## Note:
-     ## 1. You can't return the data.num_edges directly
-     ## 2. We assume the graph is undirected
-     ## 3. Look at the PyG dataset built in functions
-     ## (~4 lines of code)
- 
-     #########################################
-     # integer division: num_edges counts both directions of every edge
-     num_edges = pyg_dataset[idx].num_edges // 2
-     return num_edges
- if 'IS_GRADESCOPE_ENV' not in os.environ:
-     idx = 200
-     num_edges = get_graph_num_edges(pyg_dataset, idx)
-     print('Graph with index {} has {} edges'.format(idx, num_edges))
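A quick way to see the double counting (a sketch; PyG's edge_index stores u→v and v→u for every undirected edge):

- data = pyg_dataset[200]
- print(data.edge_index.shape)  # torch.Size([2, num_edges]): both directions stored
- print(data.is_undirected())   # True for ENZYMES graphs
- print(data.num_edges // 2)    # undirected edge count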
- # Question 4
- import torch
- import pandas as pd
- import os
- import torch_geometric.transforms as T
- from ogb.nodeproppred import PygNodePropPredDataset
-
- if 'IS_GRADESCOPE_ENV' not in os.environ:
-     dataset_name = 'ogbn-arxiv'
-     # Load the dataset and transform it to sparse tensor
-     dataset = PygNodePropPredDataset(name=dataset_name,
-                                      transform=T.ToSparseTensor(), root="./Arxiv")
-     print('The {} dataset has {} graph'.format(dataset_name, len(dataset)))
- 
-     # Extract the graph
-     data = dataset[0]
-     print(data)
-
- # Question 4
- def graph_num_features(data):
-     # TODO: Implement a function that takes a PyG data object,
-     # and returns the number of features in the graph (as an integer).
- 
-     num_features = 0
- 
-     ############# Your code here ############
-     ## (~1 line of code)
-     num_features = data.num_features
-     #########################################
- 
-     return num_features
-
- if 'IS_GRADESCOPE_ENV' not in os.environ:
-     num_features = graph_num_features(data)
-     print('The graph has {} features'.format(num_features))
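A side note on the transform: T.ToSparseTensor() replaces the graph's edge_index with a sparse adjacency matrix adj_t, which is why the model below is called as model(data.x, data.adj_t). The printed data object also answers Question 4: ogbn-arxiv node features are 128-dimensional. A quick check:

- print(data.adj_t)         # SparseTensor of size (num_nodes, num_nodes)
- print(data.num_features)  # 128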
The __init__(...) method defines the operations shown in the figure from the assignment, and forward(...) calls them in order, mapping the graph data (X, adj, etc.) to one output per node (N nodes in total).
Example: flatten dimensions 0 through 1 of a 3-D array; the merged dimensions are wrapped into a single leading dimension, giving what looks like a "1-D vector" of rows (see the analysis below — the result is actually still 2-D).
- import numpy as np
- import torch
- 
- x = np.arange(27)
- x = np.reshape(x, (3, 3, 3))
- x = torch.from_numpy(x)
- print('before flatten', x)
- x = torch.flatten(x, start_dim=0, end_dim=1)
- print('after flatten', x)
- 
- # Output: 'after flatten' is a tensor of shape (9, 3):
- # tensor([[ 0,  1,  2],
- #         [ 3,  4,  5],
- #         ...
- #         [24, 25, 26]])
Analysis:
First, removing a dimension increases the number of "variables": for example, if the variable is a 3×3 matrix, removing dimension 0 turns it into three 3-element vectors — the count goes from one to three.
Back to this example: flattening/removing dimension 0 breaks the grouping between x[0], x[1], and x[2] (the outermost brackets are dropped), and flattening dimension 1 works the same way. The result is a sequence of "variables" [0, 1, 2], [3, 4, 5], ..., which are finally wrapped into a single leading dimension, giving the output shown above — a tensor that is actually still 2-D.
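For contrast, continuing the example: calling torch.flatten with the default arguments collapses every dimension and does return a genuinely 1-D tensor:

- x = torch.flatten(x)  # no start_dim/end_dim: flatten everything
- print(x.shape)        # torch.Size([27])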
Purpose: run one epoch of training (one full pass over the data).
Process: model.train() (sets self.training, so that operations like dropout behave differently in training versus evaluation — see the training=self.training argument of F.dropout in the forward code below) → zero the optimizer's gradients → feed data through the model to get the output out → compute the loss from out and the data labels via loss_fn → backpropagate and let the optimizer update the model parameters.
The second argument of the loss function must be a 1-D vector, not 2-D: nll_loss expects a target of shape [N] for an input of shape [N, C], while ogbn-arxiv stores labels with shape [N, 1], so torch.flatten(...) is used to drop the extra dimension. A minimal sketch of that shape requirement follows.
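The sketch below uses invented toy values purely to illustrate the shapes:

- import torch
- import torch.nn.functional as F
- 
- out = torch.randn(4, 3).log_softmax(dim=-1)  # [N=4, C=3] log-probabilities
- y = torch.tensor([[0], [2], [1], [0]])       # labels stored as [N, 1]
- loss = F.nll_loss(out, torch.flatten(y))     # target must be 1-D: shape [N]
- print(loss.item())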
- from torch_geometric.nn import GCNConv, BatchNorm
- 
- class GCN(torch.nn.Module):
-     def __init__(self, input_dim, hidden_dim, output_dim, num_layers,
-                  dropout, return_embeds=False):
-         # TODO: Implement a function that initializes self.convs,
-         # self.bns, and self.softmax.
- 
-         super(GCN, self).__init__()
- 
-         # A list of GCNConv layers
-         self.convs = None
- 
-         # A list of 1D batch normalization layers
-         self.bns = None
- 
-         # The log softmax layer
-         self.softmax = None
- 
-         ############# Your code here ############
-         ## Note:
-         ## 1. You should use torch.nn.ModuleList for self.convs and self.bns
-         ## 2. self.convs has num_layers GCNConv layers
-         ## 3. self.bns has num_layers - 1 BatchNorm1d layers
-         ## 4. You should use torch.nn.LogSoftmax for self.softmax
-         ## 5. The parameters you can set for GCNConv include 'in_channels' and
-         ## 'out_channels'. For more information please refer to the documentation:
-         ## https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html#torch_geometric.nn.conv.GCNConv
-         ## 6. The only parameter you need to set for BatchNorm1d is 'num_features'
-         ## For more information please refer to the documentation:
-         ## https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm1d.html
-         ## (~10 lines of code)
- 
-         #########################################
-         self.convs = torch.nn.ModuleList()
-         self.bns = torch.nn.ModuleList()
-         self.softmax = torch.nn.LogSoftmax(dim=-1)
-         in_dim = input_dim
-         # num_layers - 1 hidden (conv + batch-norm) blocks ...
-         for i in range(num_layers - 1):
-             self.convs.append(GCNConv(in_dim, hidden_dim))
-             self.bns.append(BatchNorm(hidden_dim))
-             in_dim = hidden_dim
-         # ... plus a final conv that maps to output_dim
-         self.convs.append(GCNConv(hidden_dim, output_dim))
- 
-         # Probability of an element getting zeroed
-         self.dropout = dropout
- 
-         # Skip classification layer and return node embeddings
-         self.return_embeds = return_embeds
- 
-     def reset_parameters(self):
-         for conv in self.convs:
-             conv.reset_parameters()
-         for bn in self.bns:
-             bn.reset_parameters()
- 
-     def forward(self, x, adj_t):
-         # TODO: Implement a function that takes the feature tensor x and
-         # edge_index tensor adj_t and returns the output tensor as
-         # shown in the figure.
- 
-         out = None
- 
-         ############# Your code here ############
-         ## Note:
-         ## 1. Construct the network as shown in the figure
-         ## 2. torch.nn.functional.relu and torch.nn.functional.dropout are useful
-         ## For more information please refer to the documentation:
-         ## https://pytorch.org/docs/stable/nn.functional.html
-         ## 3. Don't forget to set F.dropout training to self.training
-         ## 4. If return_embeds is True, then skip the last softmax layer
-         ## (~7 lines of code)
- 
-         #########################################
-         out = x
-         for i in range(len(self.bns)):
-             out = self.convs[i](out, adj_t)
-             out = self.bns[i](out)
-             out = torch.nn.functional.relu(out)
-             out = torch.nn.functional.dropout(out, p=self.dropout, training=self.training)
- 
-         out = self.convs[-1](out, adj_t)
-         out = self.softmax(out) if not self.return_embeds else out
-         return out
-
-
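A quick sanity check of the wiring, on an invented toy graph (names and sizes are hypothetical; GCNConv also accepts a plain edge_index tensor in place of adj_t):

- toy_x = torch.randn(4, 8)                 # 4 nodes, 8 features each
- toy_edges = torch.tensor([[0, 1, 2, 3],
-                           [1, 0, 3, 2]])  # edge_index with both directions
- toy_model = GCN(8, 16, 3, num_layers=2, dropout=0.5)
- print(toy_model(toy_x, toy_edges).shape)  # torch.Size([4, 3]) of log-probabilities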
- def train(model, data, train_idx, optimizer, loss_fn):
-     # TODO: Implement a function that trains the model by
-     # using the given optimizer and loss_fn.
-     model.train()
-     loss = 0
- 
-     ############# Your code here ############
-     ## Note:
-     ## 1. Zero grad the optimizer
-     ## 2. Feed the data into the model
-     ## 3. Slice the model output and label by train_idx
-     ## 4. Feed the sliced output and label to loss_fn
-     ## (~4 lines of code)
- 
-     optimizer.zero_grad()
-     out = model(data.x, data.adj_t)
-     # data.y has shape [N, 1]; nll_loss needs a 1-D target, hence the flatten
-     loss = loss_fn(out[train_idx], torch.flatten(data.y[train_idx]))
-     #########################################
- 
-     loss.backward()
-     optimizer.step()
- 
-     return loss.item()
-
-
- # Test function here
- @torch.no_grad()
- def test(model, data, split_idx, evaluator, save_model_results=False):
-     # TODO: Implement a function that tests the model by
-     # using the given split_idx and evaluator.
-     model.eval()
- 
-     # The output of model on all data
-     out = None
- 
-     ############# Your code here ############
-     ## (~1 line of code)
-     ## Note:
-     ## 1. No index slicing here
- 
-     #########################################
-     out = model(data.x, data.adj_t)
- 
-     # argmax picks, along the given dim, the index of the maximum value;
-     # keepdim=True preserves the original number of dimensions.
-     # out is an n x m matrix (m = number of classes), and we want an n x 1
-     # prediction matrix, so argmax runs over the class dimension (dim=-1).
-     y_pred = out.argmax(dim=-1, keepdim=True)
- 
-     train_acc = evaluator.eval({
-         'y_true': data.y[split_idx['train']],
-         'y_pred': y_pred[split_idx['train']],
-     })['acc']
-     valid_acc = evaluator.eval({
-         'y_true': data.y[split_idx['valid']],
-         'y_pred': y_pred[split_idx['valid']],
-     })['acc']
-     test_acc = evaluator.eval({
-         'y_true': data.y[split_idx['test']],
-         'y_pred': y_pred[split_idx['test']],
-     })['acc']
- 
-     if save_model_results:
-         print("Saving Model Predictions")
- 
-         data = {}
-         # Flatten the 2-D y_pred to 1-D, move it off the GPU, detach it
-         # from the autograd graph, and convert to numpy
-         data['y_pred'] = y_pred.view(-1).cpu().detach().numpy()
- 
-         df = pd.DataFrame(data=data)
-         # Save locally as csv
-         df.to_csv('ogbn-arxiv_node.csv', sep=',', index=False)
- 
-     return train_acc, valid_acc, test_acc
-
-
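The cells below also use device, split_idx, train_idx, F, and Evaluator, which come from earlier cells of the notebook; a minimal sketch of the assumed setup, following the standard OGB pattern:

- import torch.nn.functional as F
- from ogb.nodeproppred import Evaluator
- 
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
- data = data.to(device)
- split_idx = dataset.get_idx_split()
- train_idx = split_idx['train'].to(device)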
- # Please do not change the args
- if 'IS_GRADESCOPE_ENV' not in os.environ:
-     args = {
-         'device': device,
-         'num_layers': 3,
-         'hidden_dim': 256,
-         'dropout': 0.5,
-         'lr': 0.01,
-         'epochs': 100,
-     }
- 
- if 'IS_GRADESCOPE_ENV' not in os.environ:
-     model = GCN(data.num_features, args['hidden_dim'],
-                 dataset.num_classes, args['num_layers'],
-                 args['dropout']).to(device)
-     evaluator = Evaluator(name='ogbn-arxiv')
- 
-
- import copy
- if 'IS_GRADESCOPE_ENV' not in os.environ:
-     # reset the parameters to initial random value
-     model.reset_parameters()
- 
-     optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])
-     loss_fn = F.nll_loss
- 
-     best_model = None
-     best_valid_acc = 0
- 
-     for epoch in range(1, 1 + args["epochs"]):
-         loss = train(model, data, train_idx, optimizer, loss_fn)
-         result = test(model, data, split_idx, evaluator)
-         train_acc, valid_acc, test_acc = result
-         if valid_acc > best_valid_acc:
-             best_valid_acc = valid_acc
-             best_model = copy.deepcopy(model)
-         print(f'Epoch: {epoch:02d}, '
-               f'Loss: {loss:.4f}, '
-               f'Train: {100 * train_acc:.2f}%, '
-               f'Valid: {100 * valid_acc:.2f}% '
-               f'Test: {100 * test_acc:.2f}%')
- 
- if 'IS_GRADESCOPE_ENV' not in os.environ:
-     best_result = test(best_model, data, split_idx, evaluator, save_model_results=True)
-     train_acc, valid_acc, test_acc = best_result
-     print(f'Best model: '
-           f'Train: {100 * train_acc:.2f}%, '
-           f'Valid: {100 * valid_acc:.2f}% '
-           f'Test: {100 * test_acc:.2f}%')
A graph-level convolution pass has three stages: first use the node-level GNN to compute node embeddings, then pool them per graph, and finally apply suitable linear and/or nonlinear transformations.
The coding of the __init__ and forward methods below follows exactly this recipe; a tiny illustration of the pooling step follows.
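In the sketch below (toy tensors invented for illustration), global_mean_pool averages node embeddings per graph, using the batch vector that maps each node to its graph:

- import torch
- from torch_geometric.nn import global_mean_pool
- 
- x = torch.randn(5, 8)                    # 5 node embeddings of size 8
- batch = torch.tensor([0, 0, 0, 1, 1])    # nodes 0-2 in graph 0, nodes 3-4 in graph 1
- print(global_mean_pool(x, batch).shape)  # torch.Size([2, 8])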
The tutorial first wraps the ogb dataset in DataLoader objects with a set batch size — and yes, DataLoader is PyG's standard batching mechanism: it groups batch_size graphs into a single Batch object.
A. Batch: one Batch behaves like a small dataset holding the information of batch_size ("32") graphs (see the sketch after this list).
B. Python list comprehensions: one of several pieces of syntactic sugar in the language (dict comprehensions, etc.).
C. A tensor index can be a list or a LongTensor, and also a boolean mask (as noted in the comments of the train function below, and in the sketch after this list).
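A short sketch of points A and C, reusing the ENZYMES pyg_dataset from Questions 1–3 (the toy tensor is invented for illustration):

- import torch
- from torch_geometric.loader import DataLoader
- 
- # A: a Batch concatenates batch_size graphs into one disconnected graph;
- # batch.batch maps every node to the index of its source graph.
- loader = DataLoader(pyg_dataset, batch_size=32, shuffle=True)
- batch = next(iter(loader))
- print(batch.num_graphs)  # 32
- 
- # C: list, LongTensor, and boolean-mask indexing all select elements
- t = torch.tensor([10, 20, 30, 40])
- print(t[[0, 2]])                                    # tensor([10, 30])
- print(t[torch.tensor([1, 3])])                      # tensor([20, 40])
- print(t[torch.tensor([True, False, True, False])])  # tensor([10, 30])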
- ### GCN to predict graph property
- from ogb.graphproppred.mol_encoder import AtomEncoder
- from torch_geometric.nn import global_mean_pool
- 
- class GCN_Graph(torch.nn.Module):
-     def __init__(self, hidden_dim, output_dim, num_layers, dropout):
-         super(GCN_Graph, self).__init__()
- 
-         # Load encoders for Atoms in molecule graphs
-         self.node_encoder = AtomEncoder(hidden_dim)
- 
-         # Node embedding model
-         # Note that the input_dim and output_dim are set to hidden_dim
-         self.gnn_node = GCN(hidden_dim, hidden_dim,
-                             hidden_dim, num_layers, dropout, return_embeds=True)
- 
-         self.pool = None
- 
-         ############# Your code here ############
-         ## Note:
-         ## 1. Initialize self.pool as a global mean pooling layer
-         ## For more information please refer to the documentation:
-         ## https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html#global-pooling-layers
- 
-         #########################################
-         self.pool = global_mean_pool
- 
-         # Output layer
-         self.linear = torch.nn.Linear(hidden_dim, output_dim)
- 
-     def reset_parameters(self):
-         self.gnn_node.reset_parameters()
-         self.linear.reset_parameters()
- 
-     def forward(self, batched_data):
-         # TODO: Implement a function that takes as input a
-         # mini-batch of graphs (torch_geometric.data.Batch) and
-         # returns the predicted graph property for each graph.
-         #
-         # NOTE: Since we are predicting graph level properties,
-         # your output will be a tensor with dimension equaling
-         # the number of graphs in the mini-batch
- 
-         # Extract important attributes of our mini-batch
-         x, edge_index, batch = batched_data.x, batched_data.edge_index, batched_data.batch
-         embed = self.node_encoder(x)
- 
-         out = None
- 
-         ############# Your code here ############
-         ## Note:
-         ## 1. Construct node embeddings using existing GCN model
-         ## 2. Use the global pooling layer to aggregate features for each individual graph
-         ## For more information please refer to the documentation:
-         ## https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html#global-pooling-layers
-         ## 3. Use a linear layer to predict each graph's property
-         ## (~3 lines of code)
-         out = self.gnn_node(embed, edge_index)
-         out = self.pool(out, batch)
-         out = self.linear(out)
-         #########################################
- 
-         return out
-
-
- def train(model, device, data_loader, optimizer, loss_fn):
- 
-     # TODO: Implement a function that trains your model by
-     # using the given optimizer and loss_fn.
-     model.train()
-     loss = 0
- 
-     for step, batch in enumerate(tqdm(data_loader, desc="Iteration")):
-         batch = batch.to(device)
-         # A Batch is effectively a small dataset holding batch_size (32) graphs
-         if batch.x.shape[0] == 1 or batch.batch[-1] == 0:
-             # Skip a batch that is a single isolated node or contains only one graph
-             pass
-         else:
-             ## ignore nan targets (unlabeled) when computing training loss.
-             is_labeled = batch.y == batch.y  # False only for NaN labels; shape [32, 1]
- 
-             ############# Your code here ############
-             ## Note:
-             ## 1. Zero grad the optimizer
-             ## 2. Feed the data into the model
-             ## 3. Use `is_labeled` mask to filter output and labels
-             ## 4. You may need to change the type of label to torch.float32
-             ## 5. Feed the output and label to the loss_fn
-             ## (~3 lines of code)
-             optimizer.zero_grad()
-             out = model(batch)
- 
-             # Python list comprehension (other syntactic sugar: dict comprehensions, etc.)
-             tmp_index = [index for index in range(is_labeled.shape[0]) if is_labeled[index]]
-             loss = loss_fn(out[tmp_index], batch.y[tmp_index].type(torch.float32))
- 
-             # Others simply index with is_labeled itself: tensor indices can be
-             # lists, LongTensors, or boolean masks, so out[is_labeled] also works.
- 
-             #########################################
- 
-             loss.backward()
-             optimizer.step()
- 
-     return loss.item()
-
-
- # The evaluation function
- def eval(model, device, loader, evaluator, save_model_results=False, save_file=None):
-     model.eval()
-     y_true = []
-     y_pred = []
- 
-     for step, batch in enumerate(tqdm(loader, desc="Iteration")):
-         batch = batch.to(device)
- 
-         if batch.x.shape[0] == 1:
-             pass
-         else:
-             with torch.no_grad():
-                 pred = model(batch)
- 
-             y_true.append(batch.y.view(pred.shape).detach().cpu())
-             y_pred.append(pred.detach().cpu())
- 
-     y_true = torch.cat(y_true, dim=0).numpy()
-     y_pred = torch.cat(y_pred, dim=0).numpy()
- 
-     input_dict = {"y_true": y_true, "y_pred": y_pred}
- 
-     if save_model_results:
-         print("Saving Model Predictions")
- 
-         # Create a pandas dataframe with two columns:
-         # y_pred | y_true
-         data = {}
-         data['y_pred'] = y_pred.reshape(-1)
-         data['y_true'] = y_true.reshape(-1)
- 
-         df = pd.DataFrame(data=data)
-         # Save to csv
-         df.to_csv('ogbg-molhiv_graph_' + save_file + '.csv', sep=',', index=False)
- 
-     return evaluator.eval(input_dict)
-
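The final cells assume dataset, the three loaders, and args exist from earlier in the notebook (args here reuses the dict defined for the node-level task above). A sketch of the assumed loader setup, with the standard OGB split and a batch size of 32:

- from ogb.graphproppred import PygGraphPropPredDataset, Evaluator
- from torch_geometric.loader import DataLoader
- from tqdm import tqdm
- 
- dataset = PygGraphPropPredDataset(name='ogbg-molhiv')
- split_idx = dataset.get_idx_split()
- train_loader = DataLoader(dataset[split_idx['train']], batch_size=32, shuffle=True)
- valid_loader = DataLoader(dataset[split_idx['valid']], batch_size=32, shuffle=False)
- test_loader = DataLoader(dataset[split_idx['test']], batch_size=32, shuffle=False)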
- if 'IS_GRADESCOPE_ENV' not in os.environ:
-     model = GCN_Graph(args['hidden_dim'],
-                       dataset.num_tasks, args['num_layers'],
-                       args['dropout']).to(device)
-     evaluator = Evaluator(name='ogbg-molhiv')
- 
- import copy
- 
- if 'IS_GRADESCOPE_ENV' not in os.environ:
-     model.reset_parameters()
- 
-     optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])
-     loss_fn = torch.nn.BCEWithLogitsLoss()
- 
-     best_model = None
-     best_valid_acc = 0
- 
-     for epoch in range(1, 1 + args["epochs"]):
-         print('Training...')
-         loss = train(model, device, train_loader, optimizer, loss_fn)
- 
-         print('Evaluating...')
-         train_result = eval(model, device, train_loader, evaluator)
-         val_result = eval(model, device, valid_loader, evaluator)
-         test_result = eval(model, device, test_loader, evaluator)
- 
-         train_acc, valid_acc, test_acc = (train_result[dataset.eval_metric],
-                                           val_result[dataset.eval_metric],
-                                           test_result[dataset.eval_metric])
-         if valid_acc > best_valid_acc:
-             best_valid_acc = valid_acc
-             best_model = copy.deepcopy(model)
-         print(f'Epoch: {epoch:02d}, '
-               f'Loss: {loss:.4f}, '
-               f'Train: {100 * train_acc:.2f}%, '
-               f'Valid: {100 * valid_acc:.2f}% '
-               f'Test: {100 * test_acc:.2f}%')
End...