• DeepWalk


    来自 GraphEmbedding


    1.run

    G = nx.read_edgelist('./data/wiki/Wiki_edgelist.txt',
                             create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])
    
    • 1
    • 2

    1.1.read_edgelist

    @open_file(0, mode="rb")
    def read_edgelist(
        path,
        comments="#",
        delimiter=None,
        create_using=None,
        nodetype=None,
        data=True,
        edgetype=None,
        encoding="utf-8",
    ):
            lines = (line if isinstance(line, str) else line.decode(encoding) for line in path)
        return parse_edgelist(
            lines,
            comments=comments,
            delimiter=delimiter,
            create_using=create_using,
            nodetype=nodetype,
            data=data,
        )
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • open_file 是 nx 自定义的装饰器,下面是解释作用
    Normally, we'd want to use "with" to ensure that fobj gets closed.
    However, the decorator will make `path` a file object for us,
    and using "with" would undesirably close that file object.
    Instead, we use a try block, as shown above.
    When we exit the function, fobj will be closed, if it should be, by the decorator.
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 首先将文件按行为元素放到 lines 元组中,之后传入 parse_edgelist

    1.2.parse_edgelist

    def parse_edgelist(
        lines, comments="#", delimiter=None, create_using=None, nodetype=None, data=True
    ):
        """Build a graph from an iterable of edge-list lines.

        Each usable line is ``u v [data...]``. `comments` starts a trailing
        comment, `delimiter` splits fields (None = any whitespace). `data` may
        be True (parse the remainder as a dict literal), False (ignore it), or
        a list of (key, type) pairs naming and typing each extra field.
        """
        from ast import literal_eval

        G = nx.empty_graph(0, create_using)
        for line in lines:
            # Drop any trailing comment before splitting.
            comment_start = line.find(comments)
            if comment_start >= 0:
                line = line[:comment_start]
            if not line:
                continue
            tokens = line.strip().split(delimiter)
            if len(tokens) < 2:
                continue  # an edge needs at least two endpoints
            u, v, *d = tokens
            if nodetype is not None:
                try:
                    u = nodetype(u)
                    v = nodetype(v)
                except Exception as e:
                    raise TypeError(
                        f"Failed to convert nodes {u},{v} to type {nodetype}."
                    ) from e

            if len(d) == 0 or data is False:
                # No extra fields, or caller asked to ignore them.
                edgedata = {}
            elif data is True:
                # Re-join the remaining tokens and evaluate them as a dict
                # literal, e.g. "{'weight': 3}".
                joiner = "," if delimiter == "," else " "
                try:
                    edgedata = dict(literal_eval(joiner.join(d).strip()))
                except Exception as e:
                    raise TypeError(
                        f"Failed to convert edge data ({d}) to dictionary."
                    ) from e
            else:
                # `data` is a list of (key, type): convert field by field.
                if len(d) != len(data):
                    raise IndexError(
                        f"Edge data {d} and data_keys {data} are not the same length"
                    )
                edgedata = {}
                for (edge_key, edge_type), edge_value in zip(data, d):
                    try:
                        edgedata[edge_key] = edge_type(edge_value)
                    except Exception as e:
                        raise TypeError(
                            f"Failed to convert {edge_key} data {edge_value} "
                            f"to type {edge_type}."
                        ) from e
            G.add_edge(u, v, **edgedata)
        return G
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 调用 empty_graph 构造空图对象,然后将每行 string 切分成字段,如果该行至少有两个字段,就以前两个字段为端点调用 add_edge 添加边,最后返回图
    • 这里使用的文件没有 weight 权重数据,如果有的话会被存储在 d 中,然后转化成 edge_type 类型(这里是 int),以字典形式存储在 edgedata 中,作为参数传入 add_edge

    1.3.empty_graph

    @nodes_or_number(0)
    def empty_graph(n=0, create_using=None, default=Graph):
        """Return a graph with the given nodes and no edges.

        :param n: node spec; the decorator turns it into (name, node iterable)
        :param create_using: None, an existing NetworkX-style graph instance
            (detected via its `_adj` attribute, then cleared and reused), or a
            graph constructor to call
        :param default: constructor used when create_using is None
        """
        if create_using is None:
            graph = default()
        elif hasattr(create_using, "_adj"):
            # Existing NetworkX-style graph: wipe it and reuse the instance.
            create_using.clear()
            graph = create_using
        else:
            # Assume create_using is a constructor (e.g. nx.DiGraph).
            graph = create_using()

        _node_label, node_iter = n
        graph.add_nodes_from(node_iter)
        return graph
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 这里传入的 create_using 是构造函数 nx.DiGraph(),返回创建的空图

    2.DeepWalk

    model = DeepWalk(G, walk_length=10, num_walks=80, workers=1)
    
    • 1

    2.1.RandomWalker

    class DeepWalk:
        """DeepWalk: uniform random walks fed to a skip-gram Word2Vec model."""

        def __init__(self, graph, walk_length, num_walks, workers=1):
            """Generate the walk corpus for `graph` at construction time.

            :param graph: graph to embed
            :param walk_length: number of nodes per walk
            :param num_walks: walks started from every node
            :param workers: parallel workers for walk generation
            """
            self.graph = graph
            self.w2v_model = None   # set by train()
            self._embeddings = {}   # filled by get_embeddings()

            # p = q = 1 degenerates node2vec's biased walk into DeepWalk's
            # plain uniform random walk.
            self.walker = RandomWalker(graph, p=1, q=1)
            self.sentences = self.walker.simulate_walks(
                num_walks=num_walks,
                walk_length=walk_length,
                workers=workers,
                verbose=1,
            )
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    class RandomWalker:
        """Random-walk generator shared by DeepWalk and node2vec."""

        def __init__(self, G, p=1, q=1, use_rejection_sampling=False):
            """Store the graph and the node2vec bias parameters.

            :param G: graph to walk on
            :param p: return parameter — controls the likelihood of
                immediately revisiting a node in the walk
            :param q: in-out parameter — lets the search differentiate
                between "inward" and "outward" nodes
            :param use_rejection_sampling: whether to use node2vec's
                rejection-sampling strategy
            """
            self.G = G
            self.p = p
            self.q = q
            self.use_rejection_sampling = use_rejection_sampling
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • RandomWalker 是为 node2vec 实现的类,当 p,q 都为1时变成 deepwalk

    2.2.simulate_walks

    def simulate_walks(self, num_walks, walk_length, workers=1, verbose=0):
        """Run `num_walks` rounds of walks of length `walk_length` from every node.

        The rounds are split across `workers` joblib processes via
        partition_num; each worker runs its share and the per-worker lists
        are concatenated.

        :returns: list of walks, each walk a list of node ids
        """
        G = self.G

        nodes = list(G.nodes())

        results = Parallel(n_jobs=workers, verbose=verbose, )(
            delayed(self._simulate_walks)(nodes, num, walk_length) for num in
            partition_num(num_walks, workers))

        walks = list(itertools.chain(*results))

        # BUG FIX: the walks were built but never returned, so callers
        # (e.g. DeepWalk.__init__ storing self.sentences) received None.
        return walks
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 实现并行
    def _simulate_walks(self, nodes, num_walks, walk_length, ):
        """Worker body: run `num_walks` shuffled passes over `nodes`.

        The node order is reshuffled in place before each pass, then one walk
        is started from every node.  p == q == 1 selects the plain DeepWalk
        walk; otherwise a node2vec walk (rejection-sampling variant when
        enabled).
        """
        walks = []
        for _round in range(num_walks):
            random.shuffle(nodes)  # in-place reshuffle each pass
            for start in nodes:
                if self.p == 1 and self.q == 1:
                    walk = self.deepwalk_walk(
                        walk_length=walk_length, start_node=start)
                elif self.use_rejection_sampling:
                    walk = self.node2vec_walk2(
                        walk_length=walk_length, start_node=start)
                else:
                    walk = self.node2vec_walk(
                        walk_length=walk_length, start_node=start)
                walks.append(walk)
        return walks
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 以每个节点为起点生成长度为 walk_length 的游走序列,整个过程循环 num_walks 次

    2.3.deepwalk_walk

    def deepwalk_walk(self, walk_length, start_node):
        """Return one uniform random walk of at most `walk_length` nodes.

        The walk starts at `start_node` and repeatedly hops to a uniformly
        chosen neighbor of the current node; it stops early if it reaches a
        node with no neighbors.
        """
        walk = [start_node]
        while len(walk) < walk_length:
            neighbors = list(self.G.neighbors(walk[-1]))
            if not neighbors:
                break  # dead end: nowhere to continue the walk
            walk.append(random.choice(neighbors))
        return walk
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 产生一条随机游走序列

    3.train

    model.train(window_size=5, iter=3,embed_size=128)
    def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs):
        """Fit a skip-gram Word2Vec model on the generated walk corpus.

        :param embed_size: embedding dimension (gensim `vector_size`)
        :param window_size: skip-gram context window
        :param workers: training threads
        :param iter: number of training epochs (gensim `epochs`)
        :param kwargs: extra Word2Vec keyword arguments
        :returns: the fitted gensim Word2Vec model
        """
        # NOTE: gensim >= 4.0 renamed these parameters to `vector_size` /
        # `epochs` (formerly `size` / `iter`).
        params = dict(kwargs)
        params["min_count"] = params.get("min_count", 0)  # keep every node
        params.update(
            sentences=self.sentences,
            vector_size=embed_size,
            sg=1,  # skip gram
            hs=1,  # deepwalk uses Hierarchical Softmax
            workers=workers,
            window=window_size,
            epochs=iter,
        )

        print("Learning embedding vectors...")
        w2v = Word2Vec(**params)
        print("Learning embedding vectors done!")

        self.w2v_model = w2v
        return w2v
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 构造word2vec模型返回

    4.get_embeddings

    embeddings = model.get_embeddings()
    def get_embeddings(self, ):
        """Return {node: embedding vector} for every node in the graph.

        Requires train() to have been called first; otherwise prints a
        warning and returns an empty dict.
        """
        if self.w2v_model is None:
            print("model not train")
            return {}

        self._embeddings = {
            node: self.w2v_model.wv[node] for node in self.graph.nodes()
        }
        return self._embeddings
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11

    Word2Vec.wv以word为索引拿到vec

    5.evaluate_embeddings

    evaluate_embeddings(embeddings)
    def evaluate_embeddings(embeddings):
        """Train a logistic-regression node classifier on 80% of the labeled
        nodes and print F1 (several averages) plus accuracy on the rest."""
        node_ids, labels = read_node_label('./data/wiki/wiki_labels.txt')
        train_fraction = 0.8
        print("Training classifier using {:.2f}% nodes...".format(
            train_fraction * 100))
        classifier = Classifier(embeddings=embeddings, clf=LogisticRegression())
        classifier.split_train_evaluate(node_ids, labels, train_fraction)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8

    5.1.read_node_label

    def read_node_label(filename, skip_head=False):
        """Read space-separated node labels from a file.

        Each line has the form ``<node_id> <label> [<label> ...]``.

        :param filename: path to the label file
        :param skip_head: if True, skip the first (header) line
        :returns: (X, Y) — X is the list of node ids, Y the list of label
            lists, both in file order
        """
        X = []
        Y = []
        # `with` guarantees the handle is closed even on error.
        with open(filename, 'r') as fin:
            if skip_head:
                # BUG FIX: the header must be skipped exactly once. The
                # original checked skip_head inside the read loop, discarding
                # a line before EVERY read and so dropping half the data
                # whenever skip_head was True.
                fin.readline()
            for line in fin:
                vec = line.strip().split(' ')
                if vec == ['']:
                    continue  # tolerate blank lines instead of recording empty ids
                X.append(vec[0])
                Y.append(vec[1:])
        return X, Y
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 读取节点编号 x 和节点的label y,存入向量X,Y

    5.2.Classifier

    class Classifier(object):
        """Multi-label node classifier evaluated on top of fixed embeddings.

        Wraps an sklearn estimator in TopKRanker (one-vs-rest with top-k label
        selection) and binarizes label lists with MultiLabelBinarizer.
        """

        def __init__(self, embeddings, clf):
            self.embeddings = embeddings
            self.clf = TopKRanker(clf)
            self.binarizer = MultiLabelBinarizer(sparse_output=True)

        def train(self, X, Y, Y_all):
            """Fit on nodes X with label lists Y; Y_all fixes the label space."""
            self.binarizer.fit(Y_all)
            features = [self.embeddings[node] for node in X]
            self.clf.fit(features, self.binarizer.transform(Y))

        def evaluate(self, X, Y):
            """Predict, then print and return F1 scores and accuracy."""
            # Ask the ranker for exactly as many labels per node as the
            # ground truth has (standard multi-label evaluation protocol).
            top_k_list = [len(labels) for labels in Y]
            Y_pred = self.predict(X, top_k_list)
            Y_true = self.binarizer.transform(Y)
            results = {
                average: f1_score(Y_true, Y_pred, average=average)
                for average in ["micro", "macro", "samples", "weighted"]
            }
            results['acc'] = accuracy_score(Y_true, Y_pred)
            print('-------------------')
            print(results)
            return results

        def predict(self, X, top_k_list):
            """Return binarized top-k label predictions for the nodes in X."""
            features = numpy.asarray([self.embeddings[node] for node in X])
            return self.clf.predict(features, top_k_list=top_k_list)

        def split_train_evaluate(self, X, Y, train_precent, seed=0):
            """Reproducibly shuffle-split (via `seed`), train, and evaluate."""
            state = numpy.random.get_state()  # preserve caller's RNG state

            training_size = int(train_precent * len(X))
            numpy.random.seed(seed)
            shuffled = numpy.random.permutation(numpy.arange(len(X)))
            train_idx = shuffled[:training_size]
            test_idx = shuffled[training_size:]
            X_train = [X[i] for i in train_idx]
            Y_train = [Y[i] for i in train_idx]
            X_test = [X[i] for i in test_idx]
            Y_test = [Y[i] for i in test_idx]

            self.train(X_train, Y_train, Y)
            numpy.random.set_state(state)  # restore so later code is unaffected
            return self.evaluate(X_test, Y_test)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45

    5.3.self.train

    def train(self, X, Y, Y_all):
        """Fit the one-vs-rest classifier on embedded nodes.

        :param X: node ids to train on
        :param Y: label lists aligned with X
        :param Y_all: every label list in the dataset — fixes the binarizer's
            label space so train and test encodings agree
        """
        self.binarizer.fit(Y_all)
        features = [self.embeddings[node] for node in X]
        self.clf.fit(features, self.binarizer.transform(Y))
    
    • 1
    • 2
    • 3
    • 4
    • 5

    MultiLabelBinarizer把标签转化为len(classType)长度的label向量,只有 y 类的位置为1,其他地方为0

    5.3.1.TopKRanker

    class TopKRanker(OneVsRestClassifier):
        """One-vs-rest classifier whose predict returns the top-k labels per row."""

        def predict(self, X, top_k_list):
            """For row i, mark the top_k_list[i] highest-probability classes as 1.

            # NOTE(review): `row[top_classes] = 1` indexes the row by class
            # label, which is only correct when classes are 0..n-1 ints
            # (true for this wiki dataset) — confirm before reusing elsewhere.
            """
            probs = numpy.asarray(super(TopKRanker, self).predict_proba(X))
            predictions = []
            for row, k in zip(probs, top_k_list):
                top_classes = self.classes_[row.argsort()[-k:]].tolist()
                row[:] = 0
                row[top_classes] = 1
                predictions.append(row)
            return numpy.asarray(predictions)
    
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 以 LogisticRegression 为 estimator 构建 OneVsRestClassifier 模型,再由 TopKRanker 类继承,自定义 predict 的实现

    5.4.self.evaluate

    def evaluate(self, X, Y):
        """Score predictions against ground truth and print the results.

        Computes F1 under several averaging schemes plus subset accuracy;
        returns the results dict.
        """
        # One prediction slot per true label: rank the top-len(labels)
        # classes for each node.
        top_k_list = [len(labels) for labels in Y]
        Y_pred = self.predict(X, top_k_list)
        Y_true = self.binarizer.transform(Y)
        results = {
            average: f1_score(Y_true, Y_pred, average=average)
            for average in ["micro", "macro", "samples", "weighted"]
        }
        results['acc'] = accuracy_score(Y_true, Y_pred)
        print('-------------------')
        print(results)
        return results
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12

    6.plot_embeddings

    def plot_embeddings(embeddings,):
        """Project the embeddings to 2-D with t-SNE and scatter-plot them,
        colored by each node's first label."""
        node_ids, labels = read_node_label('./data/wiki/wiki_labels.txt')

        emb_matrix = np.array([embeddings[node] for node in node_ids])

        tsne = TSNE(n_components=2)
        node_pos = tsne.fit_transform(emb_matrix)

        # Group node indices by first label so each class gets one color.
        color_idx = {}
        for i, _node in enumerate(node_ids):
            color_idx.setdefault(labels[i][0], []).append(i)

        for label, idx in color_idx.items():
            plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=label)
        plt.legend()
        plt.show()
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
  • 相关阅读:
    2021 XV6 5:Copy-on-Write Fork
    【数据结构】12.排序
    【动画进阶】有意思的 Emoji 3D 表情切换效果
    RocketMQ 4.5.1安装教程
    如何看待著名游戏引擎 Unity 宣布将更改收费模式,收取「运行时费用」?这将造成哪些影响?
    c++_learning-并发与多线程
    【代码随想录刷题】Day15 二叉树02------延伸题目练习
    卷积神经网络(1)
    货币银行学简答论述题
    要点初见:切换老版本Rust并运行老版本cargo
  • 原文地址:https://blog.csdn.net/weixin_52812620/article/details/126431584