# 图上 deepwalk 算法理论与实战，图算法之瑞士军刀篇(一)

### (2) 代码时光

#### (2.1) 导包

``````@ 欢迎关注作者公众号 算法全栈之路

import networkx as nx
import numpy as np
import os
import pandas as pd
import tensorflow.compat.v1 as tf
import time
import itertools
import math
import pandas as pd
import random
from joblib import Parallel, delayed
tf.compat.v1.disable_eager_execution()

#### (2.2) 数据准备

``````@ 欢迎关注作者公众号 算法全栈之路

# Build a toy edge list: three directed edges (src -> dst), each with unit weight.
graph_df = pd.DataFrame(
    [['Tom', 'pig'], ['Nancy', 'Tom'], ['Jack', 'Nancy']],
    columns=['src', 'dst'],
    index=['0', '1', '2'],
)
graph_df['weight'] = 1.0
print(graph_df)

#### (2.3) 同构图节点编码

``````
@欢迎关注微信公众号：算法全栈之路

#编码方法
def encode_map(input_array):
p_map={}
length=len(input_array)
for index, ele in zip(range(length),input_array):
# print(ele,index)
p_map[str(ele)] = index
return p_map

#解码方法
def decode_map(encode_map):
de_map={}
for k,v in encode_map.items():
# index,ele
de_map[v]=k
return de_map

print(type(graph_df['src'].values))

# Build encode/decode maps over the union of all node names.
all_node_names = set(np.append(graph_df['src'].values, graph_df['dst'].values, axis=0))
node_encode_map = encode_map(all_node_names)
node_decode_map = decode_map(node_encode_map)

print(len(node_encode_map))

# Apply the encoding: add integer node-id columns (-1 marks an unseen node).
for column in ('src', 'dst'):
    graph_df[column + '_node_encoded'] = graph_df[column].apply(
        lambda e: node_encode_map.get(str(e), -1))

print(graph_df)

#### (2.4) networkx 构图

``````@欢迎关注微信公众号：算法全栈之路

# Build an (undirected) weighted graph from the encoded edge list.
G = nx.from_pandas_edgelist(
    graph_df, 'src_node_encoded', 'dst_node_encoded', edge_attr=['weight'])
print(G)

#### （2.5） random walk 游走算法采样

``````
@欢迎关注微信公众号：算法全栈之路

def partition_num(num, workers):
    """Split *num* walks into per-worker chunk sizes.

    Returns ``workers`` equal chunks, plus one extra chunk holding the
    remainder when *num* is not evenly divisible.
    """
    quotient, remainder = divmod(num, workers)
    chunks = [quotient] * workers
    if remainder:
        chunks.append(remainder)
    return chunks

class RandomWalker:
    """Uniform random-walk sampler over a networkx-style graph."""

    def __init__(self, G):
        """
        :param G: graph object exposing ``neighbors(node)`` and ``nodes()``
        """
        self.G = G

    def deepwalk_walk(self, walk_length, start_node):
        """Generate one walk of at most *walk_length* nodes from *start_node*."""
        walk = [start_node]
        while len(walk) < walk_length:
            neighbors = list(self.G.neighbors(walk[-1]))
            if not neighbors:
                break  # dead end: terminate the walk early
            walk.append(random.choice(neighbors))
        return walk

    def simulate_walks(self, num_walks, walk_length, workers=1, verbose=0):
        """
        :param num_walks: number of walk rounds (each round starts from every node)
        :param walk_length: maximum length of a single random walk
        :param workers: number of parallel jobs
        :param verbose: joblib verbosity level
        :return: list of walks, each a list of nodes
        """
        nodes = list(self.G.nodes())
        # Fan the rounds out across workers, then flatten the partial results.
        partials = Parallel(n_jobs=workers, verbose=verbose)(
            delayed(self._simulate_walks)(nodes, rounds, walk_length)
            for rounds in partition_num(num_walks, workers)
        )
        walks = list(itertools.chain(*partials))

        # Serial alternative kept for reference:
        # walks = self._simulate_walks(nodes, num_walks, walk_length)
        print("walks_len:", len(walks))
        return walks

    def _simulate_walks(self, nodes, num_walks, walk_length):
        """Run *num_walks* rounds serially; each round walks from every node."""
        walks = []
        for _ in range(num_walks):
            random.shuffle(nodes)  # randomize the start order each round
            for node in nodes:
                walks.append(
                    self.deepwalk_walk(walk_length=walk_length, start_node=node))
        return walks

# Run the random-walk sampler: 6 rounds over every node, walks capped at
# 3 nodes, sampled by 2 parallel workers.
walker = RandomWalker(G)
session_reproduce = walker.simulate_walks(
    num_walks=6, walk_length=3, workers=2, verbose=1)

#### （2.6） skip gram 样本构造

``````
@欢迎关注微信公众号：算法全栈之路

# Skip-gram sample construction: pair every node in a walk with each distinct
# node inside a +/- window_size context window around it.
window_size = 2
all_pairs = []
# Keep only walks long enough to produce context pairs.
session_reproduce = list(filter(lambda x: len(x) > 2, session_reproduce))

for walk in session_reproduce:
    for i in range(len(walk)):
        for j in range(i - window_size, i + window_size + 1):
            # Skip out-of-range positions, the center itself, and self-pairs.
            if i == j or j < 0 or j >= len(walk) or walk[i] == walk[j]:
                continue
            all_pairs.append([walk[i], walk[j]])

# BUGFIX: np.str was deprecated in NumPy 1.20 and removed in 1.24; the
# builtin str dtype is the supported spelling.
np.savetxt('./all_pairs.csv', X=np.array(all_pairs, dtype=str), fmt="%s", delimiter="\t")

#### (2.7) 模型构建

``````
@欢迎关注微信公众号：算法全栈之路

# batch_size: number of (target, context) pairs fed to the model per batch
# embedding_size: dimensionality of each learned embedding vector
# skip_window: number of context words considered on each side of the target
# num_sampled: number of negative samples drawn by the NCE loss

vocabulary_size = 10000
embedding_size = 16
batch_size = 4
skip_window = 2
num_sampled = 2

# train_inputs holds center words; train_labels holds a context word drawn
# from the sliding window around each center word.
train_inputs = tf.compat.v1.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.compat.v1.placeholder(tf.int32, shape=[batch_size, 1])

# The embedding matrix to be learned: one row per vocabulary entry,
# initialized uniformly in [-1, 1).
embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

# NCE output weights and biases, one row/entry per vocabulary word.
nce_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                        stddev=1.0 / math.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# NCE loss as used for training the skip-gram model.
loss = tf.reduce_mean(
    tf.nn.nce_loss(
        weights=nce_weights,
        biases=nce_biases,
        labels=train_labels,
        inputs=embed,
        num_sampled=num_sampled,
        num_classes=vocabulary_size,
    )
)

#### （2.8）数据批量生成与模型训练

``````@欢迎关注微信公众号：算法全栈之路

# Flatten the sampled pairs into center/context lists.
all_pairs_df = pd.DataFrame(all_pairs, columns=["src", "dst"])
center_word = all_pairs_df["src"].tolist()
context_label = all_pairs_df["dst"].tolist()

n_epochs = 1
learning_rate = 0.1

def generate_sample(df):
    """Convert a DataFrame with 'src'/'dst' columns into an (N, 2) ndarray."""
    pairs = [[row['src'], row['dst']] for _, row in df.iterrows()]
    return np.array(pairs)

def get_batch(pair, batch_size, num_features=1):
    """Endlessly yield (centers, labels) training batches sampled from *pair*.

    :param pair: (N, 2) integer array of [center, context] pairs, N >= batch_size
    :param batch_size: number of pairs per batch
    :param num_features: unused; kept for interface compatibility
    :yield: tuple (batch, labels) of shapes (batch_size,) and (batch_size, 1)
    """
    while True:
        # BUGFIX: np.random.randint requires high > low, so the original
        # crashed when len(pair) == batch_size; clamp the high bound to 1.
        start_idx = np.random.randint(0, max(len(pair) - batch_size, 1))
        batch_idx = np.array(range(start_idx, start_idx + batch_size))
        batch_idx = np.random.permutation(batch_idx)  # shuffle within the slice
        batch = np.zeros((batch_size), dtype=np.int64)
        labels = np.zeros((batch_size, 1), dtype=np.int64)
        # NOTE: labels must stay 2-D, shape (batch_size, 1) — tf.nn.nce_loss
        # requires that shape; flattening it here is the classic pitfall the
        # original comment warned about.
        batch[:] = pair[batch_idx, 0]
        labels[:, 0] = pair[batch_idx, 1]
        yield batch, labels

# BUGFIX: BATCH_SIZE, NUM_TRAIN_STEPS, SKIP_STEP and optimizer were referenced
# below but never defined anywhere, so the script crashed with NameError.
BATCH_SIZE = batch_size  # must match the placeholder shape defined above
NUM_TRAIN_STEPS = 100    # total SGD steps
SKIP_STEP = 10           # report average loss every SKIP_STEP steps

# Plain SGD on the NCE loss built in the model-construction section.
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

pair_all = generate_sample(all_pairs_df)
batch_gen = get_batch(pair_all, BATCH_SIZE)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    total_loss = 0.0  # running loss over the last SKIP_STEP steps
    for step in range(NUM_TRAIN_STEPS):
        centers, targets = next(batch_gen)
        train_dict = {train_inputs: centers, train_labels: targets}
        _, loss_batch = sess.run([optimizer, loss], feed_dict=train_dict)
        total_loss += loss_batch
        if (step + 1) % SKIP_STEP == 0:
            print('Average loss at step {}: {:5.1f}'.format(
                step, total_loss / SKIP_STEP))
            total_loss = 0.0