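# -*- coding:utf-8 -*-
# word2vec
#
# Author: 雪柳花明, 2017-07-05
# (Scraped from a blog post; the page's "展开全文" / "expand full text" control appeared here.)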
import math
import os
import random
import zipfile
import numpy as np
import urllib
import tensorflow as tf
import collections

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt


# Base URL that hosts the text8 corpus archive. The scraped value 'http:///dc/'
# had lost its host name, which made every download attempt fail.
url='http://mattmahoney.net/dc/'

# Download the data file (only when it is not already on disk) and verify it.
def maybe_download(filename,expected_bytes):
    """Make sure ``filename`` exists locally and has the expected size.

    When the file is missing it is fetched from ``url + filename`` (``url`` is
    the module-level base URL) and stored under ``filename``.

    Args:
        filename: Local path of the file; also appended to ``url``.
        expected_bytes: Size in bytes the file must have.

    Returns:
        The path of the verified file.

    Raises:
        Exception: If the size on disk differs from ``expected_bytes``.
    """
    if not os.path.exists(filename):
        # urlretrieve lives in urllib.request on Python 3 and directly in
        # urllib on Python 2; resolve whichever is available.
        try:
            from urllib.request import urlretrieve
        except ImportError:
            from urllib import urlretrieve
        # urlretrieve copies the remote resource straight to a local file.
        filename, _ = urlretrieve(url + filename, filename)

    # Compare the on-disk size with the expected one.
    statinfo = os.stat(filename)
    if statinfo.st_size != expected_bytes:
        print(statinfo.st_size)
        raise Exception(
            'failed to verify ' + filename + '. can you get to it with a browser?'
        )
    print('found and verified', filename)
    return filename

# Fetch the text8 archive (the download is skipped when the file already
# exists) and check that it has the expected size in bytes.
filename = maybe_download(filename='text8.zip', expected_bytes=31344016)

# Read the contents of the zip file.
def read_data(filename):
    """Return the first member of the zip archive as a list of words.

    The member's bytes are converted to ``str`` with ``tf.compat.as_str`` and
    split on whitespace.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    text = tf.compat.as_str(raw_bytes)
    return text.split()


# The whole corpus as one flat list of word strings.
words = read_data(filename)
print(type(words))
print('Data size', len(words))
# (Original author's note: a bit more than ten million words.)


# Vocabulary size: 50,000 entries (the UNK slot plus the most frequent words).
vocabulary_size = 50000

def build_dataset(words, n_words=None):
    """Encode a word list as integer ids over a frequency-ranked vocabulary.

    The vocabulary consists of the placeholder ``'UNK'`` (id 0) followed by the
    ``n_words - 1`` most common words, numbered in order of decreasing
    frequency. Every word outside the vocabulary is encoded as 0.

    Args:
        words: List of word strings.
        n_words: Vocabulary size including the ``'UNK'`` slot. Defaults to the
            module-level ``vocabulary_size`` when omitted.

    Returns:
        A tuple ``(data, count, dictionary, reverse_dictionary)``:
        ``data`` is the list of ids, one per input word;
        ``count`` is ``[['UNK', n_unknown], (word, frequency), ...]``;
        ``dictionary`` maps word (str) -> id (int);
        ``reverse_dictionary`` maps id (int) -> word (str), so
        ``reverse_dictionary[data[i]]`` recovers the (possibly UNK) word.
    """
    if n_words is None:
        n_words = vocabulary_size

    # Slot 0 is reserved for unknown words; its count is filled in below.
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))

    # Number the words in frequency order: 'UNK' -> 0, most common word -> 1, ...
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)

    data = list()
    unk_count = 0
    lookup = dictionary.get  # single lookup per word instead of `in` + `[]`
    for word in words:
        index = lookup(word)
        if index is None:
            # Out-of-vocabulary word: encode as UNK and tally it.
            index = 0
            unk_count += 1
        data.append(index)

    # Record how many words were mapped to UNK.
    count[0][1] = unk_count

    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

# Turn the word list extracted from the archive into the integer-encoded
# dataset plus its vocabulary tables.
data, count, dictionary, reverse_dictionary = build_dataset(words)

# The raw word list is no longer needed; drop it to save memory.
del words

# The five leading entries of the vocabulary count table (UNK first).
print('most common words (+UNK)', count[0:5])
# The first ten encoded ids next to the words they decode to.
print('sample data', data[0:10], [reverse_dictionary[word_id] for word_id in data[0:10]])


# Read position inside `data`; generate_batch advances it between calls.
data_index = 0
# Generate one batch of skip-gram training data for word2vec.
def generate_batch(batch_size,num_skips,skip_window):
    """Build one batch of (centre word, context word) id pairs.

    Reads from the module-level ``data`` list starting at the module-level
    ``data_index`` and advances ``data_index`` (wrapping around the end of
    ``data``) so that the next call continues with the next centre word.

    Args:
        batch_size: Number of pairs in the batch; must be a multiple of
            ``num_skips``.
        num_skips: Number of pairs generated per centre word; must not exceed
            ``2 * skip_window``.
        skip_window: How many words to the left and to the right of the centre
            word count as its context.

    Returns:
        ``(batch, labels)``: ``batch`` is an int32 array of shape
        ``(batch_size,)`` holding centre-word ids and ``labels`` an int32 array
        of shape ``(batch_size, 1)`` holding the matching context-word ids.

    Raises:
        AssertionError: If the constraints on ``batch_size`` / ``num_skips``
            above are violated.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

    # Window layout: [ skip_window words | centre | skip_window words ]
    span = 2 * skip_window + 1

    # A deque with maxlen=span drops its oldest item on append, so appending
    # one word slides the window one position to the right.
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    # Window positions that may serve as context (everything but the centre).
    context_positions = [pos for pos in range(span) if pos != skip_window]

    for i in range(batch_size // num_skips):
        # Pick num_skips distinct context positions at random for this centre.
        chosen = random.sample(context_positions, num_skips)
        for j, target in enumerate(chosen):
            batch[i * num_skips + j] = buffer[skip_window]   # centre word (feature)
            labels[i * num_skips + j, 0] = buffer[target]    # context word (label)

        # Slide the window by one word to reach the next centre word.
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    # The window was pre-filled with `span` words that have not all been used
    # as centre words yet. Rewind so the next call resumes right after the
    # last centre word instead of skipping `span` words between batches.
    data_index = (data_index - span) % len(data)
    return batch, labels


# Quick smoke test of generate_batch: print eight (centre -> context) pairs
# as ids together with the words they decode to.
batch, labels = generate_batch(8, 2, 1)

for i in range(8):
    center_id, context_id = batch[i], labels[i, 0]
    print(center_id, reverse_dictionary[center_id], '->', context_id, reverse_dictionary[context_id])



# Training hyper-parameters.
# NOTE: batch_size counts words (training pairs) per batch, not sentences.
batch_size = 128
# Dimensionality of the dense word vectors.
embedding_size = 128
# How many words to consider to the left and to the right of the centre word.
skip_window = 1
# How many (centre, context) samples to draw per centre word.
num_skips = 2

# Validation set: a random draw of low word ids (i.e. very frequent words),
# used to inspect which words end up nearest to them in the embedding space.
valid_size = 16
# Validation ids are drawn from [0, valid_window), the 100 lowest ids.
valid_window = 100
valid_examples = np.random.choice(a=valid_window, size=valid_size, replace=False)
# Number of noise words sampled as negatives for the NCE loss.
num_sampled = 64


#
# Build the skip-gram model in its own graph: embedding lookup, NCE loss,
# SGD training op, and a cosine-similarity op for the validation words.
graph = tf.Graph()
with graph.as_default():

  # Placeholders for the training inputs and labels.
  # Only word ids are fed: one id per example, batch_size ids per step.
  train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
  # Labels are word ids as well, but shaped [batch_size, 1] instead of [batch_size].
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)# validation word ids
  # Wrap the randomly drawn valid_examples in a TensorFlow constant.

  # Ops and variables pinned to the CPU because of missing GPU implementation
  with tf.device('/cpu:0'):
    # Look up embeddings for inputs.
    # The embedding matrix starts out uniformly random in [-1, 1) with shape
    # [vocabulary_size, embedding_size].
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    # embed holds the embedding rows selected by the ids in train_inputs.
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Construct the variables for the NCE loss
    # Weights: one row per vocabulary word.
    nce_weights = tf.Variable(   # every word has a corresponding NCE weight row and bias
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    # Biases: one per vocabulary word, initialised to zero.
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

  # Compute the average NCE loss for the batch.
  # tf.nce_loss automatically draws a new sample of the negative labels each
  # time we evaluate the loss.
  # tf.reduce_mean averages the per-example NCE losses into one scalar.
  loss = tf.reduce_mean(
      tf.nn.nce_loss(weights=nce_weights, biases=nce_biases, inputs=embed, labels=train_labels,
                     num_sampled=num_sampled, num_classes=vocabulary_size))# num_sampled negatives per batch

  # Construct the SGD optimizer using a learning rate of 1.0.
  optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

  # Compute the cosine similarity between minibatch examples and all embeddings.
  # L2 norm of every embedding row (kept as a column so it broadcasts below).
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  # Dividing by the norm yields unit-length rows.
  normalized_embeddings = embeddings / norm
  # Normalised vectors of the validation words.
  valid_embeddings = tf.nn.embedding_lookup(
      normalized_embeddings, valid_dataset)
  # Dot products of unit vectors: similarity of each validation word to every
  # word in the vocabulary, shape [valid_size, vocabulary_size].
  similarity = tf.matmul(
      valid_embeddings, normalized_embeddings, transpose_b=True)

  # Add variable initializer (initialises all model parameters).
  init = tf.global_variables_initializer()

# Step 5: Begin training.
num_steps = 100001

# NOTE(review): `temp` is created outside `graph` (in the global default graph)
# and nothing in this file reads it -- presumably a leftover; confirm before
# removing.
temp=tf.Variable(tf.random_normal(shape=[6]))


with tf.Session(graph=graph) as session:
    # We must initialize all variables before we use them.
    init.run()
    print("Initialized")
    summary_writer = tf.summary.FileWriter('./Out', session.graph)
    saver = tf.train.Saver(max_to_keep=5)

    top_k = 8  # number of nearest neighbors
    average_loss = 0
    try:
        # `range` instead of `xrange`: the latter does not exist on Python 3.
        for step in range(num_steps):
            batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
            feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

            # One optimizer step (a single parameter update) plus the loss value.
            _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
            average_loss += loss_val

            # Every 2000 steps: report the mean loss and write a checkpoint.
            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                # The average loss is an estimate of the loss over the last 2000 batches.
                print("Average loss at step ", step, ": ", average_loss)
                average_loss = 0
                saver.save(session, "word2vec", global_step=step)

            # Note that this is expensive (~20% slowdown if computed every 500 steps)
            # Every 10000 steps: list the top_k words most similar to each
            # validation word.
            if step % 10000 == 0:
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_word = reverse_dictionary[valid_examples[i]]
                    # Rank by descending similarity; position 0 is skipped
                    # (the validation word is maximally similar to itself).
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    neighbours = "".join(
                        " %s," % reverse_dictionary[word_id] for word_id in nearest)
                    log_str = ("Nearest to %s:" % valid_word) + neighbours
                    print(log_str)
    finally:
        # Flush and release the event file; the writer was never closed before.
        summary_writer.close()
    final_embeddings = normalized_embeddings.eval()


    # Step 6: Visualize the embeddings.

    def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
        """Scatter-plot 2-D embeddings, annotate each point, save to ``filename``.

        Args:
            low_dim_embs: Array whose row ``i`` is the (x, y) position of
                ``labels[i]``; must have at least ``len(labels)`` rows.
            labels: The words to annotate the points with.
            filename: Path of the image file to write.
        """
        assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
        plt.figure(figsize=(18, 18))  # in inches
        for i, label in enumerate(labels):
            x, y = low_dim_embs[i, :]
            plt.scatter(x, y)
            # Show the word itself next to its point.
            plt.annotate(label,
                         xy=(x, y),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')
        # Save the figure to disk.
        plt.savefig(filename)


    try:
        from sklearn.manifold import TSNE
        import matplotlib.pyplot as plt

        # Reduce the embeddings to two dimensions with t-SNE.
        tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
        plot_only = 500
        # Project the vectors of word ids 0..plot_only-1 and plot them with
        # plot_with_labels.
        low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
        labels = [reverse_dictionary[i] for i in range(plot_only)]
        plot_with_labels(low_dim_embs, labels)

    except ImportError:
        print("Please install sklearn, matplotlib, and scipy to visualize embeddings.")