import collections

maxlen = 1000
word_freqs = collections.Counter()  # word_freqs = {}
# print(word_freqs)
with open('../data/NLP_data/news.txt', 'r+', encoding='utf8') as f:
    for line in f:
        words = line.lower().split(' ')
        if len(words) > maxlen:
            maxlen = len(words)
        for word in words:
            if not (word in stop_words):
                word_freqs[word] += 1  # word frequency count
                # count = word_freqs.get(word, 0)
                # print(count)
                # word_freqs[word] = count + 1
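The tfidf matrix used in the next snippet is not built in this excerpt. A minimal sketch, assuming it comes from scikit-learn's TfidfVectorizer fit on a small list of sentences (the docs list here is only illustrative):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    'what is the weather like today',
    'what is for dinner tonight',
    'this is a question worth pondering',
    'it is a beautiful day today',
]
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(docs)  # sparse matrix: one row per document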
from sklearn.metrics.pairwise import cosine_similarity

# Compare the similarity of the last sentence against all the other sentences
print(cosine_similarity(tfidf[-1], tfidf[:-1], dense_output=False))
Note that sklearn computes TF-IDF with a slightly different formula:
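With scikit-learn's default settings (smooth_idf=True, norm='l2'), TfidfVectorizer uses a smoothed IDF and then L2-normalizes each document vector:

$$\mathrm{idf}(t) = \ln\frac{1+n}{1+\mathrm{df}(t)} + 1, \qquad \text{tf-idf}(t,d) = \mathrm{tf}(t,d)\cdot\mathrm{idf}(t)$$

where n is the total number of documents and df(t) is the number of documents containing term t; each document's tf-idf vector is then divided by its Euclidean (L2) norm.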
Complete code for a manual TF-IDF implementation:
Note: 1 is added to both the numerator and the denominator for smoothing, and normalization is added by computing the square root of the sum of squared TF-IDF values.
# coding=utf-8
import math
import numpy
corpus = [
    'what is the weather like today',
    'what is for dinner tonight',
    'this is a question worth pondering',
    'it is a beautiful day today'
]

words = []
# Tokenize the corpus
for i in corpus:
    words.append(i.split())
# Word frequency count per sentence
def Counter(word_list):
    wordcount = []
    for i in word_list:
        count = {}
        for j in i:
            if not count.get(j):
                count.update({j: 1})
            elif count.get(j):
                count[j] += 1
        wordcount.append(count)
    return wordcount
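The tf, idf and tfidf helpers called by the loop below are not included in this excerpt. The following reconstruction is a sketch consistent with the sample output shown further down: the IDF here is log(N / (1 + number of sentences containing the word)), which is why a word that appears in every sentence, such as "is", gets a negative score.

# Term frequency of a word within one sentence's word-count dict
def tf(word, word_dict):
    return word_dict.get(word) / sum(word_dict.values())

# Number of sentences that contain the word
def count_sentence(word, wordcount):
    return sum(1 for i in wordcount if i.get(word))

# Inverse document frequency, with +1 smoothing in the denominator
def idf(word, wordcount):
    return math.log(len(wordcount) / (count_sentence(word, wordcount) + 1))

def tfidf(word, word_dict, wordcount):
    return tf(word, word_dict) * idf(word, wordcount)

wordcount = Counter(words)
p = 1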
for i in wordcount:
    tf_idfs = 0
    print('part:{}'.format(p))
    p = p + 1
    for j, k in i.items():
        print('word: {} ---- TF-IDF:{}'.format(j, tfidf(j, i, wordcount)))
        # Normalization: accumulate the squared TF-IDF values
        tf_idfs += (tfidf(j, i, wordcount) ** 2)
    tf_idfs = tf_idfs ** 0.5
    print(tf_idfs)
    for j, k in i.items():
        print('normalized: word: {} ---- TF-IDF:{}'.format(j, tfidf(j, i, wordcount) / tf_idfs))
    # break
'''
part:1
word: what ---- TF-IDF:0.04794701207529681
word: is ---- TF-IDF:-0.03719059188570162
word: the ---- TF-IDF:0.11552453009332421
word: weather ---- TF-IDF:0.11552453009332421
word: like ---- TF-IDF:0.11552453009332421
word: today ---- TF-IDF:0.04794701207529681
part:2
word: what ---- TF-IDF:0.05753641449035617
word: is ---- TF-IDF:-0.044628710262841945
word: for ---- TF-IDF:0.13862943611198905
word: dinner ---- TF-IDF:0.13862943611198905
word: tonight ---- TF-IDF:0.13862943611198905
part:3
word: this ---- TF-IDF:0.11552453009332421
word: is ---- TF-IDF:-0.03719059188570162
word: a ---- TF-IDF:0.04794701207529681
word: question ---- TF-IDF:0.11552453009332421
word: worth ---- TF-IDF:0.11552453009332421
word: pondering ---- TF-IDF:0.11552453009332421
part:4
word: it ---- TF-IDF:0.11552453009332421
word: is ---- TF-IDF:-0.03719059188570162
word: a ---- TF-IDF:0.04794701207529681
word: beautiful ---- TF-IDF:0.11552453009332421
word: day ---- TF-IDF:0.11552453009332421
word: today ---- TF-IDF:0.04794701207529681
'''
import gzip

with gzip.open(data_file, 'rb') as f:
    for i, line in enumerate(f):
        print(line)
        break
# Read the OpinRank corpus and preprocess it
def read_input(input_file):
    with gzip.open(input_file, 'rb') as f:
        for i, line in enumerate(f):
            # Preprocessing
            yield gensim.utils.simple_preprocess(line)
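This excerpt does not show how read_input feeds the model. A minimal training sketch, assuming gensim 3.8.3 (where the dimension argument is named size; gensim 4.x renamed it to vector_size) and a hypothetical data_file path:

import gzip
import gensim

data_file = 'reviews_data.txt.gz'  # hypothetical path to the OpinRank archive

documents = list(read_input(data_file))
model = gensim.models.Word2Vec(documents, size=150, window=10, min_count=2, workers=4)
print(len(model.wv.vocab))  # vocabulary size (gensim 3.x attribute)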
# Keyword Extraction
# https://radimrehurek.com/gensim_3.8.3/summarization/keywords.html
# from gensim.summarization import keywords
# # Test corpus
# text = '''Challenges in natural language processing frequently involve
# speech recognition, natural language understanding, natural language
# generation (frequently from formal, machine-readable logical forms),
# connecting language and machine perception, dialog systems, or some
# combination thereof.'''
# Keyword extraction
# print(''.join(keywords(text)))
(6) Document-to-vector model: Doc2vec
The Doc2vec model was inspired by Word2Vec. When Word2Vec predicts word vectors, the predicted words carry semantic meaning; Doc2vec builds an analogous structure, which lets it overcome the bag-of-words model's lack of semantics. Suppose we have a training set in which each sentence is one training sample. Like Word2Vec, Doc2vec has two training modes: the Distributed Memory Model of Paragraph Vectors (PV-DM), which is analogous to Word2Vec's CBOW model, and the Distributed Bag of Words version of Paragraph Vector (PV-DBOW), which is analogous to Word2Vec's Skip-gram model. The sketch below shows how the two modes are selected in gensim.
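As a brief illustration (a minimal sketch on a toy corpus, not part of the original example), gensim's Doc2Vec chooses between the two modes with the dm flag: dm=1 trains PV-DM and dm=0 trains PV-DBOW.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Toy corpus: each document is tagged with an integer id
docs = [
    TaggedDocument(['what', 'is', 'the', 'weather', 'like', 'today'], [0]),
    TaggedDocument(['it', 'is', 'a', 'beautiful', 'day', 'today'], [1]),
]

pv_dm = Doc2Vec(docs, dm=1, vector_size=20, min_count=1, epochs=40)    # PV-DM (CBOW-like)
pv_dbow = Doc2Vec(docs, dm=0, vector_size=20, min_count=1, epochs=40)  # PV-DBOW (Skip-gram-like)
print(pv_dm.infer_vector(['weather', 'today']))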
# coding=utf-8
import numpy as np
import nltk
import gensim
from gensim.models import word2vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity
f = open('../data/FAQ/starbucks_faq.txt', 'r', encoding='utf8')
corpus = f.readlines()
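The tokenization and training steps that produce stopword_list, document_tokens and model_d2v (used below) are not included in this excerpt; a minimal reconstruction under those assumed names might look like this:

from nltk.corpus import stopwords

stopword_list = set(stopwords.words('english'))  # assumes the NLTK stopwords data is downloaded

# Lowercase, tokenize, and drop stopwords
def tokenize(text, stopword_list):
    tokens = gensim.utils.simple_preprocess(text)
    return [t for t in tokens if t not in stopword_list]

document_tokens = [tokenize(line, stopword_list) for line in corpus]
tagged_docs = [TaggedDocument(words, [i]) for i, words in enumerate(document_tokens)]

# Train a PV-DM Doc2vec model over the FAQ sentences
model_d2v = Doc2Vec(tagged_docs, dm=1, vector_size=50, min_count=1, epochs=40)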
# Test: infer a vector for every document
questions = []
for i in range(len(document_tokens)):
    questions.append(model_d2v.infer_vector(document_tokens[i]))
questions = np.array(questions)
# print(questions.shape)
# Test sentence
# text = 'find allergen information'
# text = 'mobile pay'
text = 'verification code'
filtered_tokens = tokenize(text, stopword_list)
# print(filtered_tokens)
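The excerpt ends here; presumably the inferred query vector is then compared against the questions matrix with cosine_similarity. A sketch continuing the variables above (not the original code, and the top-3 cutoff is only illustrative):

# Infer a vector for the query and rank the FAQ entries by cosine similarity
query_vec = model_d2v.infer_vector(filtered_tokens).reshape(1, -1)
sims = cosine_similarity(query_vec, questions)[0]

# Show the three most similar FAQ lines
for idx in sims.argsort()[::-1][:3]:
    print(round(float(sims[idx]), 4), corpus[idx].strip())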