赞
踩
$$\cos(\theta) = \frac{AB^2 + AC^2 - BC^2}{2 \cdot AB \cdot AC}$$
$$\cos(\theta) = \frac{\vec{a} \cdot \vec{b}}{\lVert \vec{a} \rVert \cdot \lVert \vec{b} \rVert}$$
import numpy as np


def preprocess(text):
    """
    Tokenize a sentence and assign an integer ID to every distinct word.

    The text is lower-cased and every '.' is split off as its own token.
    IDs are handed out in order of first appearance, starting at 0.

    :param text: sentence string
    :return: tuple (corpus, word_to_id, id_to_word) where corpus is a NumPy
        integer array of word IDs in sentence order, word_to_id is a dict
        mapping word -> ID, and id_to_word is a dict mapping ID -> word
    """
    # Insert a space before each period so it becomes a separate token.
    text = text.lower().replace('.', ' .')
    # split() without a separator collapses runs of whitespace, so repeated,
    # leading or trailing spaces never produce empty-string "words".
    words = text.split()

    word_to_id = {}
    id_to_word = {}
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word

    # An explicit integer dtype keeps an empty corpus from defaulting to float.
    corpus = np.array([word_to_id[w] for w in words], dtype=int)
    return corpus, word_to_id, id_to_word


def create_co_matrix(corpus, vocab_size, window_size=1):
    """
    Build a co-occurrence matrix from a corpus of word IDs.

    Entry [i, j] counts how many times word j appears within window_size
    positions (to the left or to the right) of an occurrence of word i.

    :param corpus: sequence of word IDs
    :param vocab_size: number of distinct words (matrix is vocab_size x vocab_size)
    :param window_size: how many neighbours on each side count as context
    :return: co-occurrence matrix as an int32 NumPy array
    """
    corpus_size = len(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    for idx, word_id in enumerate(corpus):
        for offset in range(1, window_size + 1):
            left_idx = idx - offset
            right_idx = idx + offset

            # Skip neighbours that fall outside the corpus boundaries.
            if left_idx >= 0:
                co_matrix[word_id, corpus[left_idx]] += 1
            if right_idx < corpus_size:
                co_matrix[word_id, corpus[right_idx]] += 1

    return co_matrix


def cos_similarity(x, y, eps=1e-8):
    """
    Compute the cosine similarity of two vectors.

    :param x: first vector
    :param y: second vector
    :param eps: small value added to each norm to avoid division by zero
        (default 1e-8)
    :return: cosine similarity of x and y
    """
    # Work in float64: squaring integer count vectors (e.g. int32 rows of the
    # co-occurrence matrix) in their own dtype can overflow for large counts.
    x = np.asarray(x, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)
    nx = x / (np.sqrt(np.sum(x ** 2)) + eps)
    ny = y / (np.sqrt(np.sum(y ** 2)) + eps)
    return np.dot(nx, ny)


# Demo: build the co-occurrence matrix for a sample sentence and compare the
# word vectors of "you" and "i".
text = 'I say hello and You say goodbye.'
corpus, word_to_id, id_to_word = preprocess(text)
print("corpus为:", corpus)
print("word_to_id为:", word_to_id)
print("id_to_word为:", id_to_word)

vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size, window_size=1)
print("共现矩阵为:", C)

c0 = C[word_to_id['you']]  # word vector of "you"
c1 = C[word_to_id['i']]  # word vector of "i"
print('you和i的相似度为', cos_similarity(c0, c1))
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。