赞
踩
通常SVD矩阵分解指的是SVD(奇异值)分解技术,在这我们姑且将其命名为Traditional SVD。其公式如下图所示:
Traditional SVD分解的形式为3个矩阵相乘,中间矩阵为奇异值矩阵。如果想运用SVD分解的话,有一个前提是要求矩阵是稠密的,即矩阵里的元素要非空,否则就不能运用SVD分解。
很显然我们的数据其实绝大多数情况下都是稀疏的,因此如果要使用Traditional SVD,一般的做法是先用均值或者其他统计学方法来填充矩阵,然后再运用Traditional SVD分解降维,但这样做明显对数据的原始性造成一定影响。
LFM也就是前面提到的Funk SVD矩阵分解。
LFM(latent factor model)隐语义模型核心思想是通过隐含特征联系用户和物品,如下图所示:
P矩阵是User-LF矩阵,即用户和隐含特征矩阵。LF有三个,表示共总有三个隐含特征。
Q矩阵是LF-Item矩阵,即隐含特征和物品的矩阵。
R矩阵是User-Item矩阵,有P*Q得到。
LFM能处理稀疏评分矩阵。
利用矩阵分解技术,将原始User-Item的评分矩阵(稠密/稀疏)分解为P和Q矩阵,然后利用 P ∗ Q P*Q P∗Q还原出User-Item评分矩阵 R R R。整个过程相当于降维处理,其中:
矩阵值
P
11
P_{11}
P11表示用户1对隐含特征1的权重值。
矩阵值
Q
11
Q_{11}
Q11表示隐含特征1在物品1上的权重值。
矩阵值
R
11
R_{11}
R11就表示预测的用户1对物品1的评分,且
R
11
=
P
1
,
k
⃗
⋅
Q
k
,
1
⃗
R_{11}=\vec{P_{1,k}}\cdot \vec{Q_{k,1}}
R11=P1,k
⋅Qk,1
。
利用LFM预测用户对物品的评分,
k
k
k表示隐含特征数量:
import pandas as pd import numpy as np # 评分预测 1-5 class LFM(object): def __init__(self, alpha, reg_p, reg_q, number_LatentFactors=10, number_epochs=10, columns=["uid", "iid", "rating"]): self.alpha = alpha # 学习率 self.reg_p = reg_p # P矩阵正则 self.reg_q = reg_q # Q矩阵正则 self.number_LatentFactors = number_LatentFactors # 隐式类别数量 self.number_epochs = number_epochs # 最大迭代次数 self.columns = columns def fit(self, dataset): ''' fit dataset :param dataset: uid, iid, rating :return: ''' self.dataset = pd.DataFrame(dataset) self.users_ratings = dataset.groupby(self.columns[0]).agg([list])[[self.columns[1], self.columns[2]]] self.items_ratings = dataset.groupby(self.columns[1]).agg([list])[[self.columns[0], self.columns[2]]] self.globalMean = self.dataset[self.columns[2]].mean() self.P, self.Q = self.sgd() def _init_matrix(self): ''' 初始化P和Q矩阵,同时为设置0,1之间的随机值作为初始值 :return: ''' # User-LF P = dict(zip( self.users_ratings.index, np.random.rand(len(self.users_ratings), self.number_LatentFactors).astype(np.float32) )) # Item-LF Q = dict(zip( self.items_ratings.index, np.random.rand(len(self.items_ratings), self.number_LatentFactors).astype(np.float32) )) return P, Q def sgd(self): ''' 使用随机梯度下降,优化结果 :return: ''' P, Q = self._init_matrix() for i in range(self.number_epochs): print("iter%d"%i) error_list = [] for uid, iid, r_ui in self.dataset.itertuples(index=False): # User-LF P ## Item-LF Q v_pu = P[uid] #用户向量 v_qi = Q[iid] #物品向量 err = np.float32(r_ui - np.dot(v_pu, v_qi)) v_pu += self.alpha * (err * v_qi - self.reg_p * v_pu) v_qi += self.alpha * (err * v_pu - self.reg_q * v_qi) P[uid] = v_pu Q[iid] = v_qi # for k in range(self.number_of_LatentFactors): # v_pu[k] += self.alpha*(err*v_qi[k] - self.reg_p*v_pu[k]) # v_qi[k] += self.alpha*(err*v_pu[k] - self.reg_q*v_qi[k]) error_list.append(err ** 2) print(np.sqrt(np.mean(error_list))) return P, Q def predict(self, uid, iid): # 如果uid或iid不在,我们使用全剧平均分作为预测结果返回 if uid not in self.users_ratings.index or iid not in self.items_ratings.index: return self.globalMean p_u = self.P[uid] q_i = self.Q[iid] return np.dot(p_u, q_i) def test(self,testset): '''预测测试集数据''' for uid, iid, real_rating in testset.itertuples(index=False): try: pred_rating = self.predict(uid, iid) except Exception as e: print(e) else: yield uid, iid, real_rating, pred_rating if __name__ == '__main__': dtype = [("userId", np.int32), ("movieId", np.int32), ("rating", np.float32)] dataset = pd.read_csv("ml-latest-small/ratings.csv", usecols=range(3), dtype=dict(dtype)) lfm = LFM(0.02, 0.01, 0.01, 10, 100, ["userId", "movieId", "rating"]) lfm.fit(dataset) while True: uid = input("uid: ") iid = input("iid: ") print(lfm.predict(int(uid), int(iid)))
import math import random import pandas as pd import numpy as np class BiasSvd(object): def __init__(self, alpha, reg_p, reg_q, reg_bu, reg_bi, number_LatentFactors=10, number_epochs=10, columns=["uid", "iid", "rating"]): self.alpha = alpha # 学习率 self.reg_p = reg_p self.reg_q = reg_q self.reg_bu = reg_bu self.reg_bi = reg_bi self.number_LatentFactors = number_LatentFactors # 隐式类别数量 self.number_epochs = number_epochs self.columns = columns def fit(self, dataset): ''' fit dataset :param dataset: uid, iid, rating :return: ''' self.dataset = pd.DataFrame(dataset) self.users_ratings = dataset.groupby(self.columns[0]).agg([list])[[self.columns[1], self.columns[2]]] self.items_ratings = dataset.groupby(self.columns[1]).agg([list])[[self.columns[0], self.columns[2]]] self.globalMean = self.dataset[self.columns[2]].mean() self.P, self.Q, self.bu, self.bi = self.sgd() def _init_matrix(self): ''' 初始化P和Q矩阵,同时为设置0,1之间的随机值作为初始值 :return: ''' # User-LF P = dict(zip( self.users_ratings.index, np.random.rand(len(self.users_ratings), self.number_LatentFactors).astype(np.float32) )) # Item-LF Q = dict(zip( self.items_ratings.index, np.random.rand(len(self.items_ratings), self.number_LatentFactors).astype(np.float32) )) return P, Q def sgd(self): ''' 使用随机梯度下降,优化结果 :return: ''' P, Q = self._init_matrix() # 初始化bu、bi的值,全部设为0 bu = dict(zip(self.users_ratings.index, np.zeros(len(self.users_ratings)))) bi = dict(zip(self.items_ratings.index, np.zeros(len(self.items_ratings)))) for i in range(self.number_epochs): print("iter%d"%i) error_list = [] for uid, iid, r_ui in self.dataset.itertuples(index=False): v_pu = P[uid] v_qi = Q[iid] err = np.float32(r_ui - self.globalMean - bu[uid] - bi[iid] - np.dot(v_pu, v_qi)) v_pu += self.alpha * (err * v_qi - self.reg_p * v_pu) v_qi += self.alpha * (err * v_pu - self.reg_q * v_qi) P[uid] = v_pu Q[iid] = v_qi bu[uid] += self.alpha * (err - self.reg_bu * bu[uid]) bi[iid] += self.alpha * (err - self.reg_bi * bi[iid]) error_list.append(err ** 2) print(np.sqrt(np.mean(error_list))) return P, Q, bu, bi def predict(self, uid, iid): if uid not in self.users_ratings.index or iid not in self.items_ratings.index: return self.globalMean p_u = self.P[uid] q_i = self.Q[iid] return self.globalMean + self.bu[uid] + self.bi[iid] + np.dot(p_u, q_i) if __name__ == '__main__': dtype = [("userId", np.int32), ("movieId", np.int32), ("rating", np.float32)] dataset = pd.read_csv("ml-latest-small/ratings.csv", usecols=range(3), dtype=dict(dtype)) bsvd = BiasSvd(0.02, 0.01, 0.01, 0.01, 0.01, 10, 20) bsvd.fit(dataset) while True: uid = input("uid: ") iid = input("iid: ") print(bsvd.predict(int(uid), int(iid)))
similar_users = self.similar[uid].drop([uid]).dropna().sort_values(ascending=False)[:self.k]
similar_items = self.similar[iid].drop([iid]).dropna().sort_values(ascending=False)[:self.k]
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。