Data: the Quora dataset provided by Kaggle (FAQ Kaggle dataset! | Data Science and Machine Learning). Its columns are Index(['Questions', 'Followers', 'Answered', 'Link'], dtype='object').
We treat the Link as the answer and build question-answer pairs from it. The basic pipeline is as follows:
When there are many questions, it is not practical to hold all the embeddings directly in memory; a dedicated store such as Redis is a better fit.
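For contrast, the small-corpus baseline is to keep the embeddings in a DataFrame column and brute-force a cosine-similarity scan on every query; a minimal sketch (the naive_search helper is illustrative and not part of the original code):

import openai
import pandas as pd
from openai.embeddings_utils import get_embedding, cosine_similarity

openai.api_key = "..."

df = pd.read_csv("/home/andy/torch_rechub_n/hug_llm/content/dataset/Kaggle related questions on Qoura - Questions.csv")

# Embed every question once; each query then scans all rows,
# so cost and latency grow linearly with the corpus size.
df["embedding"] = df["Questions"].apply(get_embedding)

def naive_search(query: str, n: int = 3) -> pd.DataFrame:
    q_emb = get_embedding(query)
    sims = df["embedding"].apply(lambda e: cosine_similarity(e, q_emb))
    return df.assign(similarity=sims).nlargest(n, "similarity")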
(1) Run Redis Stack directly with Docker:
docker run -p 6379:6379 -it redis/redis-stack:latest
When executed, Docker automatically pulls the image from Docker Hub; the server listens on port 6379 by default.
(2) Install the Redis Python client with pip install redis; you can then interact with Redis from Python.
(3) Create an index (define a schema that tells Redis your fields and their attributes), generate the embeddings, and store them in Redis:
import openai
import redis
import pandas as pd
import numpy as np
from openai.embeddings_utils import get_embedding
from redis.commands.search.query import Query
from redis.commands.search.field import TextField, VectorField

OPENAI_API_KEY = '...'
openai.api_key = OPENAI_API_KEY
MODEL = "gpt-3.5-turbo"  # not used in this snippet

# redis test: the docker image above listens on port 6379 by default
r = redis.Redis()
r.set("key", "value")
r.get("key")

# 1. Index parameters
VECTOR_DIM = 12288  # must match the embedding model's output dimension
                    # (12288 matches the davinci similarity embeddings that older
                    # versions of get_embedding default to; ada-002 would be 1536)
INDEX_NAME = "faq"

# 2. Define the fields to index; use a different Field type per attribute
question = TextField(name="question")
answer = TextField(name="answer")
embedding = VectorField(
    name="embedding",
    algorithm="HNSW",
    attributes={
        "TYPE": "FLOAT32",
        "DIM": VECTOR_DIM,
        "DISTANCE_METRIC": "COSINE"
    }
)
schema = (question, embedding, answer)
index = r.ft(INDEX_NAME)
try:
    info = index.info()
except redis.exceptions.ResponseError:
    # the index does not exist yet, so create it
    index.create_index(schema)

# 3. To delete the existing documents, use the command below
#    (kept commented out so it does not wipe the index we just created)
# index.dropindex(delete_documents=True)

# 4. Store the data in Redis
df = pd.read_csv("/home/andy/torch_rechub_n/hug_llm/content/dataset/Kaggle related questions on Qoura - Questions.csv")
# df.shape
for v in df.head().itertuples():
    emb = get_embedding(v.Questions)
    # Note: Redis stores bytes or strings, so serialize the vector first
    emb = np.array(emb, dtype=np.float32).tobytes()
    im = {
        "question": v.Questions,
        "embedding": emb,
        "answer": v.Link
    }
    # This is the key line: one hash per document
    r.hset(name=f"{INDEX_NAME}-{v.Index}", mapping=im)

# 5. Build the query input
query = "kaggle alive?"
embed_query = get_embedding(query)
params_dict = {"query_embedding": np.array(embed_query).astype(dtype=np.float32).tobytes()}

k = 3
base_query = f"* => [KNN {k} @embedding $query_embedding AS similarity]"
return_fields = ["question", "answer", "similarity"]
query = (
    Query(base_query)
    .return_fields(*return_fields)
    .sort_by("similarity")
    .paging(0, k)
    .dialect(2)
)

# 6. Search
res = index.search(query, params_dict)
for i, doc in enumerate(res.docs):
    # COSINE returns a distance, so convert back to a similarity score
    score = 1 - float(doc.similarity)
    print(f"{doc.id}, {doc.question}, {doc.answer} (Score: {round(score, 3)})")
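As a sanity check on the writes, you can read one hash back and decode the serialized vector; a small sketch (this assumes the default integer index, so the first key is faq-0):

# Read one stored record back and decode the embedding bytes
raw = r.hget(f"{INDEX_NAME}-0", "embedding")
vec = np.frombuffer(raw, dtype=np.float32)
print(vec.shape)  # should equal (VECTOR_DIM,)
print(r.hget(f"{INDEX_NAME}-0", "question"))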
Omitted for now.
QA matches the user's question against an existing knowledge base, while recommendation matches against the user's browsing history. Recommendation is clearly more complex than QA in several respects. The overall design is as follows:
Data: the AG News dataset (AG News Classification Dataset | Kaggle). Each record has the columns 'Class Index', 'Title', and 'Description' (an 'embedding' column is added later in the code); the class indices are 1-World, 2-Sports, 3-Business, and 4-Sci/Tech.
Task: a simple recall stage for news recommendation.
@dataclass is a decorator introduced in Python 3.7 that simplifies the definition of data classes: it automatically adds methods such as __init__, __repr__, and __eq__ to the class. The generated methods and attributes may not cover every need, in which case you write them by hand.
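A minimal illustration of what the decorator generates (the Point class is purely for demonstration):

from dataclasses import dataclass

@dataclass
class Point:
    x: int
    y: int

p = Point(1, 2)          # auto-generated __init__
print(p)                 # Point(x=1, y=2), from the auto-generated __repr__
print(p == Point(1, 2))  # True, from the auto-generated __eq__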
from dataclasses import dataclass
from typing import List
import random

import openai
import numpy as np
import pandas as pd
from openai.embeddings_utils import get_embedding
from sklearn.metrics.pairwise import cosine_similarity

# 1. Inspect the data
df = pd.read_csv("/home/andy/torch_rechub_n/hug_llm/content/dataset/AG_News.csv")
df.shape
# 1-World, 2-Sports, 3-Business, 4-Sci/Tech
df["Class Index"].value_counts()  # 30k rows per class
sdf = df.sample(100)  # sample only 100 rows

# 2. Model user preferences and behavior logs
@dataclass
class User:
    user_name: str

@dataclass
class UserPrefer:
    user_name: str
    prefers: List[int]

@dataclass
class Item:
    item_id: str
    item_props: dict

@dataclass
class Action:
    action_type: str
    action_props: dict

@dataclass
class UserAction:
    user: User
    item: Item
    action: Action
    action_time: str

# 3. One user's history
u1 = User("u1")
up1 = UserPrefer("u1", [1, 2])
# sdf.iloc[1] happens to be sports (class 2)
i1 = Item("i1", {
    "id": 1,
    "category": "sport",
    "title": "Swimming: Shibata Joins Japanese Gold Rush",
    "description": (
        "ATHENS (Reuters) - Ai Shibata wore down French teen-ager Laure Manaudou "
        "to win the women's 800 meters freestyle gold medal at the Athens Olympics "
        "Friday and provide Japan with their first female swimming champion in 12 years."
    ),
    "content": "content"
})
a1 = Action("browse", {
    "open_time": "2023-04-01 12:00:00",
    "leave_time": "2023-04-01 14:00:00",
    "type": "close",
    "duration": "2hour"
})
ua1 = UserAction(u1, i1, a1, "2023-04-01 12:00:00")

# 4. Compute the embedding of every text
OPENAI_API_KEY = "..."
openai.api_key = OPENAI_API_KEY
sdf["embedding"] = sdf.apply(
    lambda x: get_embedding(x.Title + x.Description, engine="text-embedding-ada-002"),
    axis=1
)

# 5. Recall
class Recall:

    def __init__(self, df: pd.DataFrame):
        self.data = df

    def user_prefer_recall(self, user, n):
        up = self.get_user_prefers(user)
        # pick one of the user's preferred categories at random
        cate = up.prefers[random.randrange(0, len(up.prefers))]
        return self.pick_by_idx(cate, n)

    def hot_recall(self, n):
        # random sample as a stand-in for a real popularity signal
        df = self.data.sample(n)
        return df

    def user_action_recall(self, user, n):
        actions = self.get_user_actions(user)
        interest = self.get_most_interested_item(actions)
        recoms = self.recommend_by_interest(interest, n)
        return recoms

    def get_most_interested_item(self, user_action):
        """
        Could pick the Item the user interacted with most recently or most often
        (by duration, frequency, comments, and similar attributes).
        """
        # We assume row 2 of sdf (idx 1), a swimming-related Item, is the favorite
        idx = user_action.item.item_props["id"]
        im = self.data.iloc[idx]
        return im

    def recommend_by_interest(self, interest, n):
        cate_id = interest["Class Index"]
        q_emb = interest["embedding"]
        # restrict to the same category
        base = self.data[self.data["Class Index"] == cate_id]
        # the QA code above could be reused here: compare the given embedding
        # against the embeddings in base
        base_arr = np.array(
            [v.embedding for v in base.itertuples()]
        )
        q_arr = np.expand_dims(q_emb, 0)
        sims = cosine_similarity(base_arr, q_arr)
        # exclude the item itself (the most similar entry)
        idxes = sims.argsort(0).squeeze()[-(n + 1):-1]
        return base.iloc[idxes.tolist()[::-1]]

    def pick_by_idx(self, category, n):
        df = self.data[self.data["Class Index"] == category]
        return df.sample(n)

    def get_user_actions(self, user):
        dct = {"u1": ua1}
        return dct[user.user_name]

    def get_user_prefers(self, user):
        dct = {"u1": up1}
        return dct[user.user_name]

    def run(self, user):
        ur = self.user_action_recall(user, 5)
        if len(ur) == 0:
            ur = self.user_prefer_recall(user, 5)
        hr = self.hot_recall(3)
        # concatenate user recall + hot recall
        return pd.concat([ur, hr], axis=0)

r = Recall(sdf)
rd = r.run(u1)
# 8 rows in total: 5 from user-action recall, 3 from hot recall
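Because the random hot recall can overlap with the user-action recall, deduplicating the concatenated result may be worthwhile; a hypothetical post-processing step:

# Drop any item that both channels recalled, then inspect the result
rd = rd.drop_duplicates(subset=["Title"])
print(rd[["Class Index", "Title"]])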
The output combines the user-action recall and hot recall results (five plus three items).