赞
踩
梳理一下tf-idf的全过程,然后用于实际的需求中。
从数据的更新计算上,将TF-IDF分为两部分:一部分用于计算IDF的增量部分,属于全局的学习;另一部分则用于批量处理新的数据集,相当于是在predict。
可以把IDF视为一个预训练模型,纳入建模的数据管理系统中统一管理和维护。
流程上:
由于IDF信息的累积是要遍历多个文档之后形成的,所以这部分是独立的。
from Basefuncs import *
import tqdm
# Fetch the data of the multiple batches that will be used for training.
folder_path = '/DataPath/converted_data2/'
# Sort the names so the batches are always processed in the same order.
pkl_file_list = sorted(list_file_names_without_extension(folder_path))
pkl_dict = dict()
for some_pkl_file in tqdm.tqdm(pkl_file_list):
    # One entry per batch file, keyed by the file name without extension.
    pkl_dict[some_pkl_file] = from_pickle(some_pkl_file, folder_path)
准备好训练用的函数,主要用于清洗、分析和idf的统计
import jieba as jb
import re
def del_pat6_pro(x, swd, ewd):
    """Delete every span of *x* that starts with *swd* and ends with *ewd*.

    Args:
        x: Text to clean.
        swd: Start marker. It is inserted into a regular expression as-is,
            so regex metacharacters in it keep their special meaning.
        ewd: End marker, inserted into the regular expression as-is.

    Returns:
        *x* with each ``swd ... ewd`` span (markers included) removed.
    """
    # Non-greedy on purpose: a greedy ``.*`` would run from the first start
    # marker to the LAST end marker on the line and swallow the ordinary text
    # sitting between two separate spans.
    wd_str = swd + '.*?' + ewd
    return re.sub(wd_str, '', x)
# Regex matching one or more consecutive ASCII / full-width punctuation marks;
# clean_alpha_punc and clean_a_bit replace every such run with a single space.
puncs = r'[_,.。,!??;::|、-]+'
def clean_alpha_punc(x):
    """Clean a text and split it into a list of words.

    The ``<http ... >`` spans are removed, digit runs and punctuation runs
    are each replaced by a space, and the remainder is segmented with
    ``jieba.lcut``.

    Args:
        x: Raw text.

    Returns:
        The segmented tokens that are longer than one character.
    """
    text = del_pat6_pro(x, '<http', '>')
    text = re.sub(r'\d+', ' ', text)   # digits -> space
    text = re.sub(puncs, ' ', text)    # listed punctuation -> space
    kept = []
    for token in jb.lcut(text):
        # Drop the separator spaces and every single-character token.
        if token != ' ' and len(token) > 1:
            kept.append(token)
    return kept
def clean_a_bit(x):
    """Lightly clean a text without segmenting it.

    Removes the ``<http ... >`` spans and replaces every run of the
    punctuation listed in ``puncs`` with a single space.

    Args:
        x: Raw text.

    Returns:
        The cleaned text as one string.
    """
    without_links = del_pat6_pro(x, '<http', '>')
    return re.sub(puncs, ' ', without_links)
import pandas as pd
import numpy as np
from collections import Counter
from itertools import chain
def get_idf_dict(corpus=None):
    """Count, for every word, the number of documents that contain it.

    Despite the name, the values are raw document frequencies (how many
    documents of *corpus* contain the word at least once), not log-scaled
    IDF weights.

    Args:
        corpus: Iterable of documents, each document being an iterable of
            words. ``None`` is treated as an empty corpus.

    Returns:
        A ``collections.Counter`` mapping word -> document count.
    """
    if corpus is None:
        return Counter()
    # dict.fromkeys() de-duplicates the words of one document while keeping
    # their first-seen order, so each document adds at most 1 per word.
    return Counter(chain.from_iterable(dict.fromkeys(doc) for doc in corpus))
import pandas as pd
def increadd_dict(master_dict=None, slave_dict=None):
    """Add the counts of *slave_dict* into *master_dict*, in place.

    Keys present in both dicts get the sum of the two values; keys that are
    only in *slave_dict* are copied over; keys that are only in
    *master_dict* are left untouched.

    Args:
        master_dict: Accumulated counts. It is mutated and returned.
            ``None`` starts a fresh dict.
        slave_dict: Counts of the new batch. ``None`` means nothing to add.

    Returns:
        A ``(master_dict, mod_keys)`` tuple, where ``mod_keys`` lists the
        keys of *slave_dict*, i.e. the entries that were added or changed.
    """
    if master_dict is None:
        master_dict = {}
    if slave_dict is None:
        slave_dict = {}
    # Only the keys of the new batch are visited, so the cost does not grow
    # with the size of the accumulated model, and the counts keep their
    # original numeric type instead of being converted by pandas.
    for key, value in slave_dict.items():
        master_dict[key] = master_dict.get(key, 0) + value
    mod_keys = list(slave_dict)
    return master_dict, mod_keys
def idf_train(idf_model=None, data_list=None, pid_list=None):
    """Incrementally update the document-frequency model with one batch.

    Only documents whose id is not yet in ``idf_model['pid_set']`` are
    counted, so feeding the same batch twice does not change the model.

    Args:
        idf_model: Dict with the keys ``'pid_set'`` (ids of the documents
            already counted) and ``'idf_dict'`` (word -> document count).
            It is updated in place and returned.
        data_list: Word lists, one per document.
        pid_list: Document ids, aligned by position with *data_list*.

    Returns:
        The updated *idf_model*.
    """
    pid_set = idf_model['pid_set']
    idf_dict = idf_model['idf_dict']
    gap_set = set(pid_list) - set(pid_set)
    if len(gap_set):
        print('updating %s recs ' % len(gap_set))
    else:
        print('No UPDATING')

    # Keep one document per new id. An id repeated inside the batch used to
    # be counted several times in the word statistics while entering pid_set
    # only once, which inflated the document frequencies.
    pending = set(gap_set)
    filter_data_list = []
    for i, v in enumerate(pid_list):
        if v in pending:
            pending.discard(v)
            filter_data_list.append(data_list[i])

    new_idf_dict = get_idf_dict(filter_data_list)
    _idf_dict, _ = increadd_dict(idf_dict, new_idf_dict)
    idf_model['idf_dict'] = _idf_dict
    idf_model['pid_set'] = set(pid_set) | gap_set
    return idf_model
开始批次训练。idf_model 有两部分:`pid_set` 和 `idf_dict`,这里假设是一个全新的模型训练。
# Start from an empty model: no document ids seen, no word counts.
idf_model = {'pid_set': set(), 'idf_dict': {}}
for some_pkl_file in tqdm.tqdm(pkl_file_list):
    tem_df = pkl_dict[some_pkl_file]
    # Upper-cased content MD5 is used as the document id.
    tem_df['MD5'] = tem_df['content_md5'].apply(lambda md5: md5.upper())
    tem_df['wd_list'] = tem_df['clean_content'].apply(
        lambda content: clean_alpha_punc(str(content))
    )
    idf_model = idf_train(
        idf_model=idf_model,
        data_list=list(tem_df['wd_list']),
        pid_list=list(tem_df['MD5']),
    )
对40个批次,约40万篇新闻进行训练
0%| | 0/40 [00:00<?, ?it/s]
updating 9946 recs
2%|▎ | 1/40 [00:48<31:15, 48.10s/it]
updating 9822 recs
5%|▌ | 2/40 [01:30<28:26, 44.91s/it]
updating 9915 recs
8%|▊ | 3/40 [02:03<24:23, 39.54s/it]
updating 9853 recs
10%|█ | 4/40 [02:45<24:18, 40.50s/it]
updating 9781 recs
12%|█▎ | 5/40 [03:31<24:41, 42.33s/it]
updating 9839 recs
15%|█▌ | 6/40 [04:17<24:44, 43.65s/it]
updating 9872 recs
18%|█▊ | 7/40 [05:01<23:57, 43.55s/it]
updating 9753 recs
20%|██ | 8/40 [05:45<23:18, 43.71s/it]
updating 9789 recs
22%|██▎ | 9/40 [06:25<21:59, 42.57s/it]
updating 9948 recs
25%|██▌ | 10/40 [07:08<21:22, 42.76s/it]
updating 9954 recs
28%|██▊ | 11/40 [07:55<21:18, 44.09s/it]
updating 9963 recs
30%|███ | 12/40 [08:36<20:10, 43.22s/it]
updating 9959 recs
32%|███▎ | 13/40 [09:16<18:57, 42.13s/it]
updating 9880 recs
35%|███▌ | 14/40 [09:57<18:07, 41.83s/it]
updating 9965 recs
38%|███▊ | 15/40 [10:38<17:22, 41.70s/it]
updating 9938 recs
40%|████ | 16/40 [11:27<17:28, 43.70s/it]
updating 9918 recs
42%|████▎ | 17/40 [12:14<17:07, 44.68s/it]
updating 9949 recs
45%|████▌ | 18/40 [13:04<17:00, 46.37s/it]
updating 9960 recs
48%|████▊ | 19/40 [13:45<15:41, 44.84s/it]
updating 9967 recs
50%|█████ | 20/40 [14:27<14:38, 43.90s/it]
updating 9960 recs
52%|█████▎ | 21/40 [15:09<13:45, 43.43s/it]
updating 9964 recs
55%|█████▌ | 22/40 [15:52<12:59, 43.32s/it]
updating 9950 recs
57%|█████▊ | 23/40 [16:49<13:22, 47.22s/it]
updating 9945 recs
60%|██████ | 24/40 [17:31<12:14, 45.88s/it]
updating 9783 recs
62%|██████▎ | 25/40 [18:22<11:47, 47.20s/it]
updating 4158 recs
65%|██████▌ | 26/40 [18:42<09:06, 39.04s/it]
updating 9967 recs
68%|██████▊ | 27/40 [19:26<08:48, 40.66s/it]
updating 9972 recs
70%|███████ | 28/40 [20:08<08:14, 41.17s/it]
updating 9976 recs
72%|███████▎ | 29/40 [20:55<07:49, 42.70s/it]
updating 9946 recs
75%|███████▌ | 30/40 [21:47<07:34, 45.49s/it]
updating 9969 recs
78%|███████▊ | 31/40 [22:27<06:34, 43.79s/it]
updating 9971 recs
80%|████████ | 32/40 [23:11<05:51, 43.92s/it]
updating 9952 recs
82%|████████▎ | 33/40 [23:58<05:14, 44.93s/it]
updating 1687 recs
85%|████████▌ | 34/40 [24:08<03:26, 34.38s/it]
updating 9803 recs
88%|████████▊ | 35/40 [24:52<03:06, 37.38s/it]
updating 9830 recs
90%|█████████ | 36/40 [25:45<02:47, 41.94s/it]
updating 9809 recs
92%|█████████▎| 37/40 [26:32<02:10, 43.40s/it]
updating 9753 recs
95%|█████████▌| 38/40 [27:21<01:30, 45.32s/it]
updating 9781 recs
98%|█████████▊| 39/40 [28:08<00:45, 45.78s/it]
updating 9776 recs
100%|██████████| 40/40 [28:57<00:00, 43.43s/it]
大约半个小时完成训练,此时保存idf_model为pkl文件,训练过程就结束了。
未来在进行增量训练时,可以将idf_model重新载入,继续执行新的批次就可以了。
获取数据并进行清洗和分词,这个过程耗时比较长,约40秒完成一万个新闻的清洗分词。
# Clean and segment the articles of one batch, timing the whole step.
tem_df = pkl_dict[some_pkl_file]
tick1 = time.time()
wd_series = tem_df['clean_content'].apply(clean_alpha_punc)
data_list = list(wd_series)
pid_list = list(tem_df['MD5'])
tick2 = time.time()
# Elapsed wall-clock seconds for cleaning + segmentation.
print(tick2 - tick1)
42.98098278045654
接下来,假设新数据有一个 data_list,并有一个 pid_list 与之对应。调用 `cal_tfidf` 就可以计算 TF-IDF 了。
def dict2df(some_dict):
    """Turn the word counts of one document into a long-format table.

    Args:
        some_dict: Mapping with the keys ``'wd_dict'`` (word -> count) and
            ``'pid'`` (document id).

    Returns:
        A DataFrame with the columns ``wd``, ``tf`` and ``pid``; ``tf`` is
        the count divided by the sum of all counts. A document without any
        word yields a single placeholder row ``wd=' '``, ``tf=1``.
    """
    word_counts = some_dict['wd_dict']
    doc_id = some_dict['pid']

    # Empty document: emit the placeholder row so the pid is not lost.
    if not len(word_counts):
        placeholder = pd.DataFrame([{'wd': ' ', 'tf': 1}], columns=['wd', 'tf'])
        placeholder['pid'] = doc_id
        return placeholder

    table = pd.Series(word_counts).reset_index()
    table.columns = ['wd', 'tf']
    total = table['tf'].sum()
    table['tf'] = table['tf'] / total
    table['pid'] = doc_id
    return table
def cal_tfidf(data_list=None, pid_list=None, idf_model=None, top_n=10):
    """Score the words of new documents with TF-IDF and keep the best ones.

    Args:
        data_list: Word lists, one per document.
        pid_list: Document ids, aligned by position with *data_list*.
        idf_model: Dict with the keys ``'pid_set'`` (ids of the documents
            the model was built from) and ``'idf_dict'`` (word -> number of
            those documents containing the word).
        top_n: Number of highest-scoring words kept per document.

    Returns:
        A DataFrame with the columns ``wd``, ``tf``, ``pid``, ``idf`` and
        ``tf_idf``, sorted by ``pid`` and then by descending ``tf_idf``.
        The ``idf`` column holds the document frequency taken from the
        model, not a log-scaled weight.

    Raises:
        ValueError: If *data_list* and *pid_list* differ in length.
    """
    # Model part: corpus size and per-word document frequency.
    model_doc_len = len(idf_model['pid_set'])
    idf_dict = idf_model['idf_dict']
    idf_df = pd.DataFrame({'wd': list(idf_dict.keys()),
                           'idf': list(idf_dict.values())})

    data_list = [] if data_list is None else list(data_list)
    pid_list = [] if pid_list is None else list(pid_list)
    if len(data_list) != len(pid_list):
        raise ValueError('data_list and pid_list must have the same length')

    # TF calculation: one long (wd, tf, pid) table built in a single pass
    # instead of one small DataFrame per document.
    wd_col, tf_col, pid_col = [], [], []
    for pid, words in zip(pid_list, data_list):
        counts = Counter(words)
        if counts:
            total = sum(counts.values())
            for wd, cnt in counts.items():
                wd_col.append(wd)
                tf_col.append(cnt / total)
                pid_col.append(pid)
        else:
            # Placeholder row so that a document without words keeps its pid.
            wd_col.append(' ')
            tf_col.append(1)
            pid_col.append(pid)
    _df1 = pd.DataFrame({'wd': wd_col, 'tf': tf_col, 'pid': pid_col})

    _df2 = pd.merge(_df1, idf_df, how='left', on='wd')
    # Words unknown to the model are treated as if they occurred in every
    # document, which gives them an IDF of log(1) = 0.
    _df2['idf'] = _df2['idf'].fillna(model_doc_len).astype(float)
    # IDF is log(N / df); the log must wrap the whole ratio rather than
    # being divided by the document frequency afterwards.
    _df2['tf_idf'] = _df2['tf'] * np.log(model_doc_len / _df2['idf'])

    _df3 = (
        _df2.sort_values(['tf_idf'], ascending=False)
        .groupby(['pid'])
        .head(top_n)
        .sort_values(['pid', 'tf_idf'], ascending=[True, False])
    )
    return _df3
计算过程不算特别快,但是比分词要快一些。总共耗时约20秒。
# Time the TF-IDF scoring of the batch prepared above.
tick1 = time.time()
tfidf_df = cal_tfidf(data_list=data_list, pid_list=pid_list, idf_model=idf_model, top_n=10)
tick2 = time.time()
# Report the elapsed seconds; without this the measurement is never shown.
print(tick2 - tick1)
18.422481060028076
得到的结果如下,这样做的目的主要是为了保留更多的信息供后续分析
wd tf pid idf tf_idf
1277666 日坤恒 0.017544 0006678CAC3EA76A62E23589002650B6 3.0 0.075164
1277667 顺维 0.017544 0006678CAC3EA76A62E23589002650B6 3.0 0.075164
1277650 恒顺维 0.017544 0006678CAC3EA76A62E23589002650B6 49.0 0.004602
1277665 折价 0.017544 0006678CAC3EA76A62E23589002650B6 2571.0 0.000088
1277669 一览 0.017544 0006678CAC3EA76A62E23589002650B6 2985.0 0.000076
... ... ... ... ... ...
如果要输出每篇文章的特征词:
# Append a comma to every keyword so that concatenating the strings of one
# document yields a comma-separated keyword list.
wd_with_sep = tfidf_df['wd'] + ','
tfidf_df['wd1'] = wd_with_sep
# Summing strings concatenates them: one keyword string per pid.
per_doc = tfidf_df.groupby(['pid'])
per_doc['wd1'].sum()
pid
0006678CAC3EA76A62E23589002650B6 日坤恒,顺维,恒顺维,折价,一览,交易平台,大宗,成交,收盘价,成交量,
000A7DE1E813A460992B58471638A313 景嘉微,华鑫,国产化,频频,模块,算力,国产,终端,芯片,研报,
000CDA4BEA946E91F410796FD04F36C4 慧聪,群项,持作,中于,支出额,小额贷款,神州数码,无形资产,流动资产,中关村,
0016B648D7A4E1002F99F88FFE3F5C90 富邦华,花旗银行,交割日,花旗,信用卡,结清,银行业务,欠款,还款,分期,
001A50AA0100A16B6115FD9FF5820B2C 捐送,惠若琪,惠达,恩泽,捐资助学,卫浴,女排,危难,如一日,韩峰,
这样就完成了预测。
接下来还有两部分改进工作:
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。