
PSM+DID Effect Evaluation: A Python Demo (Linear Classification Model + Difference-in-Differences)


Background:

        The strategy could not be rolled out with random traffic splitting; it was fully launched to a certain subpopulation. We therefore need to evaluate it by constructing a similar (matched) population.

Evaluation Plan:

        1. Use PSM to build a matched population, ensuring that the treatment and control groups track each other on the evaluation metric during the AA (pre-launch) period.

        2. Use DID to evaluate the experiment and confirm the strategy's impact on the treatment group (see the model sketch below).
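For reference, the DID model fitted in section 2.3 regresses the metric on a post-launch dummy, a treatment-group dummy, and their interaction; the interaction coefficient is the incremental effect:

    Y = \beta_0 + \beta_1\,\text{strategy} + \beta_2\,\text{group} + \beta_3\,(\text{strategy} \times \text{group}) + \varepsilon

Here strategy is the post-launch period indicator, group is the treatment-group indicator, and \beta_3 (the `tg` term in the code below) is the DID estimate of the strategy's effect.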

1. Building a Matched Population

1.1 Importing Packages

    import warnings
    warnings.filterwarnings('ignore')
    # from pymatch.Matcher import Matcher
    import pandas as pd
    import numpy as np
    %matplotlib inline
    from scipy import stats
    import matplotlib.pyplot as plt
    import patsy
    from statsmodels.genmod.generalized_linear_model import GLM
    import statsmodels.api as sm
    import seaborn as sns
    import sys
    import os
    from jinja2 import Template
    import prestodb
    from pyhive import hive
    import matplotlib

    # Set the default font (SimHei, so Chinese labels render correctly)
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['font.family'] = 'sans-serif'
    # Keep the minus sign '-' from rendering as a box
    matplotlib.rcParams['axes.unicode_minus'] = False

    def presto_read_sql_df(sql):
        conn = prestodb.dbapi.connect(
            host=os.environ['PRESTO_HOST'],
            port=os.environ['PRESTO_PORT'],
            user=os.environ['JUPYTER_HADOOP_USER'],
            password=os.environ['HADOOP_USER_PASSWORD'],
            catalog='hive')
        cur = conn.cursor()
        cur.execute(sql)
        query_result = cur.fetchall()
        colnames = [part[0] for part in cur.description]
        raw = pd.DataFrame(query_result, columns=colnames, dtype=np.float64)
        return raw

    def hive_read_sql_df(sql):
        conn = hive.connect(
            host=os.environ['PYHIVE_HOST'],
            port=os.environ['PYHIVE_PORT'],
            username=os.environ['JUPYTER_HADOOP_USER'],
            password=os.environ['HADOOP_USER_PASSWORD'],
            auth='LDAP',
            configuration={'mapreduce.job.queuename': os.environ['JUPYTER_HADOOP_QUEUE'],
                           'hive.resultset.use.unique.column.names': 'false'})
        raw = pd.read_sql_query(sql, conn)
        return raw

1.2 Data Acquisition

    # 1.2.1 Query the data via SQL
    # df = hive_read_sql_df("""
    # select
    #     XXXX
    # from
    #     XXXXX
    # where
    #     XXXXX
    # """)
    # df.to_csv('file_name.csv', encoding='gbk', sep=',', index=False)

    # 1.2.2 Read the data from a local file
    df = pd.read_csv('file_name.csv', encoding='gbk')
    y_date = df
    y_date.dtypes  # check the dtype of each column

1.3 Model Building

    # Set the matching covariates and the group label
    X_field = [
        'XXXX', 'XXXX', 'XXXX'  # matching covariate column names
    ]
    Y_field = ['is_group']
    field = X_field + Y_field
    field
    # Cities to loop over; if you only need to match a single group, no loop is needed
    city_list = {
        # 'XXXX', 'XXXX', 'XXXX'
    }
    city_psm_dri = None
    for x in city_list:
        city_name = x
        print('=' * 20 + 'City: ' + city_name + '=' * 20)
        query_str = "city_name=='" + city_name + "'"
        city_info = y_date.query(query_str)

        # Filter the data: drop columns and rows that are not needed
        city_info = city_info.drop(columns=["city_name"])     # drop the city-name column
        city_info = city_info.query("is_strategy_time=='0'")  # keep only the pre-launch (AA) period
        city_info = city_info.query("XXX > 0")

        # Min-max normalization (optional)
        city_info.online_days = (city_info.online_days - city_info.online_days.min()) \
            / (city_info.online_days.max() - city_info.online_days.min())
        data = city_info.dropna()

        # Split treatment / control samples
        treated = data[data['is_group'] == 1]
        control = data[data['is_group'] == 0]
        ttest_result_t = pd.DataFrame(stats.ttest_ind(treated, control))
        data_p = data

        # Build a logistic (linear classification) propensity model
        print("-" * 20 + 'Model definition' + "-" * 20)
        y_f, x_f = patsy.dmatrices('{} ~ {}'.format(Y_field[0], '+'.join(X_field)),
                                   data=data_p, return_type='dataframe')
        formula = '{} ~ {}'.format(Y_field[0], '+'.join(X_field))
        print('Formula: ' + formula)

        i = 0
        nmodels = 100
        errors = 0
        model_accuracy = []
        models = []
        while i < nmodels and errors < 5:
            sys.stdout.write('\r{}: {}/{}'.format("Fitting Models on Balanced Samples", i, nmodels))
            # Sample from the majority class to create a balanced dataset
            df = pd.concat([control.sample(len(treated)), treated]).dropna()
            y_samp, X_samp = patsy.dmatrices(formula, data=df, return_type='dataframe')
            glm = GLM(y_samp, X_samp, family=sm.families.Binomial())  # logistic regression
            try:
                res = glm.fit()
                preds = [1.0 if i >= .5 else 0.0 for i in res.predict(X_samp)]
                preds = pd.DataFrame(preds)
                preds.columns = y_samp.columns
                b = y_samp.reset_index(drop=True)
                a = preds.reset_index(drop=True)
                # Prediction accuracy of this model
                ab_score = ((a.sort_index().sort_index(axis=1)
                             == b.sort_index().sort_index(axis=1)).sum() * 1.0 / len(y_samp)).values[0]
                model_accuracy.append(ab_score)
                models.append(res)
                i += 1
            except Exception as e:
                errors += 1  # avoid an infinite loop for a misspecified matrix
                print('\nError: {}'.format(e))
        # Average accuracy over all balanced models
        print("\nAverage Accuracy:", "{}%".format(round(np.mean(model_accuracy) * 100, 2)))
        print('Fitting 1 (Unbalanced) Model...')
        if errors >= 5:  # abnormal city
            print("[Warning] " + city_name + ": abnormal data, this city was not recorded\n")
            city_info.to_csv(city_name + '.csv', encoding='gbk', sep=',', index=False)
            continue
        glm = GLM(y_f, x_f, family=sm.families.Binomial())
        res = glm.fit()
        preds = [1.0 if i >= .5 else 0.0 for i in res.predict(x_f)]
        preds = pd.DataFrame(preds)
        preds.columns = y_f.columns
        b = y_f.reset_index(drop=True)
        a = preds.reset_index(drop=True)
        ab_score = ((a.sort_index().sort_index(axis=1)
                     == b.sort_index().sort_index(axis=1)).sum() * 1.0 / len(y_f)).values[0]
        model_accuracy.append(ab_score)
        models.append(res)
        print("\nAccuracy", round(np.mean(model_accuracy[0]) * 100, 2))

        # Average the propensity score over all fitted models
        scores = np.zeros(len(x_f))
        for i in range(nmodels):
            m = models[i]
            scores += m.predict(x_f)
        data_p['scores'] = scores / nmodels

        # Plot the score distributions before matching
        plt.figure(figsize=(10, 5))
        sns.distplot(data_p[data_p[Y_field[0]] == 0].scores, label='Control')
        sns.distplot(data_p[data_p[Y_field[0]] == 1].scores, label='Test')
        plt.legend(loc='upper right')
        plt.xlim((0, 1))
        plt.title("Propensity Scores Before Matching")
        plt.ylabel("Percentage (%)")
        plt.xlabel("Scores")

        # Nearest-neighbor matching on the propensity score
        threshold = 0.00001
        method = 'min'  # match each test sample to its nearest control sample
        nmatches = 1
        test_scores = data_p[data_p[Y_field[0]] == True][['scores']]
        ctrl_scores = data_p[data_p[Y_field[0]] == False][['scores']]
        result, match_ids = [], []
        for i in range(len(test_scores)):
            if i % 10 == 0:
                sys.stdout.write('\r{}: {}'.format("Fitting Samples", i))
            match_id = i
            score = test_scores.iloc[i]
            if method == 'random':
                bool_match = abs(ctrl_scores - score) <= threshold
                matches = ctrl_scores.loc[bool_match[bool_match.scores].index]
            elif method == 'min':
                matches = abs(ctrl_scores - score).sort_values('scores').head(nmatches)
            else:
                raise AssertionError("Invalid method parameter, use ('random', 'min')")
            if len(matches) == 0:
                continue
            # Randomly choose nmatches indices if len(matches) > nmatches
            # (max_rand must be defined if method == 'random')
            select = nmatches if method != 'random' else np.random.choice(range(1, max_rand + 1), 1)
            chosen = np.random.choice(matches.index, min(select, nmatches), replace=False)
            result.extend([test_scores.index[i]] + list(chosen))
            match_ids.extend([i] * (len(chosen) + 1))
            ctrl_scores = ctrl_scores.drop(chosen, axis=0)  # match without replacement
        matched_data = data_p.loc[result]
        matched_data['match_id'] = match_ids
        matched_data['record_id'] = matched_data.index
        print("\nMatching results:")
        print(len(matched_data[matched_data['is_group'] == 0]['record_id'].unique()))
        print(len(matched_data[matched_data['is_group'] == 1]['record_id'].unique()))

        # Plot the score distributions after matching
        plt.figure(figsize=(10, 5))
        sns.distplot(matched_data[matched_data[Y_field[0]] == 0].scores, label='Control')
        sns.distplot(matched_data[matched_data[Y_field[0]] == 1].scores, label='Test')
        plt.legend(loc='upper right')
        plt.xlim((0, 1))
        plt.title("Propensity Scores After Matching")
        plt.ylabel("Percentage (%)")
        plt.xlabel("Scores")

        # Accumulate per-city results
        matched_data['city_name'] = city_name
        file_name = 'result' + city_name + '.csv'
        matched_data.to_csv(file_name, encoding='gbk', sep=',', index=False)
        if city_psm_dri is None:
            city_psm_dri = matched_data
        else:
            city_psm_dri = pd.concat([city_psm_dri, matched_data])
    city_psm_dri.to_csv('result.csv', encoding='gbk', sep=',', index=False)
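The loop above is essentially a hand-rolled version of the commented-out pymatch.Matcher workflow from section 1.1. If pymatch is installed, a roughly equivalent flow looks like this minimal sketch, based on pymatch's documented API and using the same per-city treated/control split built above:

    from pymatch.Matcher import Matcher

    # treated / control are the per-city samples built above; is_group is the label.
    m = Matcher(treated, control, yvar='is_group')
    m.fit_scores(balance=True, nmodels=100)  # fit 100 models on balanced samples
    m.predict_scores()                       # average propensity score per row
    m.plot_scores()                          # score distributions before matching
    m.match(method="min", nmatches=1, threshold=0.00001)
    matched_data = m.matched_data

The hand-rolled version is kept above because it exposes the intermediate scores and allows the per-city filtering and CSV export shown in the loop.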

2. DID Effect Evaluation

2.1 Importing Packages

    import statsmodels.formula.api as smf
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import matplotlib
    import prestodb
    import os
    from pyhive import hive

    # Set the default font (SimHei, so Chinese labels render correctly)
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['font.family'] = 'sans-serif'
    # Keep the minus sign '-' from rendering as a box
    matplotlib.rcParams['axes.unicode_minus'] = False

    def presto_read_sql_df(sql):
        conn = prestodb.dbapi.connect(
            host=os.environ['PRESTO_HOST'],
            port=os.environ['PRESTO_PORT'],
            user=os.environ['JUPYTER_HADOOP_USER'],
            password=os.environ['HADOOP_USER_PASSWORD'],
            catalog='hive')
        cur = conn.cursor()
        cur.execute(sql)
        query_result = cur.fetchall()
        colnames = [part[0] for part in cur.description]
        raw = pd.DataFrame(query_result, columns=colnames, dtype=np.float64)
        return raw

    def hive_read_sql_df(sql):
        conn = hive.connect(
            host=os.environ['PYHIVE_HOST'],
            port=os.environ['PYHIVE_PORT'],
            username=os.environ['JUPYTER_HADOOP_USER'],
            password=os.environ['HADOOP_USER_PASSWORD'],
            auth='LDAP',
            configuration={'mapreduce.job.queuename': os.environ['JUPYTER_HADOOP_QUEUE'],
                           'hive.resultset.use.unique.column.names': 'false'})
        raw = pd.read_sql_query(sql, conn)
        return raw

2.2 Reading and Validating the Data

    df = pd.read_csv('20220617171533.csv', encoding='gbk')
    print('shape: ' + str(df.shape))
    df.dtypes
    df.head().T
    df.describe()

2.3 Incremental Effect Evaluation

    city_list = {
        'XXXX'
    }
    data = df
    city_num = 0
    for city_name in city_list:
        city_num = city_num + 1
        city_info = data.query("city_name=='" + city_name + "'")
        print("=" * 20 + str(city_num) + "=" * 20)
        # print(city_info.shape)
        value = list(city_info.tsh)
        group = list(city_info.is_group)
        strategy = list(city_info.is_strategy_time)
        tg = list(city_info.tg)  # tg: the strategy x group interaction (DID) term
        aa = pd.DataFrame({'strategy': strategy, 'group': group, 'tg': tg, 'value': value})
        X = aa[['strategy', 'group', 'tg']]
        y = aa['value']
        est = smf.ols(formula='value ~ strategy + group + tg', data=aa).fit()
        y_pred = est.predict(X)
        aa['value_pred'] = y_pred
        # Strategy increment
        dat_tsh = est.params.tg / (est.params.strategy + est.params.group + est.params.Intercept)
        p_value = est.pvalues.tg
        if p_value < 0.05:
            print(city_name + ": dTSH = %.2f %%, significant" % (dat_tsh * 100))
        else:
            print(city_name + ": dTSH = %.2f %%, not significant" % (dat_tsh * 100))
        # print(est.params)
        # print(est.pvalues)
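The relative increment printed above is the interaction coefficient divided by the model's counterfactual prediction for the treatment group in the post-launch period, i.e. what the treated value would have been without the strategy:

    \Delta TSH = \frac{\hat\beta_{tg}}{\hat\beta_{Intercept} + \hat\beta_{strategy} + \hat\beta_{group}}

The denominator is the fitted value with strategy = 1, group = 1, and the interaction forced to zero, so the ratio reads as a percentage lift over the no-strategy baseline.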

2.4 Trend Plotting

    def get_trend_plot(matched_long_new, city_name):
        yx_trend = matched_long_new.groupby(['is_group', 'mon']).tsh.mean().reset_index()
        fig, axes = plt.subplots(1, 1, figsize=(10, 4), sharex=False, subplot_kw=dict(frameon=False))
        x = yx_trend.loc[yx_trend.is_group == 1, 'mon'].astype(str)
        y1 = yx_trend.loc[yx_trend.is_group == 1, 'tsh']
        y2 = yx_trend.loc[yx_trend.is_group == 0, 'tsh']
        axes.plot(x, y1, color='coral', linestyle='-', marker='o', label='Treatment')
        axes.plot(x, y2, color='cornflowerblue', linestyle='--', marker='o', label='Control')
        # Use the last month of the matching period (is_strategy_time == 0) as the launch cutoff line
        axes.axvline(x=matched_long_new.loc[matched_long_new.is_strategy_time == 0].mon.max(),
                     color='red', linestyle='--')
        axes.set_xlabel(None)
        axes.set_ylabel('TSH', rotation=0, labelpad=20)
        # axes.set_xticklabels(labels=[i.replace('2021-', '') for i in x], rotation=40)
        axes.legend()
        axes.set_title('%s: XXXXX effect by month' % city_name)
        plt.show()

    city_num = 0
    for city_name in city_list:
        city_num = city_num + 1
        city_info = data.query("city_name=='" + city_name + "'")
        get_trend_plot(city_info, city_name)
        # if city_num == 2:
        #     break

If the treatment and control trends are very similar before the strategy launch, the PSM match can be considered good and the estimated strategy increment is credible.

If the pre-launch gap between the treatment and control trends is unstable, the PSM populations are not similar enough, the confidence in the estimated increment is low, and re-matching is recommended (one way to quantify this is the placebo check sketched below).
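Beyond eyeballing the plot, a minimal way to quantify pre-period similarity is a placebo DID on the pre-launch data only: split the AA period at an arbitrary month and check that the placebo interaction is insignificant. This sketch is not part of the original workflow; it assumes the same df with columns is_group, is_strategy_time, mon, and tsh, and mon_cut is a hypothetical cutoff you pick inside the AA period:

    import statsmodels.formula.api as smf

    # Keep only the pre-launch (AA) period.
    pre = df[df.is_strategy_time == 0].copy()

    # Hypothetical placebo "post" indicator: split the AA period at mon_cut.
    mon_cut = '2022-03'  # placeholder month inside the AA period
    pre['placebo_post'] = (pre.mon.astype(str) > mon_cut).astype(int)

    # Under parallel trends, the placebo interaction coefficient should be
    # close to zero and statistically insignificant.
    placebo = smf.ols('tsh ~ placebo_post * is_group', data=pre).fit()
    print(placebo.params['placebo_post:is_group'],
          placebo.pvalues['placebo_post:is_group'])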

Closing Remarks:

1. When choosing PSM matching covariates, make sure each metric reflects a sample's own behavior and is not itself determined by which of the two groups (A or B) the sample belongs to.

2. When building the matched population, consider whether the two populations already differ in subjective self-selection, i.e. whether a user ends up in group A or group B at random. If there is a self-selection difference, then even if PSM aligns many behavioral metrics, the populations are still not homogeneous: their latent response to the strategy may differ, inflating or deflating the measured effect.

For example, suppose group A is all Tmall merchants and group B is all Taobao merchants, and we select an A-like population from all of B.

        First, becoming an A or a B merchant is itself a subjective choice, so the two samples are already non-homogeneous and may differ in their sensitivity to the strategy.

        Second, suppose the matching covariates include a shop-rating metric. Both groups have shop ratings, but the rating systems of A and B are different, so this kind of metric is unsuitable as a matching covariate.

        (The example is not a perfect fit, and I cannot discuss my own business, but I trust you can get what I mean.)

My articles cover relatively introductory data-analysis methods. They do not dwell on theory; instead they focus on how to solve business problems, so you can put the methods to use at work right away.

If this helped your work and saved you some time, feel free to tip generously; one or two yuan is appreciated~
