赞
踩
目录
可以得出:
从随机森林算法做出的特征重要性排序可以发现:
import pandas as pd

# Load the four raw tables. The listing was recovered from a web page whose
# extraction had replaced the quotes with typographic ones and injected stray
# spaces into two of the file names; plain quotes and space-free paths are
# restored here.
data1 = pd.read_csv('../../user_info.csv')
data2 = pd.read_csv('../../login_day.csv')
data3 = pd.read_csv('../../visit_info.csv')
data4 = pd.read_csv('../../result.csv')

# For every table, report per column whether it contains at least one
# missing value (True) or none at all (False).
for table in (data1, data2, data3, data4):
    print(table.isnull().any())

# display result
# user_id False
# first_order_time False
# first_order_price False
# age_month False
# city_num True
# platform_num False
# model_num False
# app_num False
# dtype: bool
# user_id False
# login_day False
# login_diff_time False
# distance_day False
# login_time False
# launch_time False
# chinese_subscribe_num False
# math_subscribe_num False
# add_friend False
# add_group False
# camp_num False
# learn_num False
# finish_num False
# study_num False
# coupon False
# course_order_num False
# dtype: bool
# user_id False
# main_home False
# main_home2 False
# mainpage False
# schoolreportpage False
# main_mime False
# lightcoursetab False
# main_learnpark False
# partnergamebarrierspage False
# evaulationcenter False
# coupon_visit False
# click_buy False
# progress_bar False
# ppt False
# task False
# video_play False
# video_read False
# next_nize False
# answer_task False
# chapter_module False
# course_tab False
# slide_subscribe False
# baby_info False
# click_notunlocked False
# share False
# click_dialog False
# dtype: bool
# user_id False
# result False
# dtype: bool

# -*-coding:utf-8-*-
import pandas as pd


def _drop_repeated_users(src_path, dst_path):
    """Filter out rows with a repeated user_id and save what remains.

    Reads the CSV at ``src_path``, prints the number of non-null ``user_id``
    values, removes the repeated ids, prints the count again, and writes the
    remaining rows to ``dst_path``.

    Returns the filtered DataFrame.
    """
    raw = pd.read_csv(src_path)
    print(raw['user_id'].count())
    # keep=False discards *every* row whose user_id occurs more than once,
    # i.e. no representative of a repeated id survives.
    unique_rows = raw.drop_duplicates(subset='user_id', keep=False)
    print(unique_rows['user_id'].count())
    # to_csv is called with its defaults, so the row index is written as an
    # extra leading column of the output file.
    unique_rows.to_csv(dst_path)
    return unique_rows


wp1 = _drop_repeated_users('../user_info.csv', '../user_infoPretreatment.csv')
wp2 = _drop_repeated_users('../login_day.csv', '../login_dayPretreatment.csv')
wp3 = _drop_repeated_users('../visit_info.csv', '../visit_infoPretreatment.csv')
wp4 = _drop_repeated_users('../result.csv', '../resultPretreatment.csv')

# display result
# 135968
# 116703
# 135617
# 116416
# 135617
# 116416
# 4639
# 4613

# -*-coding:utf-8-*-
import pandas as pd

# Load the four tables that are to be joined on user_id.
data1 = pd.read_csv('../user_info_tobemerged.csv')
data2 = pd.read_csv('../login_day_tobemerged.csv')
data3 = pd.read_csv('../visit_info_tobemerged.csv')
data4 = pd.read_csv('../result_tobemerged.csv')

# Step 1: login + visit tables. pd.merge defaults to an inner join, so only
# user_ids present in both tables are kept.
result23 = pd.merge(data2, data3, on=['user_id'])
result23.to_csv('../merge23.csv', encoding='gbk')

# Step 2: attach the result table with a left join, keeping every row of
# result23. fillna(0) then replaces every missing value in the merged frame
# (not only those in the columns coming from data4) with 0.
result234 = pd.merge(result23, data4, on=['user_id'], how='left')
result234 = result234.fillna(0)
result234.to_csv('../merge234.csv', encoding='gbk')

# Step 3: attach the user-info columns with a right join, so every row of
# result234 is kept even when data1 has no matching user_id.
result1234 = pd.merge(data1, result234, on=['user_id'], how='right')
result1234.to_csv('../merge1234.csv', encoding='gbk')
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# ------ 1. Load the data ------
df = pd.read_csv('china.csv')  # NOTE: replace with the path to your own data set
# print(df.head())  # show the first 5 rows

# ------ 2. Extract the latitude/longitude data ------
# The whole frame is used as the feature matrix.
# NOTE(review): the plotting code below treats column 0 as latitude and
# column 1 as longitude -- confirm against the actual layout of china.csv.
x = df
x_np = np.array(x)  # NumPy copy of x, used for boolean-mask indexing below

# ------ 3. Build the K-Means estimator ------
n_clusters = 5  # number of clusters
estimator = KMeans(n_clusters=n_clusters)

# ------ 4. Fit the K-Means estimator ------
estimator.fit(x)

# ------ 5. Visualise the clusters ------
markers = ['*', 'v', '+', '^', 's', 'x', 'o']  # marker style per cluster
colors = ['r', 'g', 'm', 'c', 'y', 'b', 'orange']  # marker colour per cluster
labels = estimator.labels_  # cluster label assigned to each row
plt.figure(figsize=(9, 6))
plt.xlabel('East Longitude', fontsize=18)
plt.ylabel('North Latitude', fontsize=18)

# Draw one scatter series per cluster so each gets its own marker and colour.
for i in range(n_clusters):
    members = labels == i  # boolean mask selecting the rows of cluster i
    plt.scatter(
        x_np[members, 1],  # x axis: second column (longitude)
        x_np[members, 0],  # y axis: first column (latitude)
        marker=markers[i],
        c=colors[i],
    )
plt.grid()
plt.show()

Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。