当前位置:   article > 正文

基于分类算法的学习失败预警【可视化】

基于分类算法的学习失败预警

实验要求

能够对数据进行预处理;能够使用随机森林算法构建学习失败预警模型;能够通过网格搜索方法对随机森林算法各项参数进行优化;能够应用scikit-learn中的支持向量机、逻辑回归和AdaBoost算法进行对比试验。

  1. 对数据进行预处理;
  2. 处理数据不平衡;
  3. 样本生成及标准化处理;
  4. 使用随机森林算法构建模型;
  5. 结果分析与可视化;
  6. 特征重要性分析;
  7. 与其他算法比较,并计算不同算法的准确率、查全率、F1值和AUC指标;
  8. 对几种模型的结果进行分析。

 代码部分:

   main.py
  1. import pandas as pd
  2. import numpy as np
  3. from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
  4. from sklearn.linear_model import LogisticRegression
  5. from sklearn.metrics import recall_score, precision_score, accuracy_score, roc_auc_score, f1_score
  6. from sklearn.model_selection import train_test_split
  7. from sklearn.preprocessing import StandardScaler
  8. from imblearn.over_sampling import SMOTE
  9. from sklearn.tree import DecisionTreeClassifier
  10. from sklearn.utils import resample
  11. from sklearn.svm import SVC
  12. import tkinter as tk
  13. from tkinter import Label, Button, Toplevel
  14. from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
  15. import matplotlib.pyplot as plt
  16. from pages import show_chart
# Display option: let pandas show up to 100 columns when printing a DataFrame.
pd.set_option('display.max_columns', 100)
  19. # 数据加载与预处理
  20. def load_and_preprocess_data():
  21. df = pd.read_csv('uwide.csv', encoding='utf-8')
  22. factor = pd.factorize(df['SEX']) #因子化,转换成整数编码
  23. df['SEX'] = factor[0]
  24. df = df.fillna(0)#缺失值替换为0
  25. df['ssate'] = np.where(df['TOTALSCORE'] >= 60, 1, 0) #新建ssate列,总成绩大于等于60的为1,否则为0
  26. df = df[['BROWSER_COUNT', 'COURSE_COUNT', 'COURSE_AVG_SCORE', 'EXAM_AH_SCORE', 'EXAM_WRITEN_SCORE', 'EXAM_MIDDLE_SCORE',
  27. 'EXAM_LAB', 'EXAM_PROGRESS', 'EXAM_GROUP_SCORE', 'EXAM_FACE_SCORE', 'EXAM_ONLINE_SCORE', 'NODEBB_CHANNEL_COUNT',
  28. 'NODEBB_TOPIC_COUNT', 'COURSE_SUM_VIDEO_LEN', 'SEX', 'GRADE', 'EXAM_HOMEWORK', 'EXAM_LABSCORE', 'EXAM_OTHERSCORE',
  29. 'NODEBB_PARTICIPATIONRATE', 'COURSE_WORKTIME', 'COURSE_WORKCOMPLETERATE', 'NODEBB_POSTSCOUNT',
  30. 'NODEBB_NORMALBBSPOSTSCOUONT', 'NODEBB_REALBBSARCHIVECOUNT', 'NORMALBBSARCHIVECOUNT', 'COURSE_WORKCOUNT',
  31. 'HOMEWORKSCORE', 'WRITTENASSIGNMENTSCORE', 'MIDDLEASSIGNMENTSCORE', 'ssate']] #选择需要的特征和标签
  32. return df
  33. def balance_data(df):#处理数据不平衡,下采样
  34. df_major = df[df.ssate == 1]
  35. df_minor = df[df.ssate == 0]
  36. df_major_down = df_major
  37. if len(df_major) > len(df_minor) * 8:
  38. new_major_count = len(df_minor) * 8
  39. df_major_down = resample(df_major, replace=False, n_samples=new_major_count, random_state=66)
  40. df_balanced = pd.concat([df_major_down, df_minor])
  41. return df_balanced
  42. def split_and_scale_data(df):#划分训练集和测试集
  43. X = df.iloc[:, :-1].values
  44. Y = df.iloc[:, -1].values
  45. X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=27)
  46. sm = SMOTE(random_state=27)#合成少数过采样技术
  47. X_train_res, Y_train_res = sm.fit_resample(X_train, Y_train)
  48. scaler = StandardScaler() #特征进行标准化,使它们具有均值为 0 和方差为 1
  49. X_train_res = scaler.fit_transform(X_train_res)#计算均值和方差后标准化
  50. X_test = scaler.transform(X_test)
  51. return X_train_res, X_test, Y_train_res, Y_test
  52. # 模型训练与评价
  53. def evaluate_model(clf, X_train, Y_train, X_test, Y_test):
  54. clf.fit(X_train, Y_train)
  55. Y_pred = clf.predict(X_test)
  56. Y_pred_proba = clf.predict_proba(X_test)[:, 1] if hasattr(clf, "predict_proba") else Y_pred #计算每个样本属于每个类别的概率
  57. return {
  58. "model": clf, # 返回训练好的模型对象
  59. "accuracy": accuracy_score(Y_test, Y_pred),
  60. "recall": recall_score(Y_test, Y_pred),#所有实际为正例的样本中,模型正确预测为正例的比例
  61. "precision": precision_score(Y_test, Y_pred),#所有模型预测为正例的样本中,实际为正例的比例
  62. "f1_score": f1_score(Y_test, Y_pred),#精确率和召回率的调和平均值
  63. "roc_auc": roc_auc_score(Y_test, Y_pred_proba)#模型预测结果的可靠性和有效性
  64. }
  65. # 计算特征重要性
  66. def calculate_feature_importance(clf, feature_names):
  67. importances = clf.feature_importances_ #随机森林训练后得到的特征重要性数组
  68. indices = np.argsort(importances)[::-1] #对特征重要性进行降序排序,并返回索引
  69. feature_importance = []
  70. for f in range(len(feature_names)):
  71. feature_importance.append((feature_names[indices[f]], importances[indices[f]]))
  72. return feature_importance
  73. # 显示特征重要性图表
  74. def visualize_feature_importance(feature_importances, parent_window=None):
  75. new_window = Toplevel()#顶级窗口对象
  76. new_window.title("Feature Importance Visualization")
  77. window_width = 1400
  78. window_height = 800
  79. screen_width = new_window.winfo_screenwidth()
  80. screen_height = new_window.winfo_screenheight()
  81. # 计算窗口左上角坐标使其居中
  82. window_x = (screen_width - window_width) // 2
  83. window_y = (screen_height - window_height) // 2 - 50
  84. new_window.geometry(f"{window_width}x{window_height}+{window_x}+{window_y}")
  85. new_window.configure(bg="AliceBlue")
  86. new_window.resizable(True, True)
  87. fig, ax = plt.subplots(figsize=(12, 8)) # 增加图形的高度和宽度
  88. y_pos = range(len(feature_importances))
  89. ax.barh(y_pos, [imp[1] for imp in feature_importances], align="center")#绘制水平条形图
  90. ax.set_yticks(y_pos)#设置 y 轴刻度位置
  91. ax.set_yticklabels([imp[0] for imp in feature_importances], fontsize=10) # 调整字体大小
  92. ax.set_xlabel("Feature Importance")
  93. ax.set_ylabel("Feature")
  94. ax.set_title("Random Forest Feature Importance")
  95. plt.tight_layout() # 确保图形不重叠
  96. canvas = FigureCanvasTkAgg(fig, master=new_window)#创建一个 FigureCanvasTkAgg 对象,将之前创建的 Matplotlib 图形 fig 嵌入到 new_window 主窗口中
  97. canvas.draw()#绘制
  98. canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1)#expand = 1 允许控件在窗口的所有可用空间内扩展,以填充其父容器
  99. if parent_window:
  100. parent_window.attributes('-topmost', False) # 取消主界面的顶层设置
  101. new_window.mainloop()
  102. # 显示随机森林结果可视化
  103. def visualize_rf_results(results, parent_window=None):
  104. new_window = Toplevel()
  105. new_window.title("Random Forest Model Results Visualization")
  106. window_width = 1400
  107. window_height = 900
  108. screen_width = new_window.winfo_screenwidth()
  109. screen_height = new_window.winfo_screenheight()
  110. # 计算窗口左上角坐标使其居中
  111. window_x = (screen_width - window_width) // 2
  112. window_y = (screen_height - window_height) // 2 - 50
  113. new_window.geometry(f"{window_width}x{window_height}+{window_x}+{window_y}")
  114. new_window.configure(bg="AliceBlue")
  115. new_window.resizable(False, False)
  116. fig, ax = plt.subplots(2, 2, figsize=(12, 10)) #创建了一个包含 2 行 2 列的子图网格
  117. metrics = ["accuracy", "recall", "precision", "f1_score"]
  118. for i, metric in enumerate(metrics):
  119. row, col = i // 2, i % 2 #计算位置
  120. ax[row, col].bar(["Random Forest"], [results["Random Forest"][metric]], width=0.4) #绘制柱状图
  121. ax[row, col].set_title(metric.capitalize(), fontsize=14) #标题字母大写
  122. # 显示数值,并设置精度
  123. for bar in ax[row, col].patches:#遍历当前子图中的每个柱状图条
  124. ax[row, col].annotate(f"{bar.get_height():.8f}", (bar.get_x() + bar.get_width() / 2, bar.get_height()),
  125. ha='center', va='center', xytext=(0, 5), textcoords='offset points') #设置具体的数值,像素偏移方式
  126. plt.tight_layout()#自动调整子图参数
  127. canvas = FigureCanvasTkAgg(fig, master=new_window)
  128. canvas.draw()
  129. canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1)
  130. if parent_window:
  131. parent_window.attributes('-topmost', False) # 取消主界面的顶层设置
  132. new_window.mainloop()
  133. # 显示欢迎页面
  134. def show_welcome_screen():
  135. welcome_window = tk.Tk()
  136. welcome_window.overrideredirect(True)#无边框的顶级窗口
  137. # 加载并显示图片
  138. welcome_photo = tk.PhotoImage(file='logo.png')
  139. label_photo = tk.Label(welcome_window, image=welcome_photo)
  140. label_photo.pack() # 使用pack方法居中显示图片
  141. image_width = welcome_photo.width()
  142. image_height = welcome_photo.height()
  143. # 获取屏幕宽度和高度
  144. screen_width = welcome_window.winfo_screenwidth()
  145. screen_height = welcome_window.winfo_screenheight()
  146. # 设置窗口位置居中
  147. window_x = (screen_width - image_width) // 2
  148. window_y = (screen_height - image_height) // 2
  149. welcome_window.geometry(f"{image_width}x{image_height}+{window_x}+{window_y}")
  150. welcome_window.attributes('-topmost', True)
  151. welcome_window.after(2000, lambda: welcome_window.destroy()) # 2秒后销毁欢迎页面
  152. welcome_window.mainloop()
  153. # 显示主页面
  154. def show_main_window(results, X_train, feature_names):
  155. mainwindow = tk.Tk()
  156. mainwindow.title("主页面")
  157. window_width = 1200
  158. window_height = 600
  159. # 获取屏幕宽度和高度
  160. screen_width = mainwindow.winfo_screenwidth()
  161. screen_height = mainwindow.winfo_screenheight()
  162. # 计算窗口左上角坐标使其居中
  163. window_x = (screen_width - window_width) // 2
  164. window_y = (screen_height - window_height) // 2 - 50
  165. mainwindow.geometry(f"{window_width}x{window_height}+{window_x}+{window_y}")
  166. mainwindow.configure(bg="AliceBlue")
  167. mainwindow.resizable(False, False)
  168. # 添加label控件
  169. l_main_1 = Label(mainwindow, text='本代码实现了随机森林、支持向量机、逻辑回归和AdaBoost算法', height=10, width=50,
  170. bg="AliceBlue", font=('微软雅黑 20 bold'))
  171. l_main_1.place(x=165, y=-110)
  172. l_main_2 = Label(mainwindow, text='点击查看不同算法性能对比', height=5, width=20, bg="AliceBlue",
  173. font=('微软雅黑 15 bold'))
  174. l_main_2.place(x=110, y=370)
  175. photo_xiajiantou = tk.PhotoImage(file='向下箭头.png')
  176. photo_xiajiantou = photo_xiajiantou.subsample(3, 3)
  177. l_main_3 = Label(mainwindow, height=65, width=50, bg="AliceBlue", image=photo_xiajiantou)
  178. l_main_3.place(x=70, y=420)
  179. l_main_4 = Label(mainwindow, text='点击查看随机森林训练结果', height=5, width=20, bg="AliceBlue",
  180. font=('微软雅黑 20 bold'))
  181. l_main_4.place(x=365, y=100)
  182. photo_rightjiantou = tk.PhotoImage(file='右箭头.png')
  183. photo_rightjiantou = photo_rightjiantou.subsample(3, 3)
  184. l_main_5 = Label(mainwindow, height=65, width=50, bg="AliceBlue", image=photo_rightjiantou)
  185. l_main_5.place(x=715, y=160)
  186. l_main_6 = Label(mainwindow, text='特征重要性分析', height=5, width=20, bg="AliceBlue",
  187. font=('微软雅黑 20 bold'))
  188. l_main_6.place(x=470, y=228)
  189. photo_leftjiantou = tk.PhotoImage(file='左箭头.png')
  190. photo_leftjiantou = photo_leftjiantou.subsample(3, 3)
  191. l_main_7 = Label(mainwindow, height=65, width=50, bg="AliceBlue", image=photo_leftjiantou)
  192. l_main_7.place(x=467, y=290)
  193. # 添加button控件
  194. button_data = [
  195. ("Accuracy", [results[model]["accuracy"] for model in results],40,500),
  196. ("Recall", [results[model]["recall"] for model in results],275,500),
  197. ("Precision", [results[model]["precision"] for model in results], 510,500),
  198. ("F1 Score", [results[model]["f1_score"] for model in results],745,500),
  199. ("ROC AUC", [results[model]["roc_auc"] for model in results],980,500),
  200. ("Feature Importance", lambda: visualize_feature_importance(
  201. calculate_feature_importance(results["Random Forest"]["model"], feature_names), mainwindow),
  202. 160, 290),
  203. ("RF Results", lambda: visualize_rf_results(results,mainwindow), 795, 160)
  204. ]
  205. for text, data, x, y in button_data:
  206. if callable(data):
  207. btn = Button(mainwindow, text=text, font=('微软雅黑 20 bold'), height=1, width=16, command=data)
  208. else:
  209. btn = Button(mainwindow, text=text, font=('微软雅黑 20 bold'), height=1, width=9,
  210. command=lambda d=data, t=text.lower(): show_chart(t, d,mainwindow))
  211. btn.place(x=x, y=y)
  212. mainwindow.attributes('-topmost', True)
  213. mainwindow.mainloop()
  214. # 主函数
  215. if __name__ == "__main__":
  216. df = load_and_preprocess_data()
  217. df_balanced = balance_data(df)
  218. X_train, X_test, Y_train, Y_test = split_and_scale_data(df_balanced)
  219. models = {
  220. "Random Forest": RandomForestClassifier(criterion='entropy', random_state=27, n_jobs=-1),
  221. "Support Vector Machine": SVC(probability=True, random_state=27),
  222. "Logistic Regression": LogisticRegression(),
  223. "AdaBoost": AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=500, learning_rate=0.5, algorithm='SAMME')
  224. }
  225. results = {}
  226. for name, clf in models.items():
  227. results[name] = evaluate_model(clf, X_train, Y_train, X_test, Y_test)
  228. feature_names = df.columns[:-1]
  229. show_welcome_screen()
  230. show_main_window(results, X_train, feature_names)
   pages.py
  1. import tkinter as tk
  2. from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
  3. import matplotlib.pyplot as plt
  4. import numpy as np
  5. def show_chart(chart_type, shuju, parent_window=None):
  6. # 设置算法名称
  7. algorithms = ['Random Forest', 'Support Vector Machine', 'Logistic Regression', 'AdaBoost']
  8. # 设置柱状图的x轴位置和宽度
  9. x = np.arange(len(algorithms))
  10. width = 0.35
  11. # 创建Matplotlib图形
  12. fig, ax = plt.subplots(figsize=(10, 6))
  13. # 绘制柱状图
  14. rects1 = ax.bar(x, shuju, width, color=['gray', 'pink', 'orange', 'gold'])
  15. # 添加一些文本用于标签、标题和自定义x轴刻度标签
  16. ax.set_xlabel('Algorithms')
  17. ax.set_ylabel(chart_type)
  18. ax.set_title(f'Comparison of Different Algorithms {chart_type}')
  19. ax.set_xticks(x)
  20. ax.set_xticklabels(algorithms)
  21. # 修改图例颜色
  22. custom_legend = ax.legend([rects1], [chart_type], loc='lower center', fontsize=10, frameon=True, facecolor='white',
  23. edgecolor='black')
  24. for text in custom_legend.get_texts():
  25. text.set_color('black')
  26. # 添加数据标签
  27. def autolabel(rects):
  28. """在每个条形图上方添加文本标签"""
  29. for rect in rects:
  30. height = rect.get_height()
  31. ax.annotate('{}'.format(height),
  32. xy=(rect.get_x() + rect.get_width() / 2, height),
  33. xytext=(0, 3),
  34. textcoords="offset points",
  35. ha='center', va='bottom')
  36. autolabel(rects1)
  37. # 自动调整子图布局
  38. fig.tight_layout()
  39. # 创建Tkinter应用程序窗口
  40. root = tk.Tk()
  41. root.title(f"算法{chart_type}对比")
  42. window_width = 1200
  43. window_height = 600
  44. screen_width = root.winfo_screenwidth()
  45. screen_height = root.winfo_screenheight()
  46. # 计算窗口左上角坐标使其居中
  47. window_x = (screen_width - window_width) // 2
  48. window_y = (screen_height - window_height) // 2 - 50
  49. root.geometry(f"{window_width}x{window_height}+{window_x}+{window_y}")
  50. root.configure(bg="AliceBlue")
  51. root.resizable(False, False)
  52. # 创建Matplotlib的画布,并将其嵌入到Tkinter窗口中
  53. canvas = FigureCanvasTkAgg(fig, master=root)
  54. canvas.draw()
  55. canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1)
  56. # tk.attributes('-topmost', True)
  57. if parent_window:
  58. parent_window.attributes('-topmost', False) # 取消主界面的顶层设置
  59. # 运行Tkinter主消息循环
  60. tk.mainloop()

运行结果截图

1.启动页面

启动页面

2.主页面

3.随机森林算法运行结果

4.特征重要性分析

5.不同算法比较-准确率

6.不同算法比较-精准率

7.不同算法比较-召回率

8.不同算法比较-F1值

9.不同算法比较-ROC AUC

代码打包:

我用夸克网盘分享了「study_fail_warning.zip」,点击链接即可保存。打开「夸克APP」,无需下载在线播放视频,畅享原画5倍速,支持电视投屏。
链接:https://pan.quark.cn/s/d0bc7e0b587a
提取码:MaxM

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/爱喝兽奶帝天荒/article/detail/878119
推荐阅读
相关标签
  

闽ICP备14008679号