赞
踩
能够对数据进行预处理;能够使用随机森林算法构建学习失败预警模型;能够通过网格搜索方法对随机森林算法各项参数进行优化;能够应用scikit-learn中的支持向量机、逻辑回归和AdaBoost算法进行对比试验。
- 对数据进行预处理;
- 处理数据不平衡;
- 样本生成及标准化处理;
- 使用随机森林算法构建模型;
- 结果分析与可视化;
- 特征重要性分析;
- 与其他算法比较,并计算不同算法的准确率、查全率(召回率)、精确率、F1值和AUC指标;
- 对几种模型的结果进行分析。
- import pandas as pd
- import numpy as np
- from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
- from sklearn.linear_model import LogisticRegression
- from sklearn.metrics import recall_score, precision_score, accuracy_score, roc_auc_score, f1_score
- from sklearn.model_selection import train_test_split
- from sklearn.preprocessing import StandardScaler
- from imblearn.over_sampling import SMOTE
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.utils import resample
- from sklearn.svm import SVC
- import tkinter as tk
- from tkinter import Label, Button, Toplevel
- from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
- import matplotlib.pyplot as plt
- from pages import show_chart
-
- # Display options: let pandas show up to 100 columns when printing a DataFrame.
- pd.set_option('display.max_columns', 100)
-
def load_and_preprocess_data(path='uwide.csv', pass_score=60):
    """Load the learner-activity CSV and build the modelling table.

    The function integer-encodes ``SEX``, replaces every missing value with
    0, derives the binary label ``ssate`` and keeps only the feature columns
    followed by the label.

    Args:
        path: CSV file to read (decoded as UTF-8). Defaults to ``'uwide.csv'``.
        pass_score: Lowest ``TOTALSCORE`` that counts as a pass. Defaults to 60.

    Returns:
        pandas.DataFrame with the feature columns first and ``ssate`` last
        (1 when ``TOTALSCORE >= pass_score``, otherwise 0).
    """
    # Column names are copied verbatim from the data set (including the
    # spelling 'NODEBB_NORMALBBSPOSTSCOUONT'); they must match the CSV header.
    feature_columns = [
        'BROWSER_COUNT', 'COURSE_COUNT', 'COURSE_AVG_SCORE', 'EXAM_AH_SCORE',
        'EXAM_WRITEN_SCORE', 'EXAM_MIDDLE_SCORE', 'EXAM_LAB', 'EXAM_PROGRESS',
        'EXAM_GROUP_SCORE', 'EXAM_FACE_SCORE', 'EXAM_ONLINE_SCORE',
        'NODEBB_CHANNEL_COUNT', 'NODEBB_TOPIC_COUNT', 'COURSE_SUM_VIDEO_LEN',
        'SEX', 'GRADE', 'EXAM_HOMEWORK', 'EXAM_LABSCORE', 'EXAM_OTHERSCORE',
        'NODEBB_PARTICIPATIONRATE', 'COURSE_WORKTIME',
        'COURSE_WORKCOMPLETERATE', 'NODEBB_POSTSCOUNT',
        'NODEBB_NORMALBBSPOSTSCOUONT', 'NODEBB_REALBBSARCHIVECOUNT',
        'NORMALBBSARCHIVECOUNT', 'COURSE_WORKCOUNT', 'HOMEWORKSCORE',
        'WRITTENASSIGNMENTSCORE', 'MIDDLEASSIGNMENTSCORE',
    ]

    df = pd.read_csv(path, encoding='utf-8')

    # factorize() returns (codes, uniques); only the integer codes are kept.
    df['SEX'] = pd.factorize(df['SEX'])[0]

    # Missing values become 0 *before* the label is derived, so a row with a
    # missing TOTALSCORE is labelled as a fail (0).
    df = df.fillna(0)
    df['ssate'] = np.where(df['TOTALSCORE'] >= pass_score, 1, 0)

    return df[feature_columns + ['ssate']]
-
def balance_data(df, max_ratio=8, random_state=66):
    """Down-sample the ``ssate == 1`` rows so they do not swamp ``ssate == 0``.

    Rows with ``ssate == 1`` are treated as the majority class. When there are
    more than ``max_ratio`` times as many of them as ``ssate == 0`` rows, a
    random subset of exactly ``max_ratio * len(minority)`` rows is drawn
    without replacement; otherwise all rows are kept.

    Args:
        df: DataFrame containing a binary ``ssate`` column.
        max_ratio: Largest allowed majority:minority ratio. Defaults to 8.
        random_state: Seed passed to ``resample`` for reproducibility.
            Defaults to 66.

    Returns:
        pandas.DataFrame with the (possibly down-sampled) majority rows
        followed by all minority rows; the original index is preserved.
    """
    df_major = df[df['ssate'] == 1]
    df_minor = df[df['ssate'] == 0]

    major_cap = len(df_minor) * max_ratio
    if len(df_major) > major_cap:
        # Only shrink the majority class when it exceeds the allowed ratio.
        df_major = resample(df_major, replace=False, n_samples=major_cap,
                            random_state=random_state)

    return pd.concat([df_major, df_minor])
-
def split_and_scale_data(df):
    """Split into train/test sets, oversample the training set and standardise.

    The last column of ``df`` is used as the label and all preceding columns
    as features. 30% of the rows are held out for testing. SMOTE is applied to
    the training portion only, and the scaler is fitted on the resampled
    training data and then reused for the test data.

    Returns:
        Tuple ``(X_train_res, X_test, Y_train_res, Y_test)``.
    """
    features = df.iloc[:, :-1].values
    labels = df.iloc[:, -1].values

    X_train, X_test, Y_train, Y_test = train_test_split(
        features, labels, test_size=0.3, random_state=27
    )

    # Synthetic minority oversampling, applied to the training split only.
    X_train_res, Y_train_res = SMOTE(random_state=27).fit_resample(X_train, Y_train)

    # Fit the scaler on the training data, then apply it to both splits.
    scaler = StandardScaler().fit(X_train_res)
    return scaler.transform(X_train_res), scaler.transform(X_test), Y_train_res, Y_test
-
def evaluate_model(clf, X_train, Y_train, X_test, Y_test):
    """Fit ``clf`` on the training data and score it on the test data.

    Args:
        clf: Unfitted estimator exposing ``fit`` and ``predict``.
        X_train, Y_train: Training features and labels.
        X_test, Y_test: Held-out features and labels.

    Returns:
        dict with the fitted estimator under ``"model"`` and the metrics
        ``"accuracy"``, ``"recall"``, ``"precision"``, ``"f1_score"`` and
        ``"roc_auc"``.
    """
    clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)

    # ROC AUC needs a continuous score. Prefer the positive-class probability,
    # then the decision-function margin; hard labels are only a last resort
    # because they reduce the ROC curve to a single operating point.
    if hasattr(clf, "predict_proba"):
        Y_score = clf.predict_proba(X_test)[:, 1]
    elif hasattr(clf, "decision_function"):
        Y_score = clf.decision_function(X_test)
    else:
        Y_score = Y_pred

    return {
        "model": clf,  # the fitted estimator itself
        "accuracy": accuracy_score(Y_test, Y_pred),
        # Share of actual positives that were predicted positive.
        "recall": recall_score(Y_test, Y_pred),
        # Share of predicted positives that are actually positive.
        "precision": precision_score(Y_test, Y_pred),
        # Harmonic mean of precision and recall.
        "f1_score": f1_score(Y_test, Y_pred),
        "roc_auc": roc_auc_score(Y_test, Y_score),
    }
-
def calculate_feature_importance(clf, feature_names):
    """Pair each feature name with its importance, most important first.

    Args:
        clf: Fitted estimator exposing ``feature_importances_``.
        feature_names: Sequence of names, positionally aligned with
            ``clf.feature_importances_``.

    Returns:
        list of ``(name, importance)`` tuples sorted by descending importance.

    Raises:
        ValueError: If the number of names differs from the number of
            importances (the pairing would otherwise be wrong or fail with an
            opaque index error).
    """
    importances = clf.feature_importances_
    if len(feature_names) != len(importances):
        raise ValueError(
            f"expected {len(importances)} feature names, got {len(feature_names)}"
        )

    # argsort is ascending; reverse it to rank from most to least important.
    ranked = np.argsort(importances)[::-1]
    return [(feature_names[i], importances[i]) for i in ranked]
-
def visualize_feature_importance(feature_importances, parent_window=None):
    """Show the feature importances as a horizontal bar chart in a new window.

    Args:
        feature_importances: Sequence of ``(name, importance)`` pairs, drawn in
            the given order starting at y position 0.
        parent_window: Optional Tk window that opened this chart. When given,
            the chart window is created as its child, its ``-topmost`` flag is
            cleared so the chart is not hidden behind it, and no additional
            event loop is started. When omitted, the function runs its own
            ``mainloop()`` as before.
    """
    # Imported here so the block does not depend on a new file-level import.
    # Building the figure with Figure instead of pyplot keeps it out of
    # pyplot's global registry, so it is released with the window instead of
    # accumulating on every click.
    from matplotlib.figure import Figure

    new_window = Toplevel(parent_window)
    new_window.title("Feature Importance Visualization")
    window_width = 1400
    window_height = 800

    # Centre the window on screen, shifted up by 50 px.
    window_x = (new_window.winfo_screenwidth() - window_width) // 2
    window_y = (new_window.winfo_screenheight() - window_height) // 2 - 50
    new_window.geometry(f"{window_width}x{window_height}+{window_x}+{window_y}")
    new_window.configure(bg="AliceBlue")
    new_window.resizable(True, True)

    fig = Figure(figsize=(12, 8))
    ax = fig.add_subplot(111)
    y_pos = range(len(feature_importances))
    ax.barh(y_pos, [imp[1] for imp in feature_importances], align="center")
    ax.set_yticks(y_pos)
    ax.set_yticklabels([imp[0] for imp in feature_importances], fontsize=10)
    ax.set_xlabel("Feature Importance")
    ax.set_ylabel("Feature")
    ax.set_title("Random Forest Feature Importance")

    # Embed the figure in the Tk window and let it fill the available space.
    canvas = FigureCanvasTkAgg(fig, master=new_window)
    fig.tight_layout()  # keep the long tick labels inside the figure
    canvas.draw()
    canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1)

    if parent_window:
        # The parent's event loop already drives this window; starting a
        # nested mainloop() here would not return until the application quits.
        parent_window.attributes('-topmost', False)
        return

    new_window.mainloop()
-
def visualize_rf_results(results, parent_window=None):
    """Show accuracy, recall, precision and F1 of the random forest in a 2x2 grid.

    Args:
        results: Mapping that contains a ``"Random Forest"`` entry whose value
            maps ``"accuracy"``, ``"recall"``, ``"precision"`` and
            ``"f1_score"`` to numbers.
        parent_window: Optional Tk window that opened this chart. When given,
            the chart window is created as its child, its ``-topmost`` flag is
            cleared so the chart is not hidden behind it, and no additional
            event loop is started. When omitted, the function runs its own
            ``mainloop()`` as before.
    """
    # Imported here so the block does not depend on a new file-level import.
    # Building the figure with Figure instead of pyplot keeps it out of
    # pyplot's global registry, so it is released with the window instead of
    # accumulating on every click.
    from matplotlib.figure import Figure

    new_window = Toplevel(parent_window)
    new_window.title("Random Forest Model Results Visualization")
    window_width = 1400
    window_height = 900

    # Centre the window on screen, shifted up by 50 px.
    window_x = (new_window.winfo_screenwidth() - window_width) // 2
    window_y = (new_window.winfo_screenheight() - window_height) // 2 - 50
    new_window.geometry(f"{window_width}x{window_height}+{window_x}+{window_y}")
    new_window.configure(bg="AliceBlue")
    new_window.resizable(False, False)

    fig = Figure(figsize=(12, 10))
    axes = fig.subplots(2, 2)

    metrics = ["accuracy", "recall", "precision", "f1_score"]
    for i, metric in enumerate(metrics):
        cell = axes[i // 2, i % 2]  # fill the grid row by row
        cell.bar(["Random Forest"], [results["Random Forest"][metric]], width=0.4)
        cell.set_title(metric.capitalize(), fontsize=14)

        # Print the value (8 decimals) just above the top of each bar.
        for bar in cell.patches:
            cell.annotate(f"{bar.get_height():.8f}",
                          (bar.get_x() + bar.get_width() / 2, bar.get_height()),
                          ha='center', va='center', xytext=(0, 5),
                          textcoords='offset points')

    canvas = FigureCanvasTkAgg(fig, master=new_window)
    fig.tight_layout()
    canvas.draw()
    canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1)

    if parent_window:
        # The parent's event loop already drives this window; starting a
        # nested mainloop() here would not return until the application quits.
        parent_window.attributes('-topmost', False)
        return

    new_window.mainloop()
-
def show_welcome_screen():
    """Show ``logo.png`` as a borderless splash window for two seconds.

    The window is sized to the image, centred on screen and kept above other
    windows. The call blocks until the splash window has been destroyed.
    """
    splash = tk.Tk()
    splash.overrideredirect(True)  # no title bar or border

    logo = tk.PhotoImage(file='logo.png')
    tk.Label(splash, image=logo).pack()

    # Size the window to the image and centre it on screen.
    width, height = logo.width(), logo.height()
    left = (splash.winfo_screenwidth() - width) // 2
    top = (splash.winfo_screenheight() - height) // 2
    splash.geometry(f"{width}x{height}+{left}+{top}")

    splash.attributes('-topmost', True)
    splash.after(2000, splash.destroy)  # close automatically after 2 s
    splash.mainloop()
-
def show_main_window(results, X_train, feature_names):
    """Build the main menu window and run its event loop.

    Args:
        results: Mapping of model name to the metric dict produced for that
            model; must contain a ``"Random Forest"`` entry with a ``"model"``.
        X_train: Accepted for interface compatibility; not used here.
        feature_names: Feature names passed on to the importance chart.
    """
    background = "AliceBlue"
    big_font = ('微软雅黑 20 bold')
    win_w, win_h = 1200, 600

    mainwindow = tk.Tk()
    mainwindow.title("主页面")

    # Centre the window on screen, shifted up by 50 px.
    left = (mainwindow.winfo_screenwidth() - win_w) // 2
    top = (mainwindow.winfo_screenheight() - win_h) // 2 - 50
    mainwindow.geometry(f"{win_w}x{win_h}+{left}+{top}")
    mainwindow.configure(bg=background)
    mainwindow.resizable(False, False)

    arrow_images = []  # keep references alive while the window exists

    def add_caption(text, height, width, font, x, y):
        """Place a text label at (x, y)."""
        caption = Label(mainwindow, text=text, height=height, width=width,
                        bg=background, font=font)
        caption.place(x=x, y=y)

    def add_arrow(image_file, x, y):
        """Place an arrow image, shrunk to one third, at (x, y)."""
        image = tk.PhotoImage(file=image_file).subsample(3, 3)
        arrow_images.append(image)
        arrow = Label(mainwindow, height=65, width=50, bg=background, image=image)
        arrow.place(x=x, y=y)

    # The labels overlap, so they are created in the original order to keep
    # the same stacking.
    add_caption('本代码实现了随机森林、支持向量机、逻辑回归和AdaBoost算法', 10, 50, big_font, 165, -110)
    add_caption('点击查看不同算法性能对比', 5, 20, ('微软雅黑 15 bold'), 110, 370)
    add_arrow('向下箭头.png', 70, 420)
    add_caption('点击查看随机森林训练结果', 5, 20, big_font, 365, 100)
    add_arrow('右箭头.png', 715, 160)
    add_caption('特征重要性分析', 5, 20, big_font, 470, 228)
    add_arrow('左箭头.png', 467, 290)

    # One comparison button per metric along the bottom row; the per-model
    # values are collected now, in the iteration order of `results`.
    metric_buttons = [
        ("Accuracy", "accuracy", 40),
        ("Recall", "recall", 275),
        ("Precision", "precision", 510),
        ("F1 Score", "f1_score", 745),
        ("ROC AUC", "roc_auc", 980),
    ]
    for caption, key, x in metric_buttons:
        values = [results[model][key] for model in results]
        compare_btn = Button(
            mainwindow, text=caption, font=big_font, height=1, width=9,
            command=lambda d=values, t=caption.lower(): show_chart(t, d, mainwindow))
        compare_btn.place(x=x, y=500)

    def open_feature_importance():
        visualize_feature_importance(
            calculate_feature_importance(results["Random Forest"]["model"], feature_names),
            mainwindow)

    def open_rf_results():
        visualize_rf_results(results, mainwindow)

    # Wider buttons that open the random-forest specific views.
    action_buttons = [
        ("Feature Importance", open_feature_importance, 160, 290),
        ("RF Results", open_rf_results, 795, 160),
    ]
    for caption, action, x, y in action_buttons:
        action_btn = Button(mainwindow, text=caption, font=big_font, height=1,
                            width=16, command=action)
        action_btn.place(x=x, y=y)

    mainwindow.attributes('-topmost', True)
    mainwindow.mainloop()
-
-
# Script entry point: prepare the data, train and score every model, then
# open the GUI.
if __name__ == "__main__":
    df = load_and_preprocess_data()
    feature_names = df.columns[:-1]  # every column except the label

    df_balanced = balance_data(df)
    X_train, X_test, Y_train, Y_test = split_and_scale_data(df_balanced)

    random_forest = RandomForestClassifier(criterion='entropy', random_state=27, n_jobs=-1)
    svm = SVC(probability=True, random_state=27)
    logistic = LogisticRegression()
    adaboost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=500,
                                  learning_rate=0.5, algorithm='SAMME')
    models = {
        "Random Forest": random_forest,
        "Support Vector Machine": svm,
        "Logistic Regression": logistic,
        "AdaBoost": adaboost,
    }

    # Train and evaluate each model in the order listed above.
    results = {
        name: evaluate_model(clf, X_train, Y_train, X_test, Y_test)
        for name, clf in models.items()
    }

    show_welcome_screen()
    show_main_window(results, X_train, feature_names)
- import tkinter as tk
- from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
- import matplotlib.pyplot as plt
- import numpy as np
-
-
def show_chart(chart_type, shuju, parent_window=None):
    """Show one metric for the four algorithms as a bar chart in its own window.

    Args:
        chart_type: Metric name; used for the y-axis label, legend and titles.
        shuju: Metric values, one per algorithm, in the order Random Forest,
            Support Vector Machine, Logistic Regression, AdaBoost.
        parent_window: Optional Tk window that opened this chart. When given,
            the chart opens as a child ``Toplevel`` of it (instead of a second
            ``Tk`` root), its ``-topmost`` flag is cleared so the chart is not
            hidden behind it, and no additional event loop is started. When
            omitted, a new root window is created and its ``mainloop()`` runs.
    """
    # Imported here so the block does not depend on a new file-level import.
    # Building the figure with Figure instead of pyplot keeps it out of
    # pyplot's global registry, so it is released with the window instead of
    # accumulating on every click.
    from matplotlib.figure import Figure

    algorithms = ['Random Forest', 'Support Vector Machine', 'Logistic Regression', 'AdaBoost']

    # Bar positions and width.
    x = np.arange(len(algorithms))
    width = 0.35

    fig = Figure(figsize=(10, 6))
    ax = fig.add_subplot(111)
    rects1 = ax.bar(x, shuju, width, color=['gray', 'pink', 'orange', 'gold'])

    ax.set_xlabel('Algorithms')
    ax.set_ylabel(chart_type)
    ax.set_title(f'Comparison of Different Algorithms {chart_type}')
    ax.set_xticks(x)
    ax.set_xticklabels(algorithms)

    # Legend on a white, black-framed box with black text.
    custom_legend = ax.legend([rects1], [chart_type], loc='lower center', fontsize=10,
                              frameon=True, facecolor='white', edgecolor='black')
    for text in custom_legend.get_texts():
        text.set_color('black')

    def autolabel(rects):
        """Write each bar's height just above the bar."""
        for rect in rects:
            height = rect.get_height()
            ax.annotate('{}'.format(height),
                        xy=(rect.get_x() + rect.get_width() / 2, height),
                        xytext=(0, 3),
                        textcoords="offset points",
                        ha='center', va='bottom')

    autolabel(rects1)

    # Reuse the caller's Tk interpreter when there is one; a second Tk() root
    # would be a separate interpreter with its own event loop.
    if parent_window is not None:
        root = tk.Toplevel(parent_window)
    else:
        root = tk.Tk()
    root.title(f"算法{chart_type}对比")

    window_width = 1200
    window_height = 600

    # Centre the window on screen, shifted up by 50 px.
    window_x = (root.winfo_screenwidth() - window_width) // 2
    window_y = (root.winfo_screenheight() - window_height) // 2 - 50
    root.geometry(f"{window_width}x{window_height}+{window_x}+{window_y}")
    root.configure(bg="AliceBlue")
    root.resizable(False, False)

    # Embed the figure in the Tk window.
    canvas = FigureCanvasTkAgg(fig, master=root)
    fig.tight_layout()
    canvas.draw()
    canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1)

    if parent_window is not None:
        # The parent's event loop already drives this window.
        parent_window.attributes('-topmost', False)
    else:
        root.mainloop()
1.启动页面
2.主页面
3.随机森林算法运行结果
4.特征重要性分析
5.不同算法比较-准确率
6.不同算法比较-精准率
7.不同算法比较-召回率
8.不同算法比较-F1值
9.不同算法比较-ROC AUC
代码打包:
我用夸克网盘分享了「study_fail_warning.zip」,点击链接即可保存。打开「夸克APP」,无需下载在线播放视频,畅享原画5倍速,支持电视投屏。
链接:https://pan.quark.cn/s/d0bc7e0b587a
提取码:MaxM
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。