基于分类算法的学习失败预警【可视化】

作者：爱喝兽奶帝天荒 | 2024-07-25 03:29:38

踩

基于分类算法的学习失败预警

实验要求

能够对数据进行预处理；能够使用随机森林算法构建学习失败预警模型；能够通过网格搜索方法对随机森林算法各项参数进行优化；能够应用scikit-learn中的支持向量机、逻辑回归和AdaBoost算法进行对比试验。

对数据进行预处理；
处理数据不平衡；
样本生成及标准化处理；
使用随机森林算法构建模型；
结果分析与可视化；
特征重要性分析；
与其他算法比较，并计算不同算法的准确率、查全率、F1值和AUC指标；
对几种模型的结果进行分析。

代码部分：

main.py


import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
from sklearn.svm import SVC
import tkinter as tk
from tkinter import Label, Button, Toplevel
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
import matplotlib.pyplot as plt
from pages import show_chart
 
# Display option: let pandas show up to 100 columns before truncating output
pd.options.display.max_columns = 100
 
# 数据加载与预处理
def load_and_preprocess_data():
    """Load ``uwide.csv`` and build the modelling table.

    Steps, in order:
      1. read the CSV as UTF-8;
      2. replace ``SEX`` with the integer codes produced by ``pd.factorize``;
      3. replace every missing value with 0;
      4. add the label column ``ssate`` (1 when ``TOTALSCORE`` >= 60, else 0);
      5. keep only the selected feature columns, with ``ssate`` last.

    Returns:
        DataFrame holding the 30 feature columns followed by ``ssate``.
    """
    # Feature columns kept for modelling (the label is appended on return).
    feature_columns = [
        'BROWSER_COUNT', 'COURSE_COUNT', 'COURSE_AVG_SCORE', 'EXAM_AH_SCORE',
        'EXAM_WRITEN_SCORE', 'EXAM_MIDDLE_SCORE', 'EXAM_LAB', 'EXAM_PROGRESS',
        'EXAM_GROUP_SCORE', 'EXAM_FACE_SCORE', 'EXAM_ONLINE_SCORE',
        'NODEBB_CHANNEL_COUNT', 'NODEBB_TOPIC_COUNT', 'COURSE_SUM_VIDEO_LEN',
        'SEX', 'GRADE', 'EXAM_HOMEWORK', 'EXAM_LABSCORE', 'EXAM_OTHERSCORE',
        'NODEBB_PARTICIPATIONRATE', 'COURSE_WORKTIME', 'COURSE_WORKCOMPLETERATE',
        'NODEBB_POSTSCOUNT', 'NODEBB_NORMALBBSPOSTSCOUONT',
        'NODEBB_REALBBSARCHIVECOUNT', 'NORMALBBSARCHIVECOUNT', 'COURSE_WORKCOUNT',
        'HOMEWORKSCORE', 'WRITTENASSIGNMENTSCORE', 'MIDDLEASSIGNMENTSCORE',
    ]

    raw = pd.read_csv('uwide.csv', encoding='utf-8')

    # Encode the categorical SEX column as integer codes.
    sex_codes, _ = pd.factorize(raw['SEX'])
    raw['SEX'] = sex_codes

    # Missing values become 0 *before* the label is derived, so a missing
    # TOTALSCORE is labelled as a failure (0 < 60).
    raw = raw.fillna(0)
    raw['ssate'] = np.where(raw['TOTALSCORE'] >= 60, 1, 0)

    return raw[feature_columns + ['ssate']]
 
def balance_data(df):
    """Reduce class imbalance by down-sampling the majority (passing) class.

    Rows with ``ssate == 1`` are treated as the majority class and rows with
    ``ssate == 0`` as the minority class. When the majority class has more
    than 8 times as many rows as the minority class, it is randomly
    down-sampled (without replacement, ``random_state=66``) to exactly
    8 times the minority size; otherwise it is left untouched.

    Args:
        df: DataFrame containing an ``ssate`` label column.

    Returns:
        DataFrame with the (possibly down-sampled) majority rows first,
        followed by all minority rows.
    """
    passed = df[df.ssate == 1]
    failed = df[df.ssate == 0]

    # Largest majority size allowed: 8 majority rows per minority row.
    cap = len(failed) * 8
    if len(passed) > cap:
        passed = resample(passed, replace=False, n_samples=cap, random_state=66)

    return pd.concat([passed, failed])
 
def split_and_scale_data(df):
    """Split into train/test sets, oversample the training set, and standardize.

    The last column of ``df`` is the label; all other columns are features.
    The data is split 70/30 (``random_state=27``). SMOTE is applied to the
    training portion only, and the ``StandardScaler`` is fitted on that
    resampled training data and then applied to both portions.

    Args:
        df: DataFrame whose last column is the label.

    Returns:
        Tuple ``(X_train_res, X_test, Y_train_res, Y_test)`` where the two
        feature arrays are standardized and the training arrays are
        SMOTE-resampled.
    """
    features = df.iloc[:, :-1].values
    labels = df.iloc[:, -1].values

    X_train, X_test, Y_train, Y_test = train_test_split(
        features, labels, test_size=0.3, random_state=27
    )

    # Synthetic minority oversampling, on the training split only.
    X_train_res, Y_train_res = SMOTE(random_state=27).fit_resample(X_train, Y_train)

    # Learn mean/variance from the resampled training data, reuse for the test set.
    scaler = StandardScaler().fit(X_train_res)
    return scaler.transform(X_train_res), scaler.transform(X_test), Y_train_res, Y_test
 
# 模型训练与评价
def evaluate_model(clf, X_train, Y_train, X_test, Y_test):
    """Fit ``clf`` on the training data and score it on the test data.

    Args:
        clf: Unfitted classifier exposing ``fit`` and ``predict``.
        X_train, Y_train: Training features and labels.
        X_test, Y_test: Held-out features and labels.

    Returns:
        Dict with the fitted estimator under ``"model"`` and the test-set
        metrics ``"accuracy"``, ``"recall"``, ``"precision"``, ``"f1_score"``
        and ``"roc_auc"``.
    """
    clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)

    # ROC AUC needs a continuous ranking score. Prefer the second column of
    # predict_proba; otherwise use the un-thresholded decision_function
    # values. Hard predictions are only a last resort, because they collapse
    # the ROC curve to a single operating point.
    if hasattr(clf, "predict_proba"):
        Y_score = clf.predict_proba(X_test)[:, 1]
    elif hasattr(clf, "decision_function"):
        Y_score = clf.decision_function(X_test)
    else:
        Y_score = Y_pred

    return {
        "model": clf,  # fitted estimator, reused later for feature importances
        "accuracy": accuracy_score(Y_test, Y_pred),
        # share of actual positives that were predicted positive
        "recall": recall_score(Y_test, Y_pred),
        # share of predicted positives that are actually positive
        "precision": precision_score(Y_test, Y_pred),
        # harmonic mean of precision and recall
        "f1_score": f1_score(Y_test, Y_pred),
        "roc_auc": roc_auc_score(Y_test, Y_score),
    }
 
# 计算特征重要性
def calculate_feature_importance(clf, feature_names):
    """Pair each feature name with its importance, most important first.

    Args:
        clf: Fitted estimator exposing a ``feature_importances_`` array.
        feature_names: Indexable sequence of names aligned with that array.

    Returns:
        List of ``(feature_name, importance)`` tuples sorted by importance
        in descending order.
    """
    importances = clf.feature_importances_
    # argsort is ascending; reverse it to get positions from most to least important.
    ranked = np.argsort(importances)[::-1]
    return [
        (feature_names[ranked[pos]], importances[ranked[pos]])
        for pos in range(len(feature_names))
    ]
 
# 显示特征重要性图表
def visualize_feature_importance(feature_importances, parent_window=None):
    """Show a horizontal bar chart of feature importances in a new window.

    Args:
        feature_importances: Sequence of ``(feature_name, importance)`` pairs,
            drawn in the given order.
        parent_window: Optional Tk window that owns the new window. When
            given, its ``-topmost`` flag is cleared so the chart is not hidden
            behind it, and its already-running event loop drives the new
            window. When omitted, this function starts its own event loop and
            blocks until it ends.
    """
    # Function-scope import: the figure is built directly instead of through
    # pyplot. pyplot keeps every figure it creates alive in its global
    # registry until plt.close() is called, so each click used to leak one.
    from matplotlib.figure import Figure

    new_window = Toplevel(parent_window)
    new_window.title("Feature Importance Visualization")
    window_width = 1400
    window_height = 800

    # Centre the window on the screen, nudged 50 px upwards.
    window_x = (new_window.winfo_screenwidth() - window_width) // 2
    window_y = (new_window.winfo_screenheight() - window_height) // 2 - 50

    new_window.geometry(f"{window_width}x{window_height}+{window_x}+{window_y}")
    new_window.configure(bg="AliceBlue")
    new_window.resizable(True, True)

    names = [name for name, _ in feature_importances]
    scores = [score for _, score in feature_importances]

    fig = Figure(figsize=(12, 8))
    ax = fig.add_subplot(111)
    y_pos = range(len(feature_importances))
    ax.barh(y_pos, scores, align="center")
    ax.set_yticks(y_pos)
    ax.set_yticklabels(names, fontsize=10)
    ax.set_xlabel("Feature Importance")
    ax.set_ylabel("Feature")
    ax.set_title("Random Forest Feature Importance")

    # Embed the figure in the Tk window; lay it out once a real canvas exists.
    canvas = FigureCanvasTkAgg(fig, master=new_window)
    fig.tight_layout()
    canvas.draw()
    canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1)

    if parent_window is not None:
        # Let the chart appear above the main window. No nested mainloop is
        # started here: the parent's loop already dispatches events.
        parent_window.attributes('-topmost', False)
    else:
        new_window.mainloop()
 
# 显示随机森林结果可视化
def visualize_rf_results(results, parent_window=None):
    """Show the Random Forest accuracy/recall/precision/F1 in a 2x2 chart grid.

    Args:
        results: Mapping of model name to metric dict; only
            ``results["Random Forest"]`` is read, using the keys
            ``"accuracy"``, ``"recall"``, ``"precision"`` and ``"f1_score"``.
        parent_window: Optional Tk window that owns the new window. When
            given, its ``-topmost`` flag is cleared and its already-running
            event loop drives the new window. When omitted, this function
            starts its own event loop and blocks until it ends.
    """
    # Function-scope import: the figure is built directly instead of through
    # pyplot. pyplot keeps every figure it creates alive in its global
    # registry until plt.close() is called, so each click used to leak one.
    from matplotlib.figure import Figure

    new_window = Toplevel(parent_window)
    new_window.title("Random Forest Model Results Visualization")
    window_width = 1400
    window_height = 900

    # Centre the window on the screen, nudged 50 px upwards.
    window_x = (new_window.winfo_screenwidth() - window_width) // 2
    window_y = (new_window.winfo_screenheight() - window_height) // 2 - 50

    new_window.geometry(f"{window_width}x{window_height}+{window_x}+{window_y}")
    new_window.configure(bg="AliceBlue")
    new_window.resizable(False, False)

    fig = Figure(figsize=(12, 10))
    axes = fig.subplots(2, 2)
    rf_scores = results["Random Forest"]

    metrics = ["accuracy", "recall", "precision", "f1_score"]
    # axes.flat walks the grid row by row, i.e. position i -> (i // 2, i % 2).
    for axis, metric in zip(axes.flat, metrics):
        axis.bar(["Random Forest"], [rf_scores[metric]], width=0.4)
        axis.set_title(metric.capitalize(), fontsize=14)

        # Write the value, to 8 decimals, 5 points above the top of each bar.
        for bar in axis.patches:
            axis.annotate(f"{bar.get_height():.8f}",
                          (bar.get_x() + bar.get_width() / 2, bar.get_height()),
                          ha='center', va='center', xytext=(0, 5),
                          textcoords='offset points')

    # Embed the figure in the Tk window; lay it out once a real canvas exists.
    canvas = FigureCanvasTkAgg(fig, master=new_window)
    fig.tight_layout()
    canvas.draw()
    canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1)

    if parent_window is not None:
        # Let the chart appear above the main window. No nested mainloop is
        # started here: the parent's loop already dispatches events.
        parent_window.attributes('-topmost', False)
    else:
        new_window.mainloop()
 
# 显示欢迎页面
def show_welcome_screen():
    """Show a borderless splash window with ``logo.png`` for two seconds.

    The window is sized to the image, centred on the screen and kept on top.
    The call blocks until the window destroys itself after 2000 ms.
    """
    splash = tk.Tk()
    splash.overrideredirect(True)  # no title bar or borders

    # The local reference keeps the image alive while the event loop runs.
    logo = tk.PhotoImage(file='logo.png')
    tk.Label(splash, image=logo).pack()

    # Size the window to the image and centre it on the screen.
    logo_w, logo_h = logo.width(), logo.height()
    left = (splash.winfo_screenwidth() - logo_w) // 2
    top = (splash.winfo_screenheight() - logo_h) // 2
    splash.geometry(f"{logo_w}x{logo_h}+{left}+{top}")

    splash.attributes('-topmost', True)
    splash.after(2000, splash.destroy)  # close the splash after 2 seconds

    splash.mainloop()
 
# 显示主页面
def show_main_window(results, X_train, feature_names):
    """Build the main window and run its event loop.

    The window holds explanatory labels, three arrow images, five buttons
    that open a per-metric comparison chart across all models, and two
    buttons that open the Random Forest feature-importance and result charts.

    Args:
        results: Mapping of model name to the dict returned by
            ``evaluate_model``.
        X_train: Accepted for the caller's convenience; not used here.
        feature_names: Feature names passed on to
            ``calculate_feature_importance``.
    """
    background = "AliceBlue"
    win_w, win_h = 1200, 600

    mainwindow = tk.Tk()
    mainwindow.title("主页面")

    # Centre the window on the screen, nudged 50 px upwards.
    left = (mainwindow.winfo_screenwidth() - win_w) // 2
    top = (mainwindow.winfo_screenheight() - win_h) // 2 - 50
    mainwindow.geometry(f"{win_w}x{win_h}+{left}+{top}")
    mainwindow.configure(bg=background)
    mainwindow.resizable(False, False)

    def place_text(text, font, height, width, x, y):
        # Text label at an absolute position.
        Label(mainwindow, text=text, height=height, width=width,
              bg=background, font=font).place(x=x, y=y)

    def place_arrow(image_file, x, y):
        # Arrow image shrunk to one third; the caller must keep the returned
        # image referenced so it is not garbage-collected while displayed.
        arrow = tk.PhotoImage(file=image_file).subsample(3, 3)
        Label(mainwindow, height=65, width=50, bg=background, image=arrow).place(x=x, y=y)
        return arrow

    # Widgets are created in this exact order so overlapping labels keep
    # their stacking order.
    place_text('本代码实现了随机森林、支持向量机、逻辑回归和AdaBoost算法',
               ('微软雅黑 20 bold'), 10, 50, 165, -110)
    place_text('点击查看不同算法性能对比', ('微软雅黑 15 bold'), 5, 20, 110, 370)
    arrow_images = [place_arrow('向下箭头.png', 70, 420)]
    place_text('点击查看随机森林训练结果', ('微软雅黑 20 bold'), 5, 20, 365, 100)
    arrow_images.append(place_arrow('右箭头.png', 715, 160))
    place_text('特征重要性分析', ('微软雅黑 20 bold'), 5, 20, 470, 228)
    arrow_images.append(place_arrow('左箭头.png', 467, 290))

    # One button per metric along the bottom row: (caption, result key, x).
    metric_specs = [
        ("Accuracy", "accuracy", 40),
        ("Recall", "recall", 275),
        ("Precision", "precision", 510),
        ("F1 Score", "f1_score", 745),
        ("ROC AUC", "roc_auc", 980),
    ]
    # Collect every model's value for each metric before creating any button.
    metric_buttons = [
        (caption, [results[model][key] for model in results], x)
        for caption, key, x in metric_specs
    ]
    for caption, values, x in metric_buttons:
        # Default arguments bind this iteration's values to the callback.
        Button(mainwindow, text=caption, font=('微软雅黑 20 bold'), height=1, width=9,
               command=lambda d=values, t=caption.lower(): show_chart(t, d, mainwindow)
               ).place(x=x, y=500)

    def open_feature_importance():
        ranking = calculate_feature_importance(results["Random Forest"]["model"], feature_names)
        visualize_feature_importance(ranking, mainwindow)

    def open_rf_results():
        visualize_rf_results(results, mainwindow)

    # The two wider buttons for the Random Forest specific views.
    for caption, handler, x, y in (
        ("Feature Importance", open_feature_importance, 160, 290),
        ("RF Results", open_rf_results, 795, 160),
    ):
        Button(mainwindow, text=caption, font=('微软雅黑 20 bold'), height=1, width=16,
               command=handler).place(x=x, y=y)

    mainwindow.attributes('-topmost', True)
    mainwindow.mainloop()
 
 
# 主函数
if __name__ == "__main__":
    # Data pipeline: load -> down-sample the majority class -> split/SMOTE/scale.
    df = load_and_preprocess_data()
    df_balanced = balance_data(df)
    X_train, X_test, Y_train, Y_test = split_and_scale_data(df_balanced)

    # Classifiers to compare; AdaBoost boosts depth-3 decision trees.
    boosted_tree = DecisionTreeClassifier(max_depth=3)
    models = {
        "Random Forest": RandomForestClassifier(criterion='entropy', random_state=27, n_jobs=-1),
        "Support Vector Machine": SVC(probability=True, random_state=27),
        "Logistic Regression": LogisticRegression(),
        "AdaBoost": AdaBoostClassifier(boosted_tree, n_estimators=500, learning_rate=0.5, algorithm='SAMME'),
    }

    # Fit and score every model, keyed by its display name.
    results = {
        name: evaluate_model(clf, X_train, Y_train, X_test, Y_test)
        for name, clf in models.items()
    }

    # Every column except the trailing label column is a feature.
    feature_names = df.columns[:-1]

    show_welcome_screen()
    show_main_window(results, X_train, feature_names)

pages.py


import tkinter as tk
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
import matplotlib.pyplot as plt
import numpy as np
 
 
def show_chart(chart_type, shuju, parent_window=None):
    """Show a bar chart comparing one metric across the four algorithms.

    Args:
        chart_type: Metric name, used for the y-axis label, chart title,
            legend entry and window title.
        shuju: The metric values, one per algorithm, in the order Random
            Forest, Support Vector Machine, Logistic Regression, AdaBoost.
        parent_window: Optional Tk window that owns the chart window. When
            given, the chart opens as a ``Toplevel`` of that window, the
            parent's ``-topmost`` flag is cleared, and the parent's running
            event loop drives the chart. When omitted, a standalone Tk root
            is created and this function blocks in its event loop.
    """
    # Function-scope import: the figure is built directly instead of through
    # pyplot. pyplot keeps every figure it creates alive in its global
    # registry until plt.close() is called, so each click used to leak one.
    from matplotlib.figure import Figure

    # Algorithm names shown on the x axis.
    algorithms = ['Random Forest', 'Support Vector Machine', 'Logistic Regression', 'AdaBoost']

    # Bar positions and width.
    x = np.arange(len(algorithms))
    width = 0.35

    fig = Figure(figsize=(10, 6))
    ax = fig.add_subplot(111)

    rects1 = ax.bar(x, shuju, width, color=['gray', 'pink', 'orange', 'gold'])

    # Axis labels, title and x tick labels.
    ax.set_xlabel('Algorithms')
    ax.set_ylabel(chart_type)
    ax.set_title(f'Comparison of Different Algorithms {chart_type}')
    ax.set_xticks(x)
    ax.set_xticklabels(algorithms)

    # Legend with a white box, black border and black text.
    custom_legend = ax.legend([rects1], [chart_type], loc='lower center', fontsize=10, frameon=True,
                              facecolor='white', edgecolor='black')
    for text in custom_legend.get_texts():
        text.set_color('black')

    def autolabel(rects):
        """Write each bar's height just above the bar."""
        for rect in rects:
            height = rect.get_height()
            ax.annotate('{}'.format(height),
                        xy=(rect.get_x() + rect.get_width() / 2, height),
                        xytext=(0, 3),
                        textcoords="offset points",
                        ha='center', va='bottom')

    autolabel(rects1)

    # Open a child window when a parent exists, instead of creating a second
    # Tk root in the same application; fall back to a standalone root.
    if parent_window is None:
        root = tk.Tk()
    else:
        root = tk.Toplevel(parent_window)
    root.title(f"算法{chart_type}对比")

    window_width = 1200
    window_height = 600

    # Centre the window on the screen, nudged 50 px upwards.
    window_x = (root.winfo_screenwidth() - window_width) // 2
    window_y = (root.winfo_screenheight() - window_height) // 2 - 50

    root.geometry(f"{window_width}x{window_height}+{window_x}+{window_y}")
    root.configure(bg="AliceBlue")
    root.resizable(False, False)

    # Embed the figure in the Tk window; lay it out once a real canvas exists.
    canvas = FigureCanvasTkAgg(fig, master=root)
    fig.tight_layout()
    canvas.draw()
    canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1)

    if parent_window is not None:
        # Let the chart appear above the main window. No nested mainloop is
        # started here: the parent's loop already dispatches events.
        parent_window.attributes('-topmost', False)
    else:
        root.mainloop()