赞
踩
决策树(decision tree)是一个树结构(可以是二叉树,也可以是非二叉树)。
其每个非叶节点表示对一个特征属性的测试,每个分支代表这个特征属性在某个值域上的输出,而每个叶节点存放一个类别。
使用决策树进行决策的过程,就是从根节点开始,测试待分类项中相应的特征属性,并按照其值选择输出分支,直到到达叶节点,将叶节点存放的类别作为决策结果。
总结来说,决策树模型的核心由下面几部分组成:
- 节点和有向边(分支);
- 节点分为内部节点(非叶节点)和叶节点两种类型;
- 内部节点表示对一个特征属性的测试,叶节点表示一个类别。
import numpy as np
import pandas as pd
# Load the NBA 2014 game log. The first pass uses pandas' default parsing so
# the raw layout can be inspected.
file = "NBA2014.csv"
data = pd.read_csv(file)
data.head(5)

# Second pass: parse column 0 as dates, then replace the header with readable
# column names. Note that no rows are skipped by this call.
data = pd.read_csv(file, parse_dates=[0])
data.columns = [
    "Date",
    "Start",
    "Visitor Team",
    "VisitorPts",
    "Home Team",
    "HomePts",
    "Score Type",
    "OT?",
    "Attend",
    "Notes",
]
data.head(5)

# Target label: True when the home side scored more points than the visitors.
data["Home Win"] = data["HomePts"] > data["VisitorPts"]
y_true = data["Home Win"].values
data.head(5)

# Baseline to beat: the share of games won by the home team.
print("Home Team Win Percentage: {0:.1f}%".format(y_true.mean() * 100))
# Feature engineering: did each side win its own previous game?
# NOTE(review): assumes the rows of `data` are in chronological order -- confirm.
from collections import defaultdict

# Most recent result per team. A team with no earlier game in the data is
# treated as "did not win", and stays boolean like the rest of the column.
won_last = defaultdict(bool)
home_last_win = []
visitor_last_win = []
for home_team, visitor_team, home_win in zip(
    data["Home Team"], data["Visitor Team"], data["Home Win"]
):
    # Read the state *before* this game so a row never sees its own result.
    home_last_win.append(won_last[home_team])
    visitor_last_win.append(won_last[visitor_team])
    # set the current win
    won_last[home_team] = bool(home_win)
    won_last[visitor_team] = not home_win

# Assign whole columns at once instead of writing every row back with
# `data.iloc[index] = row`, which mixed index labels with positions and
# rewrote every column of the row on each iteration.
data["HomeLastWin"] = home_last_win
data["VisitorLastWin"] = visitor_last_win
data.iloc[20:25]
# First model: a decision tree that only sees the two "won last game" flags.
from sklearn.tree import DecisionTreeClassifier

# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function
# lives in `sklearn.model_selection` since 0.18. Fall back for old installs.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import cross_val_score

clf = DecisionTreeClassifier(random_state=14)
# create the dataset
X_win = data[["HomeLastWin", "VisitorLastWin"]].values
# Cross-validated accuracy, using the library's default number of folds.
scores = cross_val_score(clf, X_win, y_true, scoring="accuracy")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
import chardet
# Detect the standings file's text encoding from its raw bytes, then load it.
file = "NBA2013_expanded-standings.csv"
with open(file, mode="rb") as raw_file:
    print(raw_file)
    # or readline if the file is large
    result = chardet.detect(raw_file.read())

# Row 0 of the file is skipped; the detected encoding is handed to pandas.
standings = pd.read_csv(file, encoding=result["encoding"], skiprows=[0])
# create a new feature: HomeTeamRankHigher
# The game log uses a team name that the standings table lists under its
# previous name, so map it before the lookup.
team_aliases = {"New Orleans Pelicans": "New Orleans Hornets"}

# Build the team -> rank lookup once instead of filtering the standings
# DataFrame twice per game.
# NOTE(review): assumes every team appears exactly once in `standings`.
rank_by_team = dict(zip(standings["Team"], standings["Rk"]))

home_team_rank_higher = []
for home_team, visitor_team in zip(data["Home Team"], data["Visitor Team"]):
    # A team missing from the standings raises KeyError naming that team.
    home_rank = rank_by_team[team_aliases.get(home_team, home_team)]
    visitor_rank = rank_by_team[team_aliases.get(visitor_team, visitor_team)]
    # NOTE(review): this flags a numerically larger `Rk` for the home team.
    # If Rk == 1 is the best team, the flag is inverted relative to its name;
    # kept as-is because a tree classifier is indifferent to the polarity.
    home_team_rank_higher.append(int(home_rank > visitor_rank))

data["HomeTeamRankHigher"] = home_team_rank_higher
data.iloc[:5]

# create the train set
X_homehigher = data[["HomeLastWin", "VisitorLastWin", "HomeTeamRankHigher"]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_homehigher, y_true, scoring="accuracy")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
# who won the last match between these two particular teams
from collections import defaultdict

# Maps an order-independent (team, team) pair to the winner of their most
# recent meeting. Unseen pairs yield 0, which never equals a team name.
last_match_winer = defaultdict(int)
home_team_won_last = []
for home_team, visitor_team, home_win in zip(
    data["Home Team"], data["Visitor Team"], data["Home Win"]
):
    # sort the team names so A-vs-B and B-vs-A share one key
    teams = tuple(sorted([home_team, visitor_team]))
    # who won the last game -- look up by the pair. The previous code compared
    # the whole dict to the team name (always False) and then overwrote the
    # dict with a string, so the feature tracked the winner of the previous
    # row rather than of the previous meeting.
    home_team_won_last.append(1 if last_match_winer[teams] == home_team else 0)
    last_match_winer[teams] = home_team if home_win else visitor_team

data["HomeTeamWonLast"] = home_team_won_last
data.iloc[:5]

# create the dataset
X_lastwinner = data[["HomeTeamRankHigher", "HomeTeamWonLast"]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_lastwinner, y_true, scoring="accuracy")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
# convert the string team names into integers
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

encoding = LabelEncoder()
encoding.fit(data["Home Team"].values)
home_teams = encoding.transform(data["Home Team"].values)
visitor_teams = encoding.transform(data["Visitor Team"].values)
# One row per game: [home team id, visitor team id].
X_teams = np.vstack([home_teams, visitor_teams]).T

# Encode these integers into a number of binary features, so the tree does
# not treat the arbitrary team ids as ordered quantities.
onehot = OneHotEncoder()
# `.toarray()` yields a plain ndarray; `.todense()` returned an `np.matrix`,
# which current scikit-learn estimators refuse as input.
X_teams_expanded = onehot.fit_transform(X_teams).toarray()

clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_teams_expanded, y_true, scoring="accuracy")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
# Swap the single tree for a random forest, still using only the one-hot
# team features, and score it the same way.
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(
    estimator=clf,
    X=X_teams_expanded,
    y=y_true,
    scoring='accuracy',
)
print("Accuracy: {0:.1f}%".format(scores.mean() * 100))
# Put the rank / last-winner columns side by side with the one-hot team
# columns, then evaluate a random forest on the combined feature matrix.
X_all = np.concatenate([X_lastwinner, X_teams_expanded], axis=1)
print("X_all shape: {0}".format(X_all.shape))

clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(
    estimator=clf,
    X=X_all,
    y=y_true,
    scoring='accuracy',
)
print("Accuracy: {0:.1f}%".format(scores.mean() * 100))
# Tune the random forest with an exhaustive grid search over a few settings.
# `sklearn.grid_search` was removed in scikit-learn 0.20; GridSearchCV lives
# in `sklearn.model_selection` since 0.18. Fall back for old installs.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:  # scikit-learn < 0.18
    from sklearn.grid_search import GridSearchCV

parameter_space = {
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself was removed
    # in scikit-learn 1.3, while 'sqrt' is accepted by old and new versions.
    "max_features": [2, 10, 'sqrt'],
    "n_estimators": [100],
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [2, 4, 6],
}
clf = RandomForestClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_all, y_true)
# best_score_ is the mean cross-validated score of the best parameter set.
print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。