赞
踩
在网上找了一些 KDD CUP99 数据预处理的代码,并结合了自己写的一部分,整理如下。预处理主要包括数值化和归一化两步。
参考原博客:https://blog.csdn.net/qq_35733521/article/details/87889480?utm_medium=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-11.nonecase&depth_1-utm_source=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-11.nonecase
import csv
import pandas as pd
import numpy as np


def get_col_types():
    """Return the value tables used to turn symbolic columns into numbers.

    Returns:
        A 4-tuple ``(protocol_type, service_type, flag_type, label_type)``.
        The first three are lists of the known symbolic values; a value's
        position in its list is its numeric code.  ``label_type`` is a list
        of groups of attack names; the position of the group that contains
        a label is that label's numeric code.
    """
    protocol_type = ['icmp', 'tcp', 'udp']
    service_type = [
        'IRC', 'X11', 'Z39_50', 'auth', 'bgp', 'courier', 'csnet_ns', 'ctf',
        'daytime', 'discard', 'domain', 'domain_u', 'echo', 'eco_i', 'ecr_i',
        'efs', 'exec', 'finger', 'ftp', 'ftp_data', 'gopher', 'hostnames',
        'http', 'http_443', 'icmp', 'imap4', 'iso_tsap', 'klogin', 'kshell',
        'ldap', 'link', 'login', 'mtp', 'name', 'netbios_dgm', 'netbios_ns',
        'netbios_ssn', 'netstat', 'nnsp', 'nntp', 'ntp_u', 'other', 'pm_dump',
        'pop_2', 'pop_3', 'printer', 'private', 'red_i', 'remote_job', 'rje',
        'shell', 'smtp', 'sql_net', 'ssh', 'sunrpc', 'supdup', 'systat',
        'telnet', 'tftp_u', 'tim_i', 'time', 'urh_i', 'urp_i', 'uucp',
        'uucp_path', 'vmnet', 'whois', 'http_8001', 'aol', 'http_2784',
        'harvest',
    ]
    flag_type = ['OTH', 'REJ', 'RSTO', 'RSTOS0', 'RSTR', 'S0', 'S1', 'S2',
                 'S3', 'SF', 'SH']
    label_type = [
        ['normal.'],
        ['ipsweep.', 'mscan.', 'nmap.', 'portsweep.', 'saint.', 'satan.'],
        ['apache2.', 'back.', 'land.', 'mailbomb.', 'neptune.', 'pod.',
         'processtable.', 'smurf.', 'teardrop.', 'udpstorm.'],
        ['buffer_overflow.', 'httptunnel.', 'loadmodule.', 'perl.', 'ps.',
         'rootkit.', 'sqlattack.', 'xterm.'],
        ['ftp_write.', 'guess_passwd.', 'imap.', 'multihop.', 'named.',
         'phf.', 'sendmail.', 'snmpgetattack.', 'snmpguess.', 'spy.',
         'warezclient.', 'warezmaster.', 'worm.', 'xlock.', 'xsnoop.'],
    ]
    return protocol_type, service_type, flag_type, label_type


label_type_1 = ['normal.']


def _first_index_map(values):
    """Map each value to the index of its first occurrence in *values*."""
    lookup = {}
    for position, value in enumerate(values):
        # setdefault keeps the first position, matching list.index().
        lookup.setdefault(value, position)
    return lookup


def _numericize_csv(source_path, target_path, column_lookups, label_lookup):
    """Copy one CSV file, replacing symbolic fields with their numeric codes.

    Args:
        source_path: CSV file to read.
        target_path: CSV file to write (overwritten if it exists).
        column_lookups: ``{column_index: {symbol: code}}`` for the columns
            that must be converted.
        label_lookup: ``{label: code}`` applied to the last field of a row.

    Raises:
        ValueError: a converted column holds a value missing from its table.
    """
    # The source is opened first so a missing input does not truncate an
    # existing output file.
    with open(source_path, 'r', newline='') as src, \
            open(target_path, 'w', newline='') as dst:
        reader = csv.reader(src)
        writer = csv.writer(dst)
        for row in reader:
            if not row:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            for column, lookup in column_lookups.items():
                try:
                    row[column] = lookup[row[column]]
                except KeyError:
                    raise ValueError(
                        '%s line %d: unknown value %r in column %d'
                        % (source_path, reader.line_num, row[column], column)
                    ) from None
            # A label that belongs to no group is written through unchanged.
            row[-1] = label_lookup.get(row[-1], row[-1])
            writer.writerow(row)


def handle_data(source_file='KDDTrain+.csv',
                handled_file='KDDTrain+number.csv',
                test_source_file='KDDTest+.csv',
                test_handled_file='KDDTest+number.csv'):
    """Convert the symbolic columns of the train and test CSV files to numbers.

    Columns 1, 2 and 3 (protocol, service, flag) are replaced by their
    position in the tables from ``get_col_types``.  The last field of each
    row is replaced by the index of the label group that contains it; if no
    group contains it, the field is left as it is.

    Args:
        source_file: training CSV to read.
        handled_file: where the converted training data is written.
        test_source_file: test CSV to read.
        test_handled_file: where the converted test data is written.

    Raises:
        ValueError: a protocol/service/flag value is not in its table.
        OSError: an input file cannot be read or an output file written.
    """
    protocol_type, service_type, flag_type, label_type = get_col_types()

    column_lookups = {
        1: _first_index_map(protocol_type),
        2: _first_index_map(service_type),
        3: _first_index_map(flag_type),
    }
    label_lookup = {}
    for category, names in enumerate(label_type):
        for name in names:
            label_lookup.setdefault(name, category)

    _numericize_csv(source_file, handled_file, column_lookups, label_lookup)
    _numericize_csv(test_source_file, test_handled_file,
                    column_lookups, label_lookup)
    print('pre process completed!')
接下来是数据归一化,有的博客提到不需要归一化,不影响数据分类,大家可以试试。在这里,也给出接着数值化后进行归一化的操作。
import csv
import numpy as np
def min_max_normalize(matrix):
    """Scale every column of *matrix* to the range [0, 1].

    Each value becomes ``(x - column_min) / (column_max - column_min)``.
    A column whose values are all equal would divide by zero, so its range
    is replaced by 0.000001 and the column comes out as all zeros.

    Args:
        matrix: 2-D array-like of numbers, one record per row.

    Returns:
        A float ``numpy.ndarray`` with the same shape as *matrix*.
    """
    data = np.asarray(matrix, dtype=float)
    col_min = np.min(data, axis=0)
    col_range = np.max(data, axis=0) - col_min
    # Patch the range itself rather than the maximum: adding a tiny value to
    # a large maximum is lost to rounding and would still leave a zero range.
    col_range = np.where(col_range == 0, 0.000001, col_range)
    return (data - col_min) / col_range


def normalize_file(source='KDDTrain+number.csv', target="KDDTrain-last.csv"):
    """Read a numeric CSV, min-max normalize all its columns, and save it.

    Every column is scaled, including the last one.

    Args:
        source: comma-separated file containing only numbers.
        target: path of the normalized CSV to write (overwritten).
    """
    # Passing the path lets numpy open and close the file itself; ndmin=2
    # keeps a single-row file two-dimensional.
    my_matrix = np.loadtxt(source, delimiter=",", skiprows=0, ndmin=2)
    res = min_max_normalize(my_matrix)
    with open(target, "w+", newline='') as csvfile:
        writer = csv.writer(csvfile)
        # writerows writes all rows in one call.
        writer.writerows(res)


if __name__ == "__main__":
    normalize_file()
注意:标签列不需要归一化,而上面的代码会对所有列(包括最后一列标签)一起做缩放。可以在归一化之前手动去掉标签列,也可以在代码中将标签列排除在外。
至此,数据预处理完成,共得到两个 csv 文件:数值化后的文件和归一化后的文件。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。