The main goal of this article is to use my own dataset with the Chinese-Text-Classification-PyTorch project.
GitHub repository: Chinese-Text-Classification
Dataset: binary-classification text data for sentiment analysis. The review column holds the comment text, and the label column is 1 for positive and 0 for negative.
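Before splitting, it is worth a quick sanity check that the CSV really contains the review and label columns and that both classes are present. A minimal sketch (not part of the original article; the path and column names are the ones used in the code later on):

import pandas as pd

# Quick structural check of the CSV used in this article.
data = pd.read_csv(r'D:\Study\PycahrmProjects\sentimentAnalysis\wb_data1_denote1.csv',
                   encoding='utf-8-sig')
print(data.columns.tolist())          # expected: ['review', 'label']
print(data['label'].value_counts())   # counts of positive (1) and negative (0) samples
print(data.head(3))                   # first few rows for a visual check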
Read the CSV data file with pandas, then split it with sklearn's train_test_split function.
Split the dataset 8:1:1 into training, validation, and test sets: hold out 20% first, then split that 20% in half to get the 10% validation and 10% test portions:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, stratify=y, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_test, y_test, test_size=0.5, shuffle=True, stratify=y_test, random_state=42)
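A quick way to confirm the resulting proportions (this check is my addition; it assumes X, y and the splits above are already in memory):

total = len(y)
# Should print roughly 0.8, 0.1, 0.1 for the 8:1:1 split.
print(len(y_train) / total, len(y_valid) / total, len(y_test) / total)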
Generate three txt files (test.txt, train.txt, and dev.txt) in the dataset format the project expects, so they can replace its bundled data:
# Write the splits to txt files
testdir = "./WeiboData/data/test.txt"
traindir = "./WeiboData/data/train.txt"
validdir = "./WeiboData/data/dev.txt"

print(X_test)
print(y_test)

# Each line is the comment text, a tab, and the label.
# Note: 'a+' appends, so rerunning the script duplicates lines; use 'w' to overwrite instead.
with open(testdir, 'a+', encoding='utf-8-sig') as f:
    for i, j in zip(X_test, y_test):
        f.write(str(i) + '\t' + str(j) + '\n')
with open(traindir, 'a+', encoding='utf-8-sig') as f:
    for i, j in zip(X_train, y_train):
        f.write(str(i) + '\t' + str(j) + '\n')
with open(validdir, 'a+', encoding='utf-8-sig') as f:
    for i, j in zip(X_valid, y_valid):
        f.write(str(i) + '\t' + str(j) + '\n')
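Since these files are meant to replace the project's dataset, each line should end up as the comment text, a tab, and the label. A small read-back check (my addition, assuming test.txt was just written as above):

# Confirm the text<TAB>label layout on the first few lines.
with open("./WeiboData/data/test.txt", encoding='utf-8-sig') as f:
    for line in list(f)[:3]:
        # rsplit with maxsplit=1 tolerates stray tabs inside the comment text
        text, label = line.rstrip('\n').rsplit('\t', 1)
        print(label, '<-', text[:30])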
Complete code:
import pandas as pd
import jieba
from sklearn.model_selection import train_test_split

data = pd.read_csv(r'D:\Study\PycahrmProjects\sentimentAnalysis\wb_data1_denote1.csv',
                   encoding='utf-8-sig')
X = data['review'].values
y = data.label.values

# 8:1:1 split: hold out 20% first, then split that 20% in half into validation and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, stratify=y, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_test, y_test, test_size=0.5, shuffle=True, stratify=y_test, random_state=42)

print("Training set size =", len(y_train))
print("Training set positives =", len([w for w in y_train if w == 1]))
print("Training set negatives =", len([w for w in y_train if w == 0]))
print("Validation set size =", len(y_valid))
print("Validation set positives =", len([w for w in y_valid if w == 1]))
print("Validation set negatives =", len([w for w in y_valid if w == 0]))
print("Test set size =", len(y_test))
print("Test set positives =", len([w for w in y_test if w == 1]))
print("Test set negatives =", len([w for w in y_test if w == 0]))

# Write the splits to txt files
testdir = "./WeiboData/data/test.txt"
traindir = "./WeiboData/data/train.txt"
validdir = "./WeiboData/data/dev.txt"

print(X_test)
print(y_test)

# Each line is the comment text, a tab, and the label ('a+' appends on reruns).
with open(testdir, 'a+', encoding='utf-8-sig') as f:
    for i, j in zip(X_test, y_test):
        f.write(str(i) + '\t' + str(j) + '\n')
with open(traindir, 'a+', encoding='utf-8-sig') as f:
    for i, j in zip(X_train, y_train):
        f.write(str(i) + '\t' + str(j) + '\n')
with open(validdir, 'a+', encoding='utf-8-sig') as f:
    for i, j in zip(X_valid, y_valid):
        f.write(str(i) + '\t' + str(j) + '\n')
Note that after word segmentation the text needs to be split back into a list, one entry per line.
The code is as follows (example):
import re
import jieba

# Use jieba to segment each dataset
def tokenizer(data):
    # Collect the text, one sample per line
    text = []
    for i in range(data.shape[0]):
        text.append(str(data[i]))
    comment = '\n'.join(text)

    # Clean the text: strip digits, Latin letters, punctuation and special symbols with a regex
    symbols = r'[0-9a-zA-Z!%,。.,、~?()()?!“”::;"";……&\-_|.*^]'
    comments = re.sub(symbols, '', comment)

    comments_list = jieba.cut(comments)  # accurate mode
    # comments_list = jieba.cut_for_search(comments)  # search-engine mode
    return ' '.join(comments_list)  # join the segmented words with spaces

# Segment each dataset
X_test = tokenizer(X_test)
X_train = tokenizer(X_train)
X_valid = tokenizer(X_valid)

# Split the joined strings back into lists, one entry per line
X_valid = X_valid.split('\n')
X_test = X_test.split('\n')
X_train = X_train.split('\n')
import re
import pandas as pd
import jieba
from sklearn.model_selection import train_test_split

# Use jieba to segment each dataset
def tokenizer(data):
    # Collect the text, one sample per line
    text = []
    for i in range(data.shape[0]):
        text.append(str(data[i]))
    comment = '\n'.join(text)
    # Clean the text: strip digits, Latin letters, punctuation and special symbols with a regex
    symbols = r'[0-9a-zA-Z!%,。.,、~?()()?!“”::;"";……&\-_|.*^]'
    comments = re.sub(symbols, '', comment)
    comments_list = jieba.cut(comments)  # accurate mode
    # comments_list = jieba.cut_for_search(comments)  # search-engine mode
    return ' '.join(comments_list)  # join the segmented words with spaces

data = pd.read_csv(r'D:\Study\PycahrmProjects\sentimentAnalysis\wb_data1_denote1.csv',
                   encoding='utf-8-sig')
X = data['review'].values
y = data.label.values

# 8:1:1 split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, stratify=y, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_test, y_test, test_size=0.5, shuffle=True, stratify=y_test, random_state=42)

print("Training set size =", len(y_train))
print("Training set positives =", len([w for w in y_train if w == 1]))
print("Training set negatives =", len([w for w in y_train if w == 0]))
print("Validation set size =", len(y_valid))
print("Validation set positives =", len([w for w in y_valid if w == 1]))
print("Validation set negatives =", len([w for w in y_valid if w == 0]))
print("Test set size =", len(y_test))
print("Test set positives =", len([w for w in y_test if w == 1]))
print("Test set negatives =", len([w for w in y_test if w == 0]))

# Output files in the project's dataset format
testdir = "./WeiboData/data/test.txt"
traindir = "./WeiboData/data/train.txt"
validdir = "./WeiboData/data/dev.txt"

print(X_test)  # peek at the raw test data
print(y_test)

# Segment each dataset, then split the joined strings back into per-line lists
X_test = tokenizer(X_test)
X_train = tokenizer(X_train)
X_valid = tokenizer(X_valid)
X_valid = X_valid.split('\n')
X_test = X_test.split('\n')
X_train = X_train.split('\n')

print(X_test)  # peek at the segmented test data
print(type(X_test))
print(len(X_test))

# Write each sample as: segmented text, a tab, and the label.
# Note: 'a+' appends, so rerunning the script duplicates lines; use 'w' to overwrite instead.
with open(testdir, 'a+', encoding='utf-8-sig') as f:
    for i, j in zip(X_test, y_test):
        f.write(str(i) + '\t' + str(j) + '\n')
with open(traindir, 'a+', encoding='utf-8-sig') as f:
    for i, j in zip(X_train, y_train):
        f.write(str(i) + '\t' + str(j) + '\n')
with open(validdir, 'a+', encoding='utf-8-sig') as f:
    for i, j in zip(X_valid, y_valid):
        f.write(str(i) + '\t' + str(j) + '\n')
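One assumption worth calling out: if the project keeps the same layout as its bundled THUCNews example, its data directory also contains a class.txt with one class name per line, and the numeric labels in train/dev/test.txt index into that list. A hypothetical helper for this binary setup (my assumption about the layout; verify against the repository you are actually using):

# Hypothetical: write class.txt so that label 0 maps to the first line and label 1 to the second.
with open("./WeiboData/data/class.txt", 'w', encoding='utf-8') as f:
    f.write("negative\n")
    f.write("positive\n")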