
[NLP Project: Text Classification] Splitting the Training, Validation, and Test Sets


This post walks through using your own dataset with the Chinese-Text-Classification-PyTorch project.
GitHub: Chinese-Text-Classification

Dataset: binary-labeled text for sentiment analysis, where the review column holds the comment text and label marks the polarity: 1 for positive, 0 for negative.
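For orientation, the CSV is expected to look roughly like this (the two rows below are made-up examples, not taken from the real dataset):

review,label
这家店的服务态度很好,1
物流太慢非常失望,0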

I. Splitting the dataset without tokenization

Read the CSV file with pandas, then split the dataset with the train_test_split function from sklearn.

1. Splitting the dataset

Split the data into training, validation, and test sets at an 8:1:1 ratio. Holding out 20% first and then cutting that holdout in half yields the 8:1:1 proportions, and stratify keeps the label distribution consistent across all three splits:

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    stratify=y,
                                                    random_state=42)

X_valid, X_test, y_valid, y_test = train_test_split(X_test,
                                                    y_test,
                                                    test_size=0.5,
                                                    shuffle=True,
                                                    stratify=y_test,
                                                    random_state=42)
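A quick sanity check (a minimal sketch; it assumes X and y have already been loaded as in the full code below) confirms the proportions:

n = len(y_train) + len(y_valid) + len(y_test)
# Expect roughly 0.8 / 0.1 / 0.1
print(len(y_train) / n, len(y_valid) / n, len(y_test) / n)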

2. Writing the splits to txt files

Generate three files (test.txt, train.txt, dev.txt) so the data matches the dataset format the project expects.
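Each line stores one sample as the text, a tab, then the label. For illustration (made-up rows), the generated files look like:

这家店的服务态度很好	1
物流太慢非常失望	0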

# Write each split to its own txt file
testdir = "./WeiboData/data/test.txt"
traindir = "./WeiboData/data/train.txt"
validdir = "./WeiboData/data/dev.txt"

# Quick peek at the test split
print(X_test)
print(y_test)

# 'w' overwrites any earlier output; the original 'a+' mode would
# append duplicate rows on every rerun
with open(testdir, 'w', encoding='utf-8-sig') as f:
    for i, j in zip(X_test, y_test):
        f.write(str(i) + '\t' + str(j) + '\n')

with open(traindir, 'w', encoding='utf-8-sig') as f:
    for i, j in zip(X_train, y_train):
        f.write(str(i) + '\t' + str(j) + '\n')

with open(validdir, 'w', encoding='utf-8-sig') as f:
    for i, j in zip(X_valid, y_valid):
        f.write(str(i) + '\t' + str(j) + '\n')

# No explicit f.close() is needed: each with block closes its file.
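The three near-identical blocks can also be collapsed into one small helper. This is a sketch; write_split is a name introduced here, not anything from the project:

def write_split(path, texts, labels):
    # One sample per line: text<TAB>label
    with open(path, 'w', encoding='utf-8-sig') as f:
        for text, label in zip(texts, labels):
            f.write(str(text) + '\t' + str(label) + '\n')

write_split(traindir, X_train, y_train)
write_split(validdir, X_valid, y_valid)
write_split(testdir, X_test, y_test)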

Full code:

import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv(r'D:\Study\PycahrmProjects\sentimentAnalysis\wb_data1_denote1.csv', encoding='utf-8-sig')

X = data['review'].values
y = data.label.values

# 8:1:1 split: hold out 20%, then cut the holdout in half
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    stratify=y,
                                                    random_state=42)

X_valid, X_test, y_valid, y_test = train_test_split(X_test,
                                                    y_test,
                                                    test_size=0.5,
                                                    shuffle=True,
                                                    stratify=y_test,
                                                    random_state=42)

print("training set size = ", len(y_train))
print("positive samples in training set = ", len([w for w in y_train if w == 1]))
print("negative samples in training set = ", len([w for w in y_train if w == 0]))
print("validation set size = ", len(y_valid))
print("positive samples in validation set = ", len([w for w in y_valid if w == 1]))
print("negative samples in validation set = ", len([w for w in y_valid if w == 0]))
print("test set size = ", len(y_test))
print("positive samples in test set = ", len([w for w in y_test if w == 1]))
print("negative samples in test set = ", len([w for w in y_test if w == 0]))

# Write each split to its own txt file
testdir = "./WeiboData/data/test.txt"
traindir = "./WeiboData/data/train.txt"
validdir = "./WeiboData/data/dev.txt"

# Quick peek at the test split
print(X_test)
print(y_test)

with open(testdir, 'w', encoding='utf-8-sig') as f:
    for i, j in zip(X_test, y_test):
        f.write(str(i) + '\t' + str(j) + '\n')

with open(traindir, 'w', encoding='utf-8-sig') as f:
    for i, j in zip(X_train, y_train):
        f.write(str(i) + '\t' + str(j) + '\n')

with open(validdir, 'w', encoding='utf-8-sig') as f:
    for i, j in zip(X_valid, y_valid):
        f.write(str(i) + '\t' + str(j) + '\n')


II. Splitting the dataset with tokenization

1. Tokenization

Note that after tokenization the combined string has to be split back into a per-line list.
Example code:

import re
import jieba

# Tokenize a split with jieba
def tokenizer(data):
    # Collect the raw text, one review per line
    text = []
    for i in range(data.shape[0]):
        text.append(str(data[i]))

    comment = '\n'.join(text)

    # Clean the text: strip digits, ASCII letters, punctuation, and other symbols.
    # The original pattern contained an invalid \A escape inside the character
    # class, which raises re.error on Python 3.7+.
    symbols = r'[0-9A-Za-z!%,。.,、~?()()?!“”::;;"…&\-_|.*^]'
    comments = re.sub(symbols, '', comment)

    comments_list = jieba.cut(comments)  # accurate mode
    # comments_list = jieba.cut_for_search(comments)  # search-engine mode
    return ' '.join(comments_list)  # join the tokens with spaces

# Tokenize each split
X_test = tokenizer(X_test)
X_train = tokenizer(X_train)
X_valid = tokenizer(X_valid)

# Split each tokenized string back into a per-line list
X_valid = X_valid.split('\n')
X_test = X_test.split('\n')
X_train = X_train.split('\n')
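Joining every review into one big string and then splitting it again works, but tokenizing each review on its own avoids the round-trip. A minimal alternative sketch (tokenize_rows is a name introduced here; it uses jieba.lcut, which returns a list, and the same cleaning pattern as above):

def tokenize_rows(rows):
    # Clean and tokenize one review at a time
    pattern = re.compile(r'[0-9A-Za-z!%,。.,、~?()()?!“”::;;"…&\-_|.*^]')
    return [' '.join(jieba.lcut(pattern.sub('', str(row)))) for row in rows]

X_train = tokenize_rows(X_train)
X_valid = tokenize_rows(X_valid)
X_test = tokenize_rows(X_test)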


2. Full code

import re

import pandas as pd
import jieba
from sklearn.model_selection import train_test_split

# Tokenize a split with jieba
def tokenizer(data):
    # Collect the raw text, one review per line
    text = []
    for i in range(data.shape[0]):
        text.append(str(data[i]))

    comment = '\n'.join(text)

    # Clean the text: strip digits, ASCII letters, punctuation, and other symbols
    symbols = r'[0-9A-Za-z!%,。.,、~?()()?!“”::;;"…&\-_|.*^]'
    comments = re.sub(symbols, '', comment)

    comments_list = jieba.cut(comments)  # accurate mode
    # comments_list = jieba.cut_for_search(comments)  # search-engine mode
    return ' '.join(comments_list)  # join the tokens with spaces


data = pd.read_csv(r'D:\Study\PycahrmProjects\sentimentAnalysis\wb_data1_denote1.csv', encoding='utf-8-sig')

X = data['review'].values
y = data.label.values

# 5:3:2 split: hold out 50% first...
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.5,
                                                    shuffle=True,
                                                    stratify=y,
                                                    random_state=42)

# ...then 40% of the holdout becomes the test set (0.5 * 0.4 = 0.2), leaving
# 30% as validation; the original test_size=0.3 actually produced 50:35:15
X_valid, X_test, y_valid, y_test = train_test_split(X_test,
                                                    y_test,
                                                    test_size=0.4,
                                                    shuffle=True,
                                                    stratify=y_test,
                                                    random_state=42)

print("training set size = ", len(y_train))
print("positive samples in training set = ", len([w for w in y_train if w == 1]))
print("negative samples in training set = ", len([w for w in y_train if w == 0]))
print("validation set size = ", len(y_valid))
print("positive samples in validation set = ", len([w for w in y_valid if w == 1]))
print("negative samples in validation set = ", len([w for w in y_valid if w == 0]))
print("test set size = ", len(y_test))
print("positive samples in test set = ", len([w for w in y_test if w == 1]))
print("negative samples in test set = ", len([w for w in y_test if w == 0]))

# Output paths
testdir = "./WeiboData/data/test.txt"
traindir = "./WeiboData/data/train.txt"
validdir = "./WeiboData/data/dev.txt"

# Quick peek at the raw test split
print(X_test)
print(y_test)

# Tokenize each split
X_test = tokenizer(X_test)
X_train = tokenizer(X_train)
X_valid = tokenizer(X_valid)

# Split each tokenized string back into a per-line list
X_valid = X_valid.split('\n')
X_test = X_test.split('\n')
X_train = X_train.split('\n')

print(X_test)
print(type(X_test))
print(len(X_test))

with open(testdir, 'w', encoding='utf-8-sig') as f:
    for i, j in zip(X_test, y_test):
        f.write(str(i) + '\t' + str(j) + '\n')

with open(traindir, 'w', encoding='utf-8-sig') as f:
    for i, j in zip(X_train, y_train):
        f.write(str(i) + '\t' + str(j) + '\n')

with open(validdir, 'w', encoding='utf-8-sig') as f:
    for i, j in zip(X_valid, y_valid):
        f.write(str(i) + '\t' + str(j) + '\n')
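With train.txt, dev.txt, and test.txt in place under WeiboData/data/, the project can be trained on the new dataset. As a rough usage sketch (the exact entry point and flags depend on the repo version, so treat this as an assumption to verify against the project's README):

python run.py --model TextCNN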


