Preface: preprocessing Chinese text by word segmentation, stopword removal, and similar steps.
import jieba
import pandas as pd
The file data_zh.xlsx must contain a column named text, where each row of that column holds one document.
xlsx_file = pd.read_excel('./data_zh.xlsx')
# to_csv writes to disk and returns None; note that the row index is also
# written out as an extra leading column (renamed to 'order' and dropped later)
xlsx_file.to_csv('./data_zh.csv', encoding='utf-8')
# Read the csv back in as a pandas DataFrame
df_file = pd.read_csv('./data_zh.csv', low_memory=False, encoding='utf-8')
# Show the first 5 rows
df_file.head()
def format_str(document):
    '''Strip all non-Chinese characters from a document.'''
    content_str = ''
    for _char in document:
        # Keep only characters in the CJK Unified Ideographs range
        if u'\u4e00' <= _char <= u'\u9fa5':
            content_str += _char
    return content_str
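The same filtering can also be written with a regular expression; a minimal equivalent sketch, assuming the same CJK Unified Ideographs range is what you want to keep:

import re

def format_str_re(document):
    # Remove every character outside the CJK Unified Ideographs range
    return re.sub(r'[^\u4e00-\u9fa5]', '', document)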
# Read the text column from the csv file and collect all documents into a list
document_list = list(df_file['text'])
chinese_list = []
for document in document_list:
    chinese_list.append(format_str(document))
chinese_list
The custom dictionary dict_my.txt has one entry per line, in the format:

词 3 n

Each entry has three parts: the word, its frequency, and its part-of-speech tag (optional; n means noun). For how the frequency is interpreted, see https://github.com/fxsjy/jieba/issues/14
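Entries can also be added programmatically instead of (or in addition to) editing the dictionary file; a minimal sketch using jieba.add_word, where 自然语言处理 is just an illustrative entry:

import jieba

# Equivalent to a "自然语言处理 3 n" line in dict_my.txt (illustrative entry)
jieba.add_word('自然语言处理', freq=3, tag='n')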
jieba.load_userdict("./dict_my.txt")
word_lists = []
for i in range(len(chinese_list)):
    result = []
    seg_list = jieba.cut(chinese_list[i])
    for word in seg_list:
        result.append(word)
    word_lists.append(result)
word_lists
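jieba also provides lcut, which returns a list directly, so the loop above collapses to one line:

word_lists = [jieba.lcut(text) for text in chinese_list]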
The stopword file stopwords_my.txt contains one stopword per line:
word1
word2
word3
……
# Load the stopwords (with-statement closes the file; strip() also removes
# stray '\r' and spaces, not just '\n')
stop_word_list = []
with open("./stopwords_my.txt", 'r', encoding='utf-8') as f:
    for line in f:
        stop_word_list.append(line.strip())
contents_clean = []
for word_list in word_lists:
    line_clean = []
    for word in word_list:
        if word in stop_word_list:
            continue
        line_clean.append(word)
    contents_clean.append(line_clean)
# The documents after stopword removal, as a list of word lists
contents_clean
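Membership tests against a Python list are linear in its length, so with a large stopword file it is common to convert the list to a set first; an equivalent sketch:

stop_word_set = set(stop_word_list)
contents_clean = [[word for word in word_list if word not in stop_word_set]
                  for word_list in word_lists]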
Show the high-frequency words, and manually move the frequent but meaningless ones into the stopword file.
# Merge the segmented, stopword-filtered results into a single list
all_words = []
for line in contents_clean:
    for word in line:
        all_words.append(word)
all_words
# Count word frequencies
from collections import Counter
wordcount = Counter(all_words)
word_count = wordcount.most_common(50)
frequence_list = []
for i in range(len(word_count)):
    frequence_list.append(word_count[i][0])
frequence_list
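most_common(50) returns (word, count) pairs sorted by descending count, so the raw counts can be inspected as well; a small sketch (the printed counts depend on your data):

# Print the ten most frequent words together with their counts
for word, count in word_count[:10]:
    print(word, count)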
# Collect the high-frequency words that are not yet in the stopword file
need_to_add_stopword = []
for i in frequence_list:
    if i not in stop_word_list:
        need_to_add_stopword.append(i)
# Print the high-frequency words missing from the stopword file
print("\n".join(str(i) for i in need_to_add_stopword))
# Copy the meaningless words from the output above into stopwords_my.txt,
# then rerun the stopword-removal step
# Join each cleaned document back into a single space-separated string
word_list = []
for i in range(len(contents_clean)):
    doc_str = ''
    for j in range(len(contents_clean[i])):
        doc_str += contents_clean[i][j] + ' '
    word_list.append(doc_str)
word_list
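The same result, minus the trailing space, in one idiomatic line:

word_list = [' '.join(words) for words in contents_clean]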
word_data=pd.DataFrame({'text1':word_list})
word_data.head()
# Attach the preprocessed text to the original DataFrame with join
data = df_file.join(word_data)
# Show the joined result
data.head()
# Rename the columns as needed; 'jiebatext' holds the preprocessed text
# (the 'order' column is the row index that to_csv wrote out earlier)
data.columns = ['order', 'title', 'text', 'jiebatext']
data.head()
# Drop the column we no longer need
del data['order']
# Inspect the final DataFrame
data.head()
In testing, the R implementation of the Structural Topic Model (STM) works best with xlsx input, because csv files can fail due to delimiter and similar issues.
data.to_csv(path_or_buf='./data_zh_done.csv', index=False, encoding='utf-8')
csv_file = pd.read_csv('./data_zh_done.csv', low_memory=False, encoding='utf-8')
# to_excel no longer takes an encoding argument; it needs an engine such as openpyxl
csv_file.to_excel('./data_zh_done.xlsx', index=False)
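If the csv copy is not needed for anything else, the round trip through csv can be skipped and the final DataFrame written to xlsx directly:

data.to_excel('./data_zh_done.xlsx', index=False)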