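# 赞  (web-page "like" button residue, not part of the script)
# 踩  (web-page "dislike" button residue, not part of the script)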
import re
# Copy every line of ge.txt that mentions "children" or "Children" into
# ge_children.txt. The output file is opened in append mode, so repeated runs
# add to whatever is already there.
children_pattern = re.compile(r'[Cc]hildren')
# The with-statement closes both files even if reading or writing fails.
with open(r'D:\works\文本分析\leopythonbookdata-master\texts\ge.txt', 'r') as file_in, \
        open(r'D:\works\文本分析\ge_children.txt', 'a') as file_out:
    for line in file_in:
        if children_pattern.search(line):
            file_out.write(line)
import re
# Copy every line of ge.txt that contains a word ending in "tional"
# (at least one word character followed by "tional" at a word boundary)
# into ge_tional.txt. The output file is opened in append mode.
tional_pattern = re.compile(r'\w+tional\b')
# The with-statement closes both files even if reading or writing fails.
with open(r'D:\works\文本分析\leopythonbookdata-master\texts\ge.txt', 'r') as file_in, \
        open(r'D:\works\文本分析\ge_tional.txt', 'a') as file_out:
    for line in file_in:
        if tional_pattern.search(line):
            # NOTE(review): lines read from the file normally still carry
            # their own trailing newline, so the extra '\n' leaves a blank
            # line after each match -- confirm that this is intended.
            file_out.write(line + '\n')
# Recognizing_gsl_awl_words.py, part 1
# Build the word list of ge.txt with frequencies and store the result in a
# dictionary (wordlist_freq_dict) that maps each word to its number of
# occurrences.
import re

all_words = []
# The with-statement closes the input file even if reading fails.
with open(r'D:\works\文本分析\leopythonbookdata-master\texts\ge.txt', 'r') as file_in:
    for line in file_in:
        # Lower-case the line, then replace every non-word character
        # (anything other than a letter, digit or underscore) with a space
        # so that split() yields the bare words.
        cleaned_line = re.sub(r'\W', r' ', line.lower())
        all_words.extend(cleaned_line.split())

wordlist_freq_dict = {}
for word in all_words:
    # get() supplies 0 for a word seen for the first time.
    wordlist_freq_dict[word] = wordlist_freq_dict.get(word, 0) + 1
# Recognizing_gsl_awl_words.py, part 2
# Read the GSL and the AWL word lists and save them in one dictionary
# (gsl_awl_dict) that maps each word to the list it came from:
# 1 = GSL1000, 2 = GSL2000, 3 = AWL.
gsl_awl_dict = {}
# The with-statement closes all three files even if reading fails.
with open(r'D:\works\文本分析\leopythonbookdata-master\texts\GSL1000.txt', 'r') as gsl1000_in, \
        open(r'D:\works\文本分析\leopythonbookdata-master\texts\GSL2000.txt', 'r') as gsl2000_in, \
        open(r'D:\works\文本分析\leopythonbookdata-master\texts\AWL.txt', 'r') as awl_in:
    # The lists are processed in this order, so a word that appears in more
    # than one list keeps the level of the last list that contains it.
    for word_file, level in ((gsl1000_in, 1), (gsl2000_in, 2), (awl_in, 3)):
        for word in word_file:
            # strip() removes the trailing newline and surrounding blanks.
            gsl_awl_dict[word.strip()] = level
# Recognizing_gsl_awl_words.py, part 3-1
# Categorize the words in wordlist_freq_dict into dictionaries of
# GSL1000 words, GSL2000 words, AWL words or others.
gsl1000_words = {}
gsl2000_words = {}
awl_words = {}
other_words = {}
for word in wordlist_freq_dict:
    if word not in gsl_awl_dict:
        # Words found in none of the lists are tagged with the value 4
        # instead of their frequency.
        other_words[word] = 4
    elif gsl_awl_dict[word] == 1:
        # Frequency in ge.txt of a word that belongs to GSL1000.
        gsl1000_words[word] = wordlist_freq_dict[word]
    elif gsl_awl_dict[word] == 2:
        gsl2000_words[word] = wordlist_freq_dict[word]
    elif gsl_awl_dict[word] == 3:
        awl_words[word] = wordlist_freq_dict[word]
# Recognizing_gsl_awl_words.py, part 3-2
# Compute the total frequency of each word category by adding up, for every
# word in the category, its frequency recorded in wordlist_freq_dict.
gsl1000_freq_total = sum(wordlist_freq_dict[word] for word in gsl1000_words)
gsl2000_freq_total = sum(wordlist_freq_dict[word] for word in gsl2000_words)
awl_freq_total = sum(wordlist_freq_dict[word] for word in awl_words)
other_freq_total = sum(wordlist_freq_dict[word] for word in other_words)
# Recognizing_gsl_awl_words.py, part 3-3
# Count how many distinct words fall into each category
# (GSL1000, GSL2000, AWL and other words).
(gsl1000_num_of_words,
 gsl2000_num_of_words,
 awl_num_of_words,
 other_num_of_words) = map(len, (gsl1000_words, gsl2000_words, awl_words, other_words))

# Total number of words (summed frequencies) in ge.txt.
freq_total = sum((gsl1000_freq_total, gsl2000_freq_total, awl_freq_total, other_freq_total))

# Total number of word types (distinct word forms) in ge.txt.
num_of_words_total = sum((gsl1000_num_of_words, gsl2000_num_of_words,
                          awl_num_of_words, other_num_of_words))
# Recognizing_gsl_awl_words.py, part 4
# Write out the results: word-type counts, frequency totals with their share
# of the overall frequency, and the sorted word list of every category.


def _freq_share(count, total):
    """Return count / total as a float, or 0.0 when total is zero."""
    # An empty input text gives a total of 0; report 0.0 instead of raising
    # ZeroDivisionError in the middle of writing the report.
    return count / float(total) if total else 0.0


# part 4-1: open the results file (append mode) and write the type counts.
# The with-statement closes the file even if one of the writes fails.
with open(r'D:\works\文本分析\range_wordlist_results.txt', 'a') as file_out:
    file_out.write('RESULTS OF WORD ANALYSIS\n\n')
    file_out.write('Total No. of word types in Great Expectations: ' + str(num_of_words_total) + '\n\n')
    file_out.write('Total No. of GSL1000 word types : ' + str(gsl1000_num_of_words) + '\n\n')
    file_out.write('Total No. of GSL2000 word types : ' + str(gsl2000_num_of_words) + '\n')
    file_out.write('Total No. of AWL word types : ' + str(awl_num_of_words) + '\n')
    file_out.write('Total No. of other word types : ' + str(other_num_of_words) + '\n')

    # part 4-2: frequency totals and each category's share of the total.
    file_out.write('\n\n')
    file_out.write('Total word frequency of Great Expectations: ' + str(freq_total) + '\n\n')
    file_out.write('Total frequency of GSL1000 words: ' + str(gsl1000_freq_total) + '\n')
    file_out.write('Frequency percentage of GSL1000 words: ' + str(_freq_share(gsl1000_freq_total, freq_total)) + '\n\n')
    file_out.write('Total frequency of GSL2000 words: ' + str(gsl2000_freq_total) + '\n')
    file_out.write('Frequency percentage of GSL2000 words: ' + str(_freq_share(gsl2000_freq_total, freq_total)) + '\n\n')
    file_out.write('Total frequency of AWL words: ' + str(awl_freq_total) + '\n')
    file_out.write('Frequency percentage of AWL words: ' + str(_freq_share(awl_freq_total, freq_total)) + '\n\n')
    file_out.write('Total frequency of other words: ' + str(other_freq_total) + '\n')
    file_out.write('Frequency percentage of other words: ' + str(_freq_share(other_freq_total, freq_total)) + '\n')

    # part 4-3: one alphabetically sorted "word<TAB>frequency" listing per
    # category. Each entry is (heading, words to list, dict to read the
    # frequency from).
    word_sections = (
        ('Words in GSL1000\n\n', gsl1000_words, gsl1000_words),
        ('Words in GSL2000\n\n', gsl2000_words, gsl2000_words),
        ('Words in AWL\n\n', awl_words, awl_words),
        # Frequencies of the other words are looked up in wordlist_freq_dict,
        # not in other_words itself.
        ('Other words\n\n', other_words, wordlist_freq_dict),
    )
    for heading, words, freqs in word_sections:
        file_out.write('\n\n')
        file_out.write('##########\n')
        file_out.write(heading)
        for word in sorted(words):
            file_out.write(word + '\t' + str(freqs[word]) + '\n')
# Copyright © 2003-2013 www.wpsshop.cn. All rights reserved.