import nltk
string = "My father's name being Pririp,and my Christian name Philip,my infant tongue could make of both names nothing longer or more explicit than Pip. So,I called myself Pip,and came to be called Pip."
string_tokenized = nltk.word_tokenize(string)
string_postagged = nltk.pos_tag(string_tokenized)
string_postagged
[('My', 'PRP$'), ('father', 'NN'), ("'s", 'POS'), ('name', 'NN'), ('being', 'VBG'), ('Pririp', 'NNP'), (',', ','), ('and', 'CC'), ('my', 'PRP$'), ('Christian', 'JJ'), ('name', 'NN'), ('Philip', 'NNP'), (',', ','), ('my', 'PRP$'), ('infant', 'JJ'), ('tongue', 'NN'), ('could', 'MD'), ('make', 'VB'), ('of', 'IN'), ('both', 'DT'), ('names', 'NNS'), ('nothing', 'NN'), ('longer', 'RB'), ('or', 'CC'), ('more', 'JJR'), ('explicit', 'NNS'), ('than', 'IN'), ('Pip', 'NNP'), ('.', '.'), ('So', 'NNP'), (',', ','), ('I', 'PRP'), ('called', 'VBD'), ('myself', 'PRP'), ('Pip', 'NNP'), (',', ','), ('and', 'CC'), ('came', 'VBD'), ('to', 'TO'), ('be', 'VB'), ('called', 'VBN'), ('Pip', 'NNP'), ('.', '.')]
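The tags follow the Penn Treebank tagset. If a code such as PRP$ or VBG is unfamiliar, NLTK can print its definition and example words; a minimal sketch (nltk.help.upenn_tagset needs the 'tagsets' data package, installable with nltk.download('tagsets')):

import nltk

# print the definition and examples for a Penn Treebank tag
nltk.help.upenn_tagset('PRP$')
nltk.help.upenn_tagset('VBG')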
for i in string_postagged:
    print(i[0] + '_' + i[1], end=' ')  # end=' ' joins the word_TAG pairs on one line
My_PRP$ father_NN 's_POS name_NN being_VBG Pririp_NNP ,_, and_CC my_PRP$ Christian_JJ name_NN Philip_NNP ,_, my_PRP$ infant_JJ tongue_NN could_MD make_VB of_IN both_DT names_NNS nothing_NN longer_RB or_CC more_JJR explicit_NNS than_IN Pip_NNP ._. So_NNP ,_, I_PRP called_VBD myself_PRP Pip_NNP ,_, and_CC came_VBD to_TO be_VB called_VBN Pip_NNP
import nltk

string = "My father's name being Pririp,and my Christian name Philip,my infant tongue could make of both names nothing longer or more explicit than Pip. So,I called myself Pip,and came to be called Pip."

# split the string into sentences
sent_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
sents_splitted = sent_splitter.tokenize(string)

# raw string so the backslashes in the Windows path are kept literally
file_out = open(r'D:\works\文本分析\sent_postagged.txt','a')

# POS-tag the sentence-split text
for sent in sents_splitted:
    # POS-tag the sentence
    sent_tokenized = nltk.word_tokenize(sent)
    sent_postag = nltk.pos_tag(sent_tokenized)
    # write the tagged sentence to sent_postagged.txt, one sentence per line
    for i in sent_postag:
        output = i[0] + '_' + i[1] + ' '
        file_out.write(output)
    file_out.write('\n')
file_out.close()
My_PRP$ father_NN 's_POS name_NN being_VBG Pririp_NNP ,_, and_CC my_PRP$ Christian_JJ name_NN Philip_NNP ,_, my_PRP$ infant_JJ tongue_NN could_MD make_VB of_IN both_DT names_NNS nothing_NN longer_RB or_CC more_JJR explicit_NNS than_IN Pip_NNP ._.
So_RB ,_, I_PRP called_VBD myself_PRP Pip_NNP ,_, and_CC came_VBD to_TO be_VB called_VBN Pip_NNP ._.
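The same result can be had a bit more robustly: nltk.sent_tokenize wraps the punkt model loaded above, and a with-statement guarantees the file is closed even if tagging raises an error. A minimal sketch, reusing the output path from the example and a shortened sample text:

import nltk

text = "My father's name being Pririp. So, I called myself Pip."  # shortened sample text
sents = nltk.sent_tokenize(text)  # wraps the same punkt sentence splitter

with open(r'D:\works\文本分析\sent_postagged.txt', 'a', encoding='utf-8') as file_out:
    for sent in sents:
        tagged = nltk.pos_tag(nltk.word_tokenize(sent))
        # join word_TAG pairs with spaces, one sentence per line
        file_out.write(' '.join(word + '_' + tag for word, tag in tagged) + '\n')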
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('books','n')) #book
print(lemmatizer.lemmatize('went','v')) #go
print(lemmatizer.lemmatize('better','a')) #good
print(lemmatizer.lemmatize('geese')) #goose
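Note that lemmatize() defaults to pos='n', which is why 'geese' works without a second argument while 'went' and 'better' need one. In practice the POS usually comes from pos_tag, so a common pattern is to map Penn Treebank tags onto the WordNet categories the lemmatizer expects. A sketch of that pattern (penn_to_wordnet is an illustrative helper name, not part of NLTK):

import nltk
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer

def penn_to_wordnet(tag):
    # map a Penn Treebank tag to the WordNet POS constant the lemmatizer expects
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN  # default, same as lemmatize()'s own default

lemmatizer = WordNetLemmatizer()
for word, tag in nltk.pos_tag(nltk.word_tokenize("I called myself Pip")):
    print(word, '->', lemmatizer.lemmatize(word, penn_to_wordnet(tag)))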
#%%
import nltk
from nltk.util import ngrams
string = "My father's name being Pririp,and my Christian name Philip,my infant tongue could make of both names nothing longer or more explicit than Pip. So,I called myself Pip,and came to be called Pip."
string_tokenized = nltk.word_tokenize(string.lower())
n = 4
n_grams = ngrams(string_tokenized,n)
for grams in n_grams:
    print(grams)
('my', 'father', "'s", 'name')
('father', "'s", 'name', 'being')
("'s", 'name', 'being', 'pririp')
('name', 'being', 'pririp', ',')
('being', 'pririp', ',', 'and')
('pririp', ',', 'and', 'my')
(',', 'and', 'my', 'christian')
('and', 'my', 'christian', 'name')
('my', 'christian', 'name', 'philip')
('christian', 'name', 'philip', ',')
('name', 'philip', ',', 'my')
('philip', ',', 'my', 'infant')
(',', 'my', 'infant', 'tongue')
('my', 'infant', 'tongue', 'could')
('infant', 'tongue', 'could', 'make')
('tongue', 'could', 'make', 'of')
('could', 'make', 'of', 'both')
('make', 'of', 'both', 'names')
('of', 'both', 'names', 'nothing')
('both', 'names', 'nothing', 'longer')
('names', 'nothing', 'longer', 'or')
('nothing', 'longer', 'or', 'more')
('longer', 'or', 'more', 'explicit')
('or', 'more', 'explicit', 'than')
('more', 'explicit', 'than', 'pip')
('explicit', 'than', 'pip', '.')
('than', 'pip', '.', 'so')
('pip', '.', 'so', ',')
('.', 'so', ',', 'i')
('so', ',', 'i', 'called')
(',', 'i', 'called', 'myself')
('i', 'called', 'myself', 'pip')
('called', 'myself', 'pip', ',')
('myself', 'pip', ',', 'and')
('pip', ',', 'and', 'came')
(',', 'and', 'came', 'to')
('and', 'came', 'to', 'be')
('came', 'to', 'be', 'called')
('to', 'be', 'called', 'pip')
('be', 'called', 'pip', '.')
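For corpus work the raw n-gram stream is usually counted rather than printed; nltk.FreqDist (a Counter subclass) does this directly. A minimal sketch on the same string:

import nltk
from nltk.util import ngrams

string = "My father's name being Pririp,and my Christian name Philip,my infant tongue could make of both names nothing longer or more explicit than Pip. So,I called myself Pip,and came to be called Pip."
tokens = nltk.word_tokenize(string.lower())
# count how often each 4-gram occurs
freq = nltk.FreqDist(ngrams(tokens, 4))
for gram, count in freq.most_common(5):
    print(count, gram)

In a passage this short every 4-gram occurs once, so the counts only become informative on a longer text.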
import re
import nltk
from nltk.util import ngrams

string = "My father's name being Pririp,and my Christian name Philip,my infant tongue could make of both names nothing longer or more explicit than Pip. So,I called myself Pip,and came to be called Pip."
string_tokenized = nltk.word_tokenize(string.lower())

n = 4
n_grams = ngrams(string_tokenized,n)
n_grams_AlphaNum = []

for gram in n_grams:
    # test whether any token in the n-gram consists entirely of non-word characters
    # (i.e. filter out grams containing punctuation-only tokens)
    for i in range(4):
        if re.search(r'^\W+$',gram[i]):  # \W matches any non-word character, equivalent to [^A-Za-z0-9_]
            break
    else:
        # the for-loop's else runs only if no token triggered the break
        n_grams_AlphaNum.append(gram)

for j in n_grams_AlphaNum:
    print(j)
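The for/else filter above can also be written as a single comprehension: keep a gram only when every token contains at least one word character, which is the same test inverted. A sketch under that assumption:

import re
import nltk
from nltk.util import ngrams

string = "My father's name being Pririp,and my Christian name Philip,my infant tongue could make of both names nothing longer or more explicit than Pip. So,I called myself Pip,and came to be called Pip."
tokens = nltk.word_tokenize(string.lower())
# a gram survives only if none of its tokens is punctuation-only
n_grams_AlphaNum = [gram for gram in ngrams(tokens, 4)
                    if all(re.search(r'\w', tok) for tok in gram)]
for gram in n_grams_AlphaNum:
    print(gram)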