赞
踩
文本特征是从语音转文本(转录)模型输出的转录本中派生出来的任何特征。
Text Feature | Description | Use case |
---|---|---|
Keyword frequency | 单词“basketball”相对于单词总数的计数,有助于确定主题。 | Useful to determine topics. |
[字符频率](https://en.wikipedia.org/wiki/Character_(symbol)) | 字母“a”相对于所有字符的计数 | 字母频率代表语音中的音素,有时会提高模型的准确性。国际音标提供了英语音素的标准列表 |
情感极性 | 正面、负面或中性;可以检测转录本的内容是正面、负面还是中性 | 有助于检测情感内容 |
形态特征 | 动词的过去、现在或将来时(引理和表面形式) | 在对话中查看基于时间的内容很有用 |
句法特征 | 标记和词性之间的依赖关系(例如,整个文本中的名词-动词-名词频率)。 | 生物识别-人们有一个非常独特的语法来描述他们的互动。计算量会更大 |
命名实体识别 | 一个特定的人,吉姆,在抄本中被使用的频率。 | 有助于确定会话中某些事情的相关性,或用于主题标记 |
nltk_features.py
import nltk
from nltk import word_tokenize
import speech_recognition as sr_audio
import numpy as np
from textblob import TextBlob
import transcribe as ts

# Number words counted as "spelled-out numbers".
_NUMBER_WORDS = ('zero', 'one', 'two', 'three', 'four', 'five',
                 'six', 'seven', 'eight', 'nine', 'ten')

# All Penn Treebank tags tallied from the tagger output.
_POS_TAGS = ['CC', 'CD', 'DT', 'EX', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD',
             'NN', 'NNP', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR',
             'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP',
             'VBZ', 'WDT', 'WP', 'WRB']

# Tags that actually appear in the output feature vector.
# NOTE: 'RB' was counted but omitted from the features/labels in the
# original code; it stays omitted here to preserve the 63-feature interface.
_FEATURE_TAGS = ['CC', 'CD', 'DT', 'EX', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD',
                 'NN', 'NNP', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RBR',
                 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP',
                 'VBZ', 'WDT', 'WP', 'WRB']


def nltk_featurize(file):
    """Extract a 63-element NLTK/TextBlob feature vector from an audio file.

    The audio file is first transcribed with pocketsphinx; features are
    computed on the resulting transcript.

    Args:
        file: path to the audio file to transcribe (e.g. 'test.wav').
              Fix: the original ignored this parameter and always
              transcribed 'test.wav'.

    Returns:
        (features, labels): a numpy array of 63 feature values and the
        matching list of 63 label strings. Features are: 26 letter counts,
        space count, number count (digits + spelled-out words), capital
        letter count, 31 part-of-speech tag counts, sentiment polarity,
        sentiment subjectivity, and repeated-word count.
    """
    transcript = ts.transcribe_sphinx(file)

    # Alphabetical features: raw counts of each lowercase letter.
    letters = 'abcdefghijklmnopqrstuvwxyz'
    letter_counts = [transcript.count(ch) for ch in letters]
    space = transcript.count(' ')

    # Numerical features: digit characters plus spelled-out number words.
    digit_count = sum(transcript.count(d) for d in '0123456789')
    word_count = sum(transcript.count(w) for w in _NUMBER_WORDS)
    number = digit_count + word_count
    capletter = sum(1 for ch in transcript if ch.isupper())

    # Part of speech: tag the TOKENIZED transcript.
    # Fix: the original called nltk.pos_tag(transcript) on the raw string,
    # which tags individual characters and produces meaningless counts.
    tokens = word_tokenize(transcript)
    tagged = nltk.pos_tag(tokens)
    pos_counts = {tag: 0 for tag in _POS_TAGS}
    for _, tag in tagged:
        if tag in pos_counts:
            pos_counts[tag] += 1

    # Sentiment via TextBlob: (polarity, subjectivity).
    blob = TextBlob(transcript)
    polarity = float(blob.sentiment[0])
    subjectivity = float(blob.sentiment[1])

    # Word repeats: count words that occur again later in the transcript.
    words = transcript.split()
    remaining = list(words)
    repeat = 0
    for word in words:
        remaining.remove(word)
        if word in remaining:
            repeat += 1

    # Fix: the original reused loop variable `i`, clobbering the count of
    # the letter 'i' before the feature array was assembled.
    features = np.array(letter_counts
                        + [space, number, capletter]
                        + [pos_counts[tag] for tag in _FEATURE_TAGS]
                        + [polarity, subjectivity, repeat])
    labels = list(letters) + ['space', 'numbers', 'capletters',
                              'cc', 'cd', 'dt', 'ex', 'in', 'jj', 'jjr',
                              'jjs', 'ls', 'md', 'nn', 'nnp', 'nns', 'pdt',
                              'pos', 'prp', 'prp2', 'rbr', 'rbs', 'rp', 'to',
                              'uh', 'vb', 'vbd', 'vbg', 'vbn', 'vbp', 'vbz',
                              'wdt', 'wp', 'wrb',
                              'polarity', 'subjectivity', 'repeat']
    return features, labels


# transcribe with pocketsphinx
features, labels = nltk_featurize('test.wav')
spacy_features.py
# Fix: the original did `import spacy_features` but then called the bare
# name `spacy_featurize`, which raises NameError. Import the function
# explicitly from the module.
from spacy_features import spacy_featurize

# Alice's Adventures in Wonderland = text
with open('alice.txt') as f:
    transcript = f.read()

features, labels = spacy_featurize(transcript)

# shows feature array with labels = 315 features total
print(features)
print(labels)
print(len(features))
print(len(labels))
gensim_features.py
import os
import numpy as np
from gensim.models import Word2Vec


def w2v_train(textlist, size, modelname):
    """Train a Word2Vec model on a list of sentence strings.

    Args:
        textlist: list of strings, one sentence/line per element. Each is
            whitespace-tokenized; empty lines are skipped.
        size: dimensionality of the word vectors.
        modelname: filename to save the model under (skipped if a file
            with that name already exists in the current directory).

    Returns:
        The trained gensim Word2Vec model.
    """
    # Tokenize each line, dropping empty lines so they do not become
    # empty training sentences.
    sentences = [line.split() for line in textlist if line.split()]
    # NOTE(review): the `size=` keyword is the gensim<4 API; gensim 4+
    # renamed it to `vector_size` — confirm the installed gensim version.
    model = Word2Vec(sentences, size=size, window=5, min_count=1, workers=4)
    # Do not overwrite an existing model file with the same name.
    if modelname not in os.listdir():
        print('saving %s to disk...' % (modelname))
        model.save(modelname)
    return model


def sentence_embedding(sentence, size, modelname):
    """Embed a sentence as the average of its words' Word2Vec vectors.

    Args:
        sentence: the sentence to embed (whitespace-tokenized).
        size: dimensionality of the model's word vectors.
        modelname: filename of a saved Word2Vec model to load.

    Returns:
        numpy array of length `size`: the mean of the vectors of all
        in-vocabulary words, or all zeros if none are in vocabulary.
    """
    model = Word2Vec.load(modelname)
    vectors = []
    for token in sentence.split():
        try:
            # NOTE(review): `model[token]` is the gensim<4 lookup; gensim 4+
            # requires `model.wv[token]` — matches the training API above.
            vectors.append(model[token])
        except KeyError:
            # Out-of-vocabulary word: skip it so it does not distort
            # the average.
            pass
    out_embed = np.zeros(size)
    for vec in vectors:
        out_embed = out_embed + vec
    # Fix: the original divided by the vector dimension `size`; a mean of
    # word vectors must divide by the number of vectors actually summed.
    if vectors:
        out_embed = out_embed / len(vectors)
    return out_embed


# Load the Alice in Wonderland corpus and build a Word2Vec model.
# Fix: the original passed the whole file as ONE string, so w2v_train
# iterated over characters and trained on single-character "sentences";
# split into lines to get real sentences.
text = open('alice.txt').read().split('\n')
transcript = 'I had a great time at the bar today.'
modelname = 'alice.pickle'
w2v_train(text, 100, modelname)
features = sentence_embedding(transcript, 100, modelname)
print(features)
print(len(features))
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。