Named Entity Recognition (NER)
Like automatic word segmentation and part-of-speech tagging, named entity recognition is a fundamental task in natural language processing. Its goal is to identify named entities in a corpus, such as person names, place names, and organization names.
Rule-based NER generally takes one of two forms:
the first is matching with regular expressions (a small sketch follows below); the second goes through StanfordCoreNLP.
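A minimal sketch of the regex approach, assuming we only care about entity types that follow a very regular surface pattern; the patterns and the sample sentence below are illustrative only, real rule sets are much larger and hand-tuned:

# -*- coding: utf-8 -*-
import re

# Hypothetical patterns for two easily "regular" entity types: dates and mobile phone numbers
DATE_PATTERN = re.compile(r'\d{4}年\d{1,2}月\d{1,2}日')
PHONE_PATTERN = re.compile(r'1[3-9]\d{9}')

def regex_ner(text):
    """Return a dict mapping entity type -> list of matched strings."""
    return {
        'date': DATE_PATTERN.findall(text),
        'phone': PHONE_PATTERN.findall(text),
    }

print(regex_ner('会议定于2023年5月12日举行,联系电话13812345678。'))
# {'date': ['2023年5月12日'], 'phone': ['13812345678']}

Regex rules are cheap and precise for closed, well-formatted entity classes, but they do not generalize to open classes such as person or organization names, which is where the StanfordCoreNLP approach below comes in.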
The StanfordCoreNLP approach:
ner.py: the main entry script, which reads the input text
# -*- coding: utf-8 -*-
from grammer.rules import grammer_parse

fp = open('text.txt', 'r', encoding='utf8')
fout = open('out.txt', 'w', encoding='utf8')
# Iterate over the file object directly (iterating over readlines() would work just as well)
for line in fp:
    if len(line.strip()) > 0:
        grammer_parse(line.strip(), fout)
fp.close()
fout.close()
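For each qualifying line of text.txt, grammer_parse (defined in rules.py below) writes one pretty-printed JSON object with the keys date/num/org/loc to out.txt. The values shown here are only illustrative; the leading spaces come from how the fields are concatenated in get_stanford_ner_nodes:

{
    "date": " 2018年3月",
    "num": "",
    "org": " 北京大学",
    "loc": " 北京"
}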
rules.py: defines the chunking grammar rules
#encoding=utf8
import nltk, json
from .tools import ner_stanford, cut_stanford


def get_stanford_ner_nodes(parent):
    # Walk the chunk tree and collect the tokens under each entity label
    date = ''
    num = ''
    org = ''
    loc = ''
    for node in parent:
        if type(node) is nltk.Tree:
            if node.label() == 'DATE':
                date = date + " " + ''.join([i[0] for i in node])
            elif node.label() == 'NUMBER':
                num = num + " " + ''.join([i[0] for i in node])
            elif node.label() == 'ORGANIZATIONL':
                org = org + " " + ''.join([i[0] for i in node])
            elif node.label() == 'LOCATION':
                loc = loc + " " + ''.join([i[0] for i in node])
    if len(num) > 0 or len(date) > 0 or len(org) > 0 or len(loc) > 0:
        return {'date': date, 'num': num, 'org': org, 'loc': loc}
    else:
        return {}


def grammer_parse(raw_sentence=None, file_object=None):
    # assert grammer_type in set(['hanlp_keep','stanford_ner_drop','stanford_pos_drop'])
    # Skip sentences that are too short
    if len(raw_sentence.strip()) < 5:
        return False
    # Grammar definition: e.g. <DATE>+ merges one or more consecutive DATE tags into a single DATE chunk
    grammer_dict = \
        {
            'stanford_ner_drop': r"""
            DATE: {<DATE>+<MISC>?<DATE>*<O>{2}}
                  {<DATE>+<MISC>?<DATE>*}
                  {<DATE>+}
                  {<TIME>+}
            ORGANIZATIONL: {<ORGANIZATION>+}
            LOCATION: {<LOCATION|STATE_OR_PROVINCE|CITY|COUNTRY>+}
            """
        }
    # Compile the grammar with NLTK's RegexpParser
    stanford_ner_drop_rp = nltk.RegexpParser(grammer_dict['stanford_ner_drop'])
    try:
        # ner_stanford(raw_sentence) returns (token, tag) pairs; the tag 'O' marks tokens
        # that do not belong to any of the entity types we care about.
        # parse() returns an nltk.Tree, which can also be visualized with its draw() method.
        stanford_ner_drop_result = stanford_ner_drop_rp.parse(ner_stanford(raw_sentence))
    except:
        print("the error sentence is {}".format(raw_sentence))
    else:
        # Merge the nodes of the resulting tree according to the rules above
        stanford_keep_drop_dict = get_stanford_ner_nodes(stanford_ner_drop_result)
        if len(stanford_keep_drop_dict) > 0:
            # Serialize the dict to a JSON string and write it to the output file
            file_object.write(json.dumps(stanford_keep_drop_dict, skipkeys=False, ensure_ascii=False,
                                         check_circular=True, allow_nan=True, cls=None, indent=4,
                                         separators=None, default=None, sort_keys=False))
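To see how the chunking grammar behaves in isolation, here is a small self-contained sketch. The (token, tag) list only mimics the shape of stanford_nlp.ner() output (it is made up, not real CoreNLP output), and the grammar is a cut-down version of 'stanford_ner_drop' above:

# -*- coding: utf-8 -*-
import nltk

# Hand-made (token, tag) pairs in the same shape as stanford_nlp.ner() output (illustrative only)
tagged = [('2018', 'DATE'), ('年', 'DATE'), ('3', 'DATE'), ('月', 'DATE'),
          ('北京', 'CITY'), ('大学', 'ORGANIZATION'),
          ('发布', 'O'), ('了', 'O'), ('报告', 'O')]

grammar = r"""
    DATE: {<DATE>+}
    ORGANIZATIONL: {<ORGANIZATION>+}
    LOCATION: {<LOCATION|STATE_OR_PROVINCE|CITY|COUNTRY>+}
"""
tree = nltk.RegexpParser(grammar).parse(tagged)
for node in tree:
    if isinstance(node, nltk.Tree):
        print(node.label(), ''.join(tok for tok, _ in node))
# DATE 2018年3月
# LOCATION 北京
# ORGANIZATIONL 大学

Consecutive tokens carrying the same NER tag are merged into one chunk, which is exactly what get_stanford_ner_nodes then collects into the date/num/org/loc fields.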
tools.py: helper functions wrapping stanford_nlp (and HanLP)
#encoding=utf8
import os, gc, re, sys
from itertools import chain
from stanfordcorenlp import StanfordCoreNLP
import logging
from jpype import *

# Start the JVM and load HanLP (adjust the jar and data paths to your installation)
startJVM(getDefaultJVMPath(),
         r"-Djava.class.path=E:\NLP\hanlp\hanlp-1.5.0.jar;E:\NLP\hanlp",
         "-Xms1g", "-Xmx1g")
NLPTokenizer = JClass('com.hankcs.hanlp.tokenizer.StandardTokenizer')
HanLP = JClass('com.hankcs.hanlp.HanLP')

# Launch a local Stanford CoreNLP instance for Chinese
stanford_nlp = StanfordCoreNLP(r'E:\NLP\stanford-corenlp-full-2018-10-05', lang='zh',
                               quiet=False, logging_level=logging.DEBUG)
# stanford_nlp = StanfordCoreNLP(r'E:\NLP\stanford-corenlp-full-2018-10-05', lang='zh')

# HanLP part-of-speech tags to filter out (particles, punctuation, etc.)
drop_pos_set = set(['xu', 'xx', 'y', 'yg', 'wh', 'wky', 'wkz', 'wp', 'ws', 'wyy', 'wyz', 'wb',
                    'u', 'ud', 'ude1', 'ude2', 'ude3', 'udeng', 'udh'])
# Matches runs of characters outside digits, Latin letters, CJK, and a private-use range
han_pattern = re.compile(r'[^\dA-Za-z\u3007\u4E00-\u9FCB\uE815-\uE864]+')


def to_string(sentence, return_generator=False):
    # Segment with HanLP's StandardTokenizer and return (word, pos) pairs
    if return_generator:
        return (word_pos_item.toString().split('/') for word_pos_item in NLPTokenizer.segment(sentence))
    else:
        return [(word_pos_item.toString().split('/')[0], word_pos_item.toString().split('/')[1])
                for word_pos_item in NLPTokenizer.segment(sentence)]


def to_string_hanlp(sentence, return_generator=False):
    # Same as to_string, but uses the HanLP facade segmenter
    if return_generator:
        return (word_pos_item.toString().split('/') for word_pos_item in HanLP.segment(sentence))
    else:
        return [(word_pos_item.toString().split('/')[0], word_pos_item.toString().split('/')[1])
                for word_pos_item in HanLP.segment(sentence)]


def seg_sentences(sentence, with_filter=True, return_generator=False):
    # Segment a sentence and optionally drop words whose POS tag is in drop_pos_set
    segs = to_string(sentence, return_generator=return_generator)
    if with_filter:
        g = [word_pos_pair[0] for word_pos_pair in segs
             if len(word_pos_pair) == 2 and word_pos_pair[0] != ' ' and word_pos_pair[1] not in drop_pos_set]
    else:
        g = [word_pos_pair[0] for word_pos_pair in segs
             if len(word_pos_pair) == 2 and word_pos_pair[0] != ' ']
    return iter(g) if return_generator else g


def ner_stanford(raw_sentence, return_list=True):
    # Stanford CoreNLP named entity recognition: returns (token, ner_tag) pairs
    if len(raw_sentence.strip()) > 0:
        return stanford_nlp.ner(raw_sentence) if return_list else iter(stanford_nlp.ner(raw_sentence))


def ner_hanlp(raw_sentence, return_list=True):
    # HanLP segmentation with POS tags, as an alternative tagger
    if len(raw_sentence.strip()) > 0:
        return NLPTokenizer.segment(raw_sentence) if return_list else iter(NLPTokenizer.segment(raw_sentence))


def cut_stanford(raw_sentence, return_list=True):
    # Stanford CoreNLP part-of-speech tagging: returns (token, pos_tag) pairs
    if len(raw_sentence.strip()) > 0:
        return stanford_nlp.pos_tag(raw_sentence) if return_list else iter(stanford_nlp.pos_tag(raw_sentence))
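Assuming the CoreNLP and HanLP paths above exist on your machine, tools.py can also be exercised on its own, roughly like this (the results in the comments only indicate the shape of the output, not actual model predictions):

from grammer.tools import ner_stanford, cut_stanford, seg_sentences

sentence = '北京大学于2018年3月发布了一份报告。'

print(ner_stanford(sentence))   # [('北京', 'ORGANIZATION'), ('大学', 'ORGANIZATION'), ('于', 'O'), ...]
print(cut_stanford(sentence))   # [('北京', 'NR'), ('大学', 'NN'), ('于', 'P'), ...]
print(seg_sentences(sentence))  # segmented words with particles/punctuation filtered out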