Entity recognition has great practical value, for example identifying person names in a piece of text, which in turn serves as an important foundation for building knowledge graphs. The commonly recognized entity types are person names, place names, times, and organizations; entities can also be defined according to specific business needs. This article uses the CRF model as its theoretical basis and the People's Daily corpus to recognize person names, place names, times, and organizations, so that the required entity information can be extracted from a long, cluttered piece of text.
CRF theory itself can be studied from other references; this article is mainly a learning note and a basic technical exercise to support later business needs.
import re
import sklearn_crfsuite  # pip install sklearn-crfsuite
from sklearn_crfsuite import metrics
import joblib  # newer scikit-learn no longer ships sklearn.externals.joblib; pip install joblib

"""Initialization"""
train_corpus_path = r"D:\workspace\project\NLPcase\ner\data\199801.txt"
process_corpus_path = r"D:\workspace\project\NLPcase\ner\data\result-rmrb.txt"
# POS tag -> entity label: t = time, nr = person name, ns = place name, nt = organization
_maps = {u't': u'T', u'nr': u'PER', u'ns': u'LOC', u'nt': u'ORG'}

def read_corpus_from_file(file_path):
    """Read the corpus"""
    # use the file_path argument; add encoding='utf-8' or 'gbk' depending on how the corpus file was saved
    f = open(file_path, 'r')
    lines = f.readlines()
    f.close()
    return lines

def write_corpus_to_file(data, file_path):
    """Write the processed corpus"""
    f = open(file_path, 'wb')
    f.write(data)
    f.close()

def q_to_b(q_str):
    """Full-width to half-width conversion"""
    b_str = ""
    for uchar in q_str:
        inside_code = ord(uchar)
        if inside_code == 12288:  # full-width space is converted directly
            inside_code = 32
        elif 65374 >= inside_code >= 65281:  # other full-width characters differ by a fixed offset
            inside_code -= 65248
        b_str += chr(inside_code)
    return b_str

def b_to_q(b_str):
    """Half-width to full-width conversion"""
    q_str = ""
    for uchar in b_str:
        inside_code = ord(uchar)
        if inside_code == 32:  # half-width space is converted directly
            inside_code = 12288
        elif 126 >= inside_code >= 32:  # other half-width characters differ by a fixed offset
            inside_code += 65248
        q_str += chr(inside_code)
    return q_str
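
# Illustrative sanity check (not part of the original post): the corpus mixes full-width
# punctuation and digits, so q_to_b normalizes them to ASCII and b_to_q reverses the mapping.
assert q_to_b(u'ABC,123') == u'ABC,123'
assert b_to_q(u' ') == u'\u3000'  # half-width space -> full-width space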

def pre_process():
    """Corpus preprocessing"""
    lines = read_corpus_from_file(train_corpus_path)
    new_lines = []
    flag = 0
    for line in lines:
        flag += 1
        # tokens in the People's Daily corpus are separated by whitespace; split() also drops empty strings
        words = q_to_b(line.strip()).split()
        pro_words = process_t(words)
        pro_words = process_nr(pro_words)
        pro_words = process_k(pro_words)
        # drop the first token (the article/paragraph id) and keep the rest
        new_lines.append(' '.join(pro_words[1:]))
        if flag == 100:  # only the first 100 lines are processed here, as a small demo
            break
    write_corpus_to_file(data='\n'.join(new_lines).encode('utf-8'), file_path=process_corpus_path)

def process_k(words):
    """Handle coarse-grained tokens: merge bracketed groups in the corpus, e.g. [巴萨/n 俱乐部/n]nt"""
    pro_words = []
    index = 0
    temp = u''
    while True:
        word = words[index] if index < len(words) else u''
        if u'[' in word:
            temp += re.sub(pattern=u'/[a-zA-Z]*', repl=u'', string=word.replace(u'[', u''))
        elif u']' in word:
            w = word.split(u']')
            temp += re.sub(pattern=u'/[a-zA-Z]*', repl=u'', string=w[0])
            pro_words.append(temp + u'/' + w[1])
            temp = u''
        elif temp:
            temp += re.sub(pattern=u'/[a-zA-Z]*', repl=u'', string=word)
        elif word:
            pro_words.append(word)
        else:
            break
        index += 1
    return pro_words
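
# Illustrative example (made up, not from the original post): the bracketed group is
# collapsed into a single token tagged with the bracket's category.
assert process_k([u'[中国/ns', u'政府/n]nt', u'主席/n']) == [u'中国政府/nt', u'主席/n']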

def process_nr(words):
    """Handle person names: merge the separately tagged surname and given name, e.g. 温/nr 家宝/nr"""
    pro_words = []
    index = 0
    while True:
        word = words[index] if index < len(words) else u''
        if u'/nr' in word:
            next_index = index + 1
            if next_index < len(words) and u'/nr' in words[next_index]:
                pro_words.append(word.replace(u'/nr', u'') + words[next_index])
                index = next_index
            else:
                pro_words.append(word)
        elif word:
            pro_words.append(word)
        else:
            break
        index += 1
    return pro_words
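
# Illustrative example (made up): a surname and given name tagged separately are merged
# into a single /nr token.
assert process_nr([u'温/nr', u'家宝/nr', u'说/v']) == [u'温家宝/nr', u'说/v']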

def process_t(words):
    """Handle time expressions: merge separately tagged time words, e.g. (/w 一九九七年/t 十二月/t 三十一日/t )/w"""
    pro_words = []
    index = 0
    temp = u''
    while True:
        word = words[index] if index < len(words) else u''
        if u'/t' in word:
            temp = temp.replace(u'/t', u'') + word
        elif temp:
            pro_words.append(temp)
            pro_words.append(word)
            temp = u''
        elif word:
            pro_words.append(word)
        else:
            break
        index += 1
    return pro_words
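
# Illustrative example (made up): consecutive time words are merged into one /t token.
assert process_t([u'一九九七年/t', u'十二月/t', u'三十一日/t', u')/w']) == [u'一九九七年十二月三十一日/t', u')/w']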

def pos_to_tag(p):
    """Map a POS tag to an entity label"""
    t = _maps.get(p, None)
    return t if t else u'O'

def tag_perform(tag, index):
    """Produce BIO-style tags: B_ for the first character of an entity, I_ for the rest"""
    if index == 0 and tag != u'O':
        return u'B_{}'.format(tag)
    elif tag != u'O':
        return u'I_{}'.format(tag)
    else:
        return tag
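
# Illustrative check (not from the original post): pos_to_tag maps a POS tag to an entity type,
# and tag_perform adds the B_/I_ prefix depending on the character's position inside the word.
assert pos_to_tag(u'nr') == u'PER' and pos_to_tag(u'v') == u'O'
assert tag_perform(u'PER', 0) == u'B_PER' and tag_perform(u'PER', 2) == u'I_PER'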

def pos_perform(pos):
    """Strip the prior knowledge carried by the POS tag (entity POS tags other than t are reduced to n)"""
    if pos in _maps.keys() and pos != u't':
        return u'n'
    else:
        return pos

def initialize():
    """Initialization: load the processed corpus and build the sequences"""
    lines = read_corpus_from_file(process_corpus_path)
    words_list = [line.strip().split(' ') for line in lines if line.strip()]
    del lines
    return init_sequence(words_list)

def init_sequence(words_list):
    """Initialize the character sequence, POS sequence and tag sequence"""
    words_seq = [[word.split(u'/')[0] for word in words] for words in words_list]
    pos_seq = [[word.split(u'/')[1] for word in words] for words in words_list]
    tag_seq = [[pos_to_tag(p) for p in pos] for pos in pos_seq]
    # repeat the word-level POS/tag for every character of the word
    pos_seq = [[[pos_seq[index][i] for _ in range(len(words_seq[index][i]))]
                for i in range(len(pos_seq[index]))] for index in range(len(pos_seq))]
    tag_seq = [[[tag_perform(tag_seq[index][i], w) for w in range(len(words_seq[index][i]))]
                for i in range(len(tag_seq[index]))] for index in range(len(tag_seq))]
    # flatten to character level; pad the POS and character sequences at both ends
    pos_seq = [[u'un'] + [pos_perform(p) for pos in pos_seq for p in pos] + [u'un'] for pos_seq in pos_seq]
    tag_seq = [[t for tag in tag_seq for t in tag] for tag_seq in tag_seq]
    word_seq = [[u'<BOS>'] + [w for word in word_seq for w in word] + [u'<EOS>'] for word_seq in words_seq]
    return pos_seq, tag_seq, word_seq
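
# Illustrative example (made up): each word is expanded to characters; the word-level POS and
# entity tag are copied to every character, with B_/I_ prefixes marking first vs. following characters.
_p, _t, _w = init_sequence([[u'温家宝/nr', u'说/v']])
assert _w == [[u'<BOS>', u'温', u'家', u'宝', u'说', u'<EOS>']]
assert _t == [[u'B_PER', u'I_PER', u'I_PER', u'O']]
assert _p == [[u'un', u'n', u'n', u'n', u'v', u'un']]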

pre_process()
pos_seq, tag_seq, word_seq = initialize()

def extract_feature(word_grams):
    """Feature extraction: build a feature dict for every character window"""
    features, feature_list = [], []
    for index in range(len(word_grams)):
        for i in range(len(word_grams[index])):
            word_gram = word_grams[index][i]
            feature = {u'w-1': word_gram[0], u'w': word_gram[1], u'w+1': word_gram[2],
                       u'w-1:w': word_gram[0] + word_gram[1], u'w:w+1': word_gram[1] + word_gram[2],
                       # optional POS features, kept commented out as in the original:
                       # u'p-1': self.pos_seq[index][i], u'p': self.pos_seq[index][i+1],
                       # u'p+1': self.pos_seq[index][i+2],
                       # u'p-1:p': self.pos_seq[index][i]+self.pos_seq[index][i+1],
                       # u'p:p+1': self.pos_seq[index][i+1]+self.pos_seq[index][i+2],
                       u'bias': 1.0}
            feature_list.append(feature)
        features.append(feature_list)
        feature_list = []
    return features

def segment_by_window(words_list=None, window=3):
    """Cut a sequence into sliding windows of the given size"""
    words = []
    begin, end = 0, window
    for _ in range(1, len(words_list)):
        if end > len(words_list):
            break
        words.append(words_list[begin:end])
        begin = begin + 1
        end = end + 1
    return words
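
# Illustrative example (made up): a padded character sequence is cut into sliding windows of
# three characters, and each window becomes one feature dict for the character in the middle.
_chars = [u'<BOS>', u'温', u'家', u'宝', u'<EOS>']
_windows = segment_by_window(_chars)  # [['<BOS>','温','家'], ['温','家','宝'], ['家','宝','<EOS>']]
_feats = extract_feature([_windows])
assert _feats[0][1] == {u'w-1': u'温', u'w': u'家', u'w+1': u'宝',
                        u'w-1:w': u'温家', u'w:w+1': u'家宝', u'bias': 1.0}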

def generator():
    """Build the training data: features for every character window, aligned with the tag sequences"""
    word_grams = [segment_by_window(word_list) for word_list in word_seq]
    features = extract_feature(word_grams)
    return features, tag_seq
# ------------------ The main point: build each character's feature dict and align it with the observed tag sequence ------------------

'''Initialize model parameters'''
algorithm = 'lbfgs'
c1 = 0.1  # L1 regularization coefficient
c2 = 0.1  # L2 regularization coefficient
max_iterations = 100
model_path = r"D:\workspace\project\NLPcase\ner\model\model.pkl"
model = sklearn_crfsuite.CRF(algorithm=algorithm, c1=c1, c2=c2,
                             max_iterations=max_iterations, all_possible_transitions=True)

def save_model(model, model_path):
    """Save the trained model"""
    joblib.dump(model, model_path)

def load_model(model_path):
    """Load a trained model"""
    return joblib.load(model_path)

# Train the model
def train(model_path):
    x, y = generator()
    # hold out the first 20% of sentences for evaluation (pre_process above only keeps 100 lines,
    # so a fixed 500-sentence split would leave no training data)
    split = max(1, len(x) // 5)
    x_train, y_train = x[split:], y[split:]
    x_test, y_test = x[:split], y[:split]
    model.fit(x_train, y_train)
    labels = list(model.classes_)
    labels.remove('O')
    y_predict = model.predict(x_test)
    print(metrics.flat_f1_score(y_test, y_predict, average='weighted', labels=labels))
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
    print(metrics.flat_classification_report(y_test, y_predict, labels=sorted_labels, digits=3))
    save_model(model, model_path)

def predict(sent):
    """Predict entities in a raw sentence using the saved model"""
    model = load_model(model_path)
    u_sent = q_to_b(sent)
    word_lists = [[u'<BOS>'] + [c for c in u_sent] + [u'<EOS>']]
    word_grams = [segment_by_window(word_list) for word_list in word_lists]
    features = extract_feature(word_grams)
    y_predict = model.predict(features)
    entity = u''
    for index in range(len(y_predict[0])):
        if y_predict[0][index] != u'O':
            # start a new entity when the predicted type changes between adjacent characters
            if index > 0 and y_predict[0][index][-1] != y_predict[0][index - 1][-1]:
                entity += u' '
            entity += u_sent[index]
        elif entity and entity[-1] != u' ':  # guard against an empty entity string
            entity += u' '
    return entity
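
# Example usage (illustrative; it needs the People's Daily corpus file at train_corpus_path
# and a model written to model_path before prediction can work):
# train(model_path)
# print(predict(u'新华社北京十二月三十一日电'))  # prints the recognized entity spans separated by spaces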

References
https://blog.csdn.net/leitouguan8655/article/details/83382412
https://blog.csdn.net/lhxsir/article/details/83387240