A Detailed Look at a Medical Question-Answering System (Part 1)

For an introduction to the medical QA system, see: blog post

This part covers how the user's input to the QA system (natural language) is converted into query statements against the knowledge base.

Knowledge Base Question Answering

Knowledge Base Question Answering (KBQA): given a natural-language question, the system performs semantic understanding and parsing of the question, then queries and reasons over a knowledge base to arrive at the answer.
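To make that concrete, here is a tiny, purely illustrative sketch (not the project's code; the knowledge-base backend is not specified in this part, and the node label and the HAS_SYMPTOM relation below are invented) of how recognized entities and an intent could be turned into a query statement:

# Hypothetical mapping from (entities, intent) to a knowledge-base query.
# The Cypher-style template, the Disease label and the HAS_SYMPTOM relation
# are assumptions made only for this illustration.
def to_kb_query(entities, intention):
    if intention == "query_symptom" and "Disease" in entities:
        disease = entities["Disease"][0]
        return ("MATCH (d:Disease {name: '%s'})-[:HAS_SYMPTOM]->(s) "
                "RETURN s.name" % disease)
    return None

print(to_kb_query({"Disease": ["糖尿病"]}, "query_symptom"))
# MATCH (d:Disease {name: '糖尿病'})-[:HAS_SYMPTOM]->(s) RETURN s.name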

Intent Recognition in the Medical QA System

The named-entity module extracts the relevant entities contained in the user's question; intent recognition is then performed based on those entities.

The skeleton of the corresponding main class, EntityExtractor, is as follows:

class EntityExtractor:
    def __init__(self):
        pass

    def build_actree(self, wordlist):
        """
        Build an AC tree (Aho-Corasick automaton) to speed up filtering.
        :param wordlist:
        :return:
        """
        pass

    def entity_reg(self, question):
        """
        Pattern matching: get the matched words and their types,
        e.g. disease, disease alias, complication, symptom.
        :param question: str
        :return:
        """
        pass

    def find_sim_words(self, question):
        """
        When exact matching fails, fall back to similarity computation
        to find similar words.
        :param question:
        :return:
        """
        pass

    def editDistanceDP(self, s1, s2):
        """
        Compute the edit distance with dynamic programming.
        :param s1:
        :param s2:
        :return:
        """
        pass

    def simCal(self, word, entities, flag):
        """
        Compute the similarity between a word and the words in the dictionary:
        number of shared characters / min(|A|, |B|) + cosine similarity.
        :param word: str
        :param entities: List
        :return:
        """
        pass

    def check_words(self, wds, sent):
        """
        Classify based on feature words.
        :param wds:
        :param sent:
        :return:
        """
        pass

    def tfidf_features(self, text, vectorizer):
        """
        Extract TF-IDF features of the question.
        :param text:
        :param vectorizer:
        :return:
        """
        pass

    def other_features(self, text):
        """
        Extract keyword features of the question.
        :param text:
        :return:
        """
        pass

    def model_predict(self, x, model):
        """
        Predict the intent.
        :param x:
        :param model:
        :return:
        """
        pass

    def extractor(self, question):
        """
        Main entry point for entity extraction.
        """
        pass

Named Entity Module

  • Overall approach
    1. For the user's input, first match against the pre-built AC trees of diseases, disease aliases, complications, and symptoms.
    2. If none of the AC trees matches an entity, segment the input text with jieba and compute a similarity score (overlap score, cosine similarity score, edit-distance score) between each token and the words in the disease, disease-alias, complication, and symptom vocabularies. If the similarity score exceeds 0.7, the token is treated as an entity of that type.
    3. Finally, sort the candidates and take the most relevant word as the entity.

The project defines seven entity types in total: Disease, Alias, Symptom, Part, Department, Complication, and Drug, but entity recognition uses only Disease, Alias, Complication, and Symptom.

  • Project code

1 Building the AC tree

import ahocorasick

## Build an AC tree (Aho-Corasick automaton) to speed up filtering
def build_actree(self, wordlist):
    actree = ahocorasick.Automaton()
    for index, word in enumerate(wordlist):
        actree.add_word(word, (index, word))
    actree.make_automaton()
    return actree
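For reference, a small usage sketch (with a made-up word list): pyahocorasick's iter() yields (end_index, value) pairs, where value is whatever was stored with add_word, which is why the matching code further below reads i[1][1] to recover the matched word.

# Usage sketch with a made-up word list.
tree = ahocorasick.Automaton()
for idx, w in enumerate(["高血压", "头痛"]):
    tree.add_word(w, (idx, w))
tree.make_automaton()

for item in tree.iter("最近总是头痛是怎么回事"):
    print(item)        # e.g. (5, (1, '头痛'))
    print(item[1][1])  # the matched word itself: '头痛'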

Where it is called:

def __init__(self):
    self.disease_path = cur_dir + 'disease_vocab.txt'
    self.symptom_path = cur_dir + 'symptom_vocab.txt'
    self.alias_path = cur_dir + 'alias_vocab.txt'
    self.complication_path = cur_dir + 'complications_vocab.txt'

    self.disease_entities = [w.strip() for w in open(self.disease_path, encoding='utf-8') if w.strip()]
    self.symptom_entities = [w.strip() for w in open(self.symptom_path, encoding='utf-8') if w.strip()]
    self.alias_entities = [w.strip() for w in open(self.alias_path, encoding='utf-8') if w.strip()]
    self.complication_entities = [w.strip() for w in open(self.complication_path, encoding='utf-8') if w.strip()]

    ## Build an AC tree for each entity type
    self.disease_tree = self.build_actree(list(set(self.disease_entities)))
    self.alias_tree = self.build_actree(list(set(self.alias_entities)))
    self.symptom_tree = self.build_actree(list(set(self.symptom_entities)))
    self.complication_tree = self.build_actree(list(set(self.complication_entities)))

2 Filtering the question with the AC trees (entity matching)

## Pattern matching: get the matched words and their types.
def entity_reg(self, question):
    self.result = {}

    for i in self.disease_tree.iter(question):
        word = i[1][1]
        if "Disease" not in self.result:
            self.result["Disease"] = [word]
        else:
            self.result["Disease"].append(word)

    for i in self.alias_tree.iter(question):
        word = i[1][1]
        if "Alias" not in self.result:
            self.result["Alias"] = [word]
        else:
            self.result["Alias"].append(word)

    for i in self.symptom_tree.iter(question):
        word = i[1][1]
        if "Symptom" not in self.result:
            self.result["Symptom"] = [word]
        else:
            self.result["Symptom"].append(word)

    for i in self.complication_tree.iter(question):
        word = i[1][1]
        if "Complication" not in self.result:
            self.result["Complication"] = [word]
        else:
            self.result["Complication"].append(word)

    return self.result

Where it is called:

def extractor(self, question):
    self.entity_reg(question)
    ## ....

3 Entity matching based on similarity

When the AC trees do not match any entity, entity matching falls back to looking for similar words.

import re
import string
from gensim.models import KeyedVectors
import jieba

def find_sim_words(self, question):
    ## Load the custom user dictionary
    jieba.load_userdict(self.vocab_path)
    ## Load the pre-trained word vectors
    self.model = KeyedVectors.load_word2vec_format(self.word2vec_path, binary=False)

    ## Preprocessing: strip punctuation and special symbols with regular expressions
    sentence = re.sub("[{}]".format(re.escape(string.punctuation)), " ", question)
    sentence = re.sub("[,。‘’;:?、!【】]", " ", sentence)
    sentence = sentence.strip()

    ## Word segmentation (self.stopwords is the stop-word list)
    words = [w.strip() for w in jieba.cut(sentence) if w.strip() not in self.stopwords and len(w.strip()) >= 2]

    alist = []

    ## Compare each word against every entity dictionary and keep the
    ## highest-scoring entity together with its entity type
    for word in words:
        temp = [self.disease_entities, self.alias_entities, self.symptom_entities, self.complication_entities]
        for i in range(len(temp)):
            flag = ''
            if i == 0:
                flag = "Disease"
            elif i == 1:
                flag = "Alias"
            elif i == 2:
                flag = "Symptom"
            else:
                flag = "Complication"
            scores = self.simCal(word, temp[i], flag)
            alist.extend(scores)

    temp1 = sorted(alist, key=lambda k: k[1], reverse=True)
    if temp1:
        self.result[temp1[0][2]] = [temp1[0][0]]

## Similarity between a word and the dictionary words:
## character-overlap score + cosine similarity + edit-distance score
def simCal(self, word, entities, flag):
    a = len(word)
    scores = []
    for entity in entities:
        sim_num = 0
        b = len(entity)
        c = len(set(entity + word))

        temp = []
        for w in word:
            if w in entity:
                sim_num += 1
        if sim_num != 0:
            score1 = sim_num / c  # character-overlap score
            temp.append(score1)
        try:
            ## Cosine similarity from the word vectors
            score2 = self.model.similarity(word, entity)
            temp.append(score2)
        except:
            pass
        score3 = 1 - self.editDistanceDP(word, entity) / (a + b)  # edit-distance score
        if score3:
            temp.append(score3)

        score = sum(temp) / len(temp)
        if score >= 0.7:
            scores.append((entity, score, flag))

    scores.sort(key=lambda k: k[1], reverse=True)
    return scores
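The class skeleton leaves editDistanceDP as pass; a standard dynamic-programming edit distance consistent with how simCal calls it might look like this (a sketch, not necessarily the project's exact implementation):

def editDistanceDP(self, s1, s2):
    m, n = len(s1), len(s2)
    # dp[i][j] = edit distance between s1[:i] and s2[:j]
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        dp[i][0] = i
    for j in range(n + 1):
        dp[0][j] = j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if s1[i - 1] == s2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
            else:
                dp[i][j] = 1 + min(dp[i - 1][j],      # delete
                                   dp[i][j - 1],      # insert
                                   dp[i - 1][j - 1])  # substitute
    return dp[m][n]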

Intent Recognition Module

  • Overall approach
    1. Represent the text with TF-IDF features, together with some hand-crafted features (the number of common words of each intent class that appear in the sentence).
    2. Train a Naive Bayes model for the intent recognition task.
    3. Use the entity information to correct and supplement the intents.
  • Project code

1 Feature construction

About TF-IDF: my blog post
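As a quick reminder of the idea (scikit-learn's TfidfVectorizer uses a smoothed, normalized variant of this): tf-idf(t, d) = tf(t, d) × log(N / df(t)), where tf(t, d) is how often term t occurs in document d, N is the number of documents, and df(t) is the number of documents containing t.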

TF-IDF features:

## Extract TF-IDF features of the question
def tfidf_features(self, text, vectorizer):
    jieba.load_userdict(self.vocab_path)
    words = [w.strip() for w in jieba.cut(text) if w.strip() and w.strip() not in self.stopwords]
    sents = [' '.join(words)]

    tfidf = vectorizer.transform(sents).toarray()
    return tfidf
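The vectorizer passed into tfidf_features is assumed to be a scikit-learn TfidfVectorizer fitted offline on a segmented question corpus and saved with joblib; a minimal sketch of that offline step (the corpus here is made up, and the file name simply mirrors the model-loading snippet below) could be:

from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Hypothetical, already-segmented training questions; in the project these
# would come from the labelled intent-classification corpus.
corpus = ["感冒 有 什么 症状", "高血压 吃 什么 药", "糖尿病 挂 什么 科"]
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)                          # learn the vocabulary and IDF weights
joblib.dump(vectorizer, 'model/tfidf_model.m')  # file name mirrors the later snippet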

Hand-crafted features:

self.symptom_qwds = ['什么症状', '哪些症状', '症状有哪些', '症状是什么', '什么表征', '哪些表征', '表征是什么',
                     '什么现象', '哪些现象', '现象有哪些', '症候', '什么表现', '哪些表现', '表现有哪些',
                     '什么行为', '哪些行为', '行为有哪些', '什么状况', '哪些状况', '状况有哪些', '现象是什么',
                     '表现是什么', '行为是什么']  # asking about symptoms
self.cureway_qwds = ['药', '药品', '用药', '胶囊', '口服液', '炎片', '吃什么药', '用什么药', '怎么办',
                     '买什么药', '怎么治疗', '如何医治', '怎么医治', '怎么治', '怎么医', '如何治',
                     '医治方式', '疗法', '咋治', '咋办', '咋治', '治疗方法']  # asking about treatments
self.lasttime_qwds = ['周期', '多久', '多长时间', '多少时间', '几天', '几年', '多少天', '多少小时',
                      '几个小时', '多少年', '多久能好', '痊愈', '康复']  # asking about treatment duration
self.cureprob_qwds = ['多大概率能治好', '多大几率能治好', '治好希望大么', '几率', '几成', '比例',
                      '可能性', '能治', '可治', '可以治', '可以医', '能治好吗', '可以治好吗', '会好吗',
                      '能好吗', '治愈吗']  # asking about the cure rate
self.check_qwds = ['检查什么', '检查项目', '哪些检查', '什么检查', '检查哪些', '项目', '检测什么',
                   '哪些检测', '检测哪些', '化验什么', '哪些化验', '化验哪些', '哪些体检', '怎么查找',
                   '如何查找', '怎么检查', '如何检查', '怎么检测', '如何检测']  # asking about examination items
self.belong_qwds = ['属于什么科', '什么科', '科室', '挂什么', '挂哪个', '哪个科', '哪些科']  # asking which department
self.disease_qwds = ['什么病', '啥病', '得了什么', '得了哪种', '怎么回事', '咋回事', '回事',
                     '什么情况', '什么问题', '什么毛病', '啥毛病', '哪种病']  # asking which disease

## Extract keyword features of the question
def other_features(self, text):
    features = [0] * 7
    for d in self.disease_qwds:
        if d in text:
            features[0] += 1
    for s in self.symptom_qwds:
        if s in text:
            features[1] += 1
    for c in self.cureway_qwds:
        if c in text:
            features[2] += 1
    for c in self.check_qwds:
        if c in text:
            features[3] += 1
    for p in self.lasttime_qwds:
        if p in text:
            features[4] += 1
    for r in self.cureprob_qwds:
        if r in text:
            features[5] += 1
    for d in self.belong_qwds:
        if d in text:
            features[6] += 1

    ## Min-max normalize the keyword counts
    m = max(features)
    n = min(features)
    normed_features = []
    if m == n:
        normed_features = features
    else:
        for i in features:
            j = (i - n) / (m - n)
            normed_features.append(j)
    return np.array(normed_features)
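A quick usage sketch (the question is made up; the exact counts depend on the keyword lists above):

# With ext = EntityExtractor(), a medication question hits the cureway
# keyword list twice ('药' and '吃什么药'), so features[2] = 2 before
# normalization; after min-max normalization the returned vector is
# roughly [0, 0, 1, 0, 0, 0, 0].
feat = ext.other_features("得了高血压吃什么药")
print(feat)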

2 Classifying the text with the trained Naive Bayes model

# Intent-classification model files
self.tfidf_path = os.path.join(cur_dir, 'model/tfidf_model.m')
self.nb_path = os.path.join(cur_dir, 'model/intent_reg_model.m')  # Naive Bayes model
self.tfidf_model = joblib.load(self.tfidf_path)
self.nb_model = joblib.load(self.nb_path)

tfidf_feature = self.tfidf_features(question, self.tfidf_model)
other_feature = self.other_features(question)
m = other_feature.shape
other_feature = np.reshape(other_feature, (1, m[0]))
feature = np.concatenate((tfidf_feature, other_feature), axis=1)
predicted = self.model_predict(feature, self.nb_model)
intentions.append(predicted[0])
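model_predict is left as pass in the class skeleton above; since nb_model is a scikit-learn classifier loaded with joblib, it is presumably just a thin wrapper around predict, along these lines:

## Predict the intent (a sketch consistent with how the snippet above calls it)
def model_predict(self, x, model):
    predicted = model.predict(x)
    return predicted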

3 Supplementing and correcting the intent based on the recognized entities

# Known disease: query its symptoms
if self.check_words(self.symptom_qwds, question) and ('Disease' in types or 'Alias' in types):
    intention = "query_symptom"
    if intention not in intentions:
        intentions.append(intention)
# Known disease or symptom: query treatments
if self.check_words(self.cureway_qwds, question) and \
        ('Disease' in types or 'Symptom' in types or 'Alias' in types or 'Complication' in types):
    intention = "query_cureway"
    if intention not in intentions:
        intentions.append(intention)
# Known disease: query the treatment duration
if self.check_words(self.lasttime_qwds, question) and ('Disease' in types or 'Alias' in types):
    intention = "query_period"
    if intention not in intentions:
        intentions.append(intention)
# Known disease: query the cure rate
if self.check_words(self.cureprob_qwds, question) and ('Disease' in types or 'Alias' in types):
    intention = "query_rate"
    if intention not in intentions:
        intentions.append(intention)
# Known disease: query the examination items
if self.check_words(self.check_qwds, question) and ('Disease' in types or 'Alias' in types):
    intention = "query_checklist"
    if intention not in intentions:
        intentions.append(intention)
# Query the department
if self.check_words(self.belong_qwds, question) and \
        ('Disease' in types or 'Symptom' in types or 'Alias' in types or 'Complication' in types):
    intention = "query_department"
    if intention not in intentions:
        intentions.append(intention)
# Known symptom: query the disease
if self.check_words(self.disease_qwds, question) and ("Symptom" in types or "Complication" in types):
    intention = "query_disease"
    if intention not in intentions:
        intentions.append(intention)

# If no intent was detected but a disease is known, return the disease description
if not intentions and ('Disease' in types or 'Alias' in types):
    intention = "disease_describe"
    if intention not in intentions:
        intentions.append(intention)
# If a disease and a symptom both appear and the question contains
# disease-query feature words, the intent is to query the disease
if self.check_words(self.disease_qwds, question) and ('Disease' in types or 'Alias' in types) \
        and ("Symptom" in types or "Complication" in types):
    intention = "query_disease"
    if intention not in intentions:
        intentions.append(intention)
# If no entity or no intent was recognized, fall back to other methods
if not intentions or not types:
    intention = "QA_matching"
    if intention not in intentions:
        intentions.append(intention)

self.result["intentions"] = intentions

Reference: Link 1

Reference: Link 2