neo4j数据导入
neo4j的安装不再赘述,详情可以看这篇blog:juejin.cn/post/708044…
我们首先开启neo4j
neo4j.bat console
我们可以在浏览器中打开:127.0.0.1:7474 查看打开情况
build_medicalgraph.py
- 修复三个源代码中的问题:
- 连接方式的改变,作者的方式已经被弃用,使用现在的方式
Graph("http://localhost:7474", auth=("neo4j","waws"))
- self.graph 在代码中多次被写成self.g 导致找不到相对应属性,报错
- 在main中补充创建node和relationship的调用
- 整个代码的逻辑:就是从data/medical.json读取数据,并在neo4j中创建相应的节点、建立节点之间的关系
- 运行方式:python build_medicalgraph.py
- 导入比较耗时,大概需要半小时~一个小时,因电脑而异
- 有一个bug上面提到过:self.graph 写成了 self.g,修改一下即可
# 比较简单的流程化代码,就是读取数据然后将其传入到neo4j的语句中,然后使用py2neo的运行语句
import os
import json
from py2neo import Graph,Node
class MedicalGraph:
    """Load ``data/medical.json`` and build the medical knowledge graph in Neo4j.

    Every line of the data file is one JSON document describing a disease.
    ``read_nodes`` turns those documents into entity sets and relationship
    lists; the ``create_*`` methods write them to the graph through py2neo.
    """

    def __init__(self):
        """Resolve the data file path and connect to the local Neo4j server."""
        # os.path.dirname handles both '/' and '\\' separators; splitting the
        # absolute path on '/' yields an empty directory on Windows-style paths.
        cur_dir = os.path.dirname(os.path.abspath(__file__))
        self.data_path = os.path.join(cur_dir, 'data', 'medical.json')
        self.graph = Graph("http://localhost:7474", auth=("neo4j", "waws"))

    def read_nodes(self):
        """Parse the data file into entity sets and relationship lists.

        Returns:
            A 19-tuple, in this order: the sets of drugs, foods, checks,
            departments, producers, symptoms and diseases; the list of
            per-disease attribute dicts; then the relationship lists
            ``rels_check``, ``rels_recommandeat``, ``rels_noteat``,
            ``rels_doeat``, ``rels_department``, ``rels_commonddrug``,
            ``rels_drug_producer``, ``rels_recommanddrug``, ``rels_symptom``,
            ``rels_acompany`` and ``rels_category``. Every relationship is a
            two-element list ``[start_name, end_name]``.
        """
        # Entity names (de-duplicated into sets on return).
        drugs = []
        foods = []
        checks = []
        departments = []
        producers = []
        diseases = []
        symptoms = []
        disease_infos = []

        # Relationship pairs.
        rels_department = []     # small department -> big department
        rels_noteat = []         # disease -> food to avoid
        rels_doeat = []          # disease -> suitable food
        rels_recommandeat = []   # disease -> recommended recipe
        rels_commonddrug = []    # disease -> commonly used drug
        rels_recommanddrug = []  # disease -> recommended drug
        rels_check = []          # disease -> diagnostic check
        rels_drug_producer = []  # producer -> drug
        rels_symptom = []        # disease -> symptom
        rels_acompany = []       # disease -> accompanying disease
        rels_category = []       # disease -> department

        # Attributes copied verbatim from the record when present.
        scalar_keys = ('desc', 'prevent', 'cause', 'get_prob', 'easy_get',
                       'cure_way', 'cure_lasttime', 'cured_prob')
        # Food-related keys and the relationship list each one feeds.
        food_rels = (('not_eat', rels_noteat),
                     ('do_eat', rels_doeat),
                     ('recommand_eat', rels_recommandeat))

        count = 0
        # The context manager closes the file even if a record fails to parse.
        with open(self.data_path, "r", encoding="utf-8") as data_file:
            for line in data_file:
                if not line.strip():
                    # Blank lines (e.g. a trailing newline) are not records.
                    continue
                count += 1
                print(count)
                data_json = json.loads(line)
                disease = data_json['name']
                diseases.append(disease)

                disease_dict = {
                    'name': disease,
                    'desc': '',
                    'prevent': '',
                    'cause': '',
                    'easy_get': '',
                    'cure_department': '',
                    'cure_way': '',
                    'cure_lasttime': '',
                    'symptom': '',
                    'cured_prob': '',
                }
                for key in scalar_keys:
                    if key in data_json:
                        disease_dict[key] = data_json[key]

                if 'symptom' in data_json:
                    symptoms += data_json['symptom']
                    for symptom in data_json['symptom']:
                        rels_symptom.append([disease, symptom])

                for acompany in data_json.get('acompany', []):
                    rels_acompany.append([disease, acompany])

                if 'cure_department' in data_json:
                    cure_department = data_json['cure_department']
                    if len(cure_department) == 1:
                        rels_category.append([disease, cure_department[0]])
                    if len(cure_department) == 2:
                        big, small = cure_department
                        rels_department.append([small, big])
                        rels_category.append([disease, small])
                    disease_dict['cure_department'] = cure_department
                    departments += cure_department

                if 'common_drug' in data_json:
                    common_drug = data_json['common_drug']
                    for drug in common_drug:
                        rels_commonddrug.append([disease, drug])
                    drugs += common_drug

                if 'recommand_drug' in data_json:
                    recommand_drug = data_json['recommand_drug']
                    drugs += recommand_drug
                    for drug in recommand_drug:
                        rels_recommanddrug.append([disease, drug])

                # Each food key is handled on its own: a record that has
                # 'not_eat' but lacks 'do_eat' or 'recommand_eat' must not
                # raise KeyError, and vice versa must not be ignored.
                for key, rels in food_rels:
                    items = data_json.get(key, [])
                    for item in items:
                        rels.append([disease, item])
                    foods += items

                if 'check' in data_json:
                    check = data_json['check']
                    for _check in check:
                        rels_check.append([disease, _check])
                    checks += check

                # Entries look like "producer(drug)": the text before the first
                # '(' is the producer, the text after the last '(' is the drug.
                for detail in data_json.get('drug_detail', []):
                    parts = detail.split('(')
                    producer = parts[0]
                    producers.append(producer)
                    rels_drug_producer.append(
                        [producer, parts[-1].replace(')', '')])

                disease_infos.append(disease_dict)

        return (set(drugs), set(foods), set(checks), set(departments),
                set(producers), set(symptoms), set(diseases), disease_infos,
                rels_check, rels_recommandeat, rels_noteat, rels_doeat,
                rels_department, rels_commonddrug, rels_drug_producer,
                rels_recommanddrug, rels_symptom, rels_acompany, rels_category)

    def create_node(self, label, nodes):
        """Create one ``label`` node with a ``name`` property per entry of ``nodes``.

        Progress is printed as ``<created so far> <total>``.
        """
        count = 0
        for node_name in nodes:
            node = Node(label, name=node_name)
            self.graph.create(node)
            count += 1
            print(count, len(nodes))
        return

    def create_diseases_nodes(self, disease_infos):
        """Create the central ``Disease`` nodes with their descriptive properties.

        Only the keys passed to ``Node`` below are stored; other keys of the
        attribute dicts (``get_prob``, ``symptom``) are not written.
        """
        count = 0
        for disease_dict in disease_infos:
            node = Node(
                "Disease",
                name=disease_dict['name'],
                desc=disease_dict['desc'],
                prevent=disease_dict['prevent'],
                cause=disease_dict['cause'],
                easy_get=disease_dict['easy_get'],
                cure_lasttime=disease_dict['cure_lasttime'],
                cure_department=disease_dict['cure_department'],
                cure_way=disease_dict['cure_way'],
                cured_prob=disease_dict['cured_prob'],
            )
            self.graph.create(node)
            count += 1
            print(count)
        return

    def create_graphnodes(self):
        """Read the data file and create the nodes of every entity type."""
        (drugs, foods, checks, departments, producers, symptoms,
         _diseases, disease_infos, *_rels) = self.read_nodes()
        # Disease nodes come from the attribute dicts, not from the name set.
        self.create_diseases_nodes(disease_infos)
        self.create_node('Drug', drugs)
        print(len(drugs))
        self.create_node('Food', foods)
        print(len(foods))
        self.create_node('Check', checks)
        print(len(checks))
        self.create_node('Department', departments)
        print(len(departments))
        self.create_node('Producer', producers)
        print(len(producers))
        self.create_node('Symptom', symptoms)
        return

    def create_graphrels(self):
        """Read the data file and create the relationships of every type."""
        (_drugs, _foods, _checks, _departments, _producers, _symptoms,
         _diseases, _disease_infos,
         rels_check, rels_recommandeat, rels_noteat, rels_doeat,
         rels_department, rels_commonddrug, rels_drug_producer,
         rels_recommanddrug, rels_symptom, rels_acompany,
         rels_category) = self.read_nodes()
        self.create_relationship('Disease', 'Food', rels_recommandeat, 'recommand_eat', '推荐食谱')
        self.create_relationship('Disease', 'Food', rels_noteat, 'no_eat', '忌吃')
        self.create_relationship('Disease', 'Food', rels_doeat, 'do_eat', '宜吃')
        self.create_relationship('Department', 'Department', rels_department, 'belongs_to', '属于')
        self.create_relationship('Disease', 'Drug', rels_commonddrug, 'common_drug', '常用药品')
        self.create_relationship('Producer', 'Drug', rels_drug_producer, 'drugs_of', '生产药品')
        self.create_relationship('Disease', 'Drug', rels_recommanddrug, 'recommand_drug', '好评药品')
        self.create_relationship('Disease', 'Check', rels_check, 'need_check', '诊断检查')
        self.create_relationship('Disease', 'Symptom', rels_symptom, 'has_symptom', '症状')
        self.create_relationship('Disease', 'Disease', rels_acompany, 'acompany_with', '并发症')
        self.create_relationship('Disease', 'Department', rels_category, 'belongs_to', '所属科室')

    def create_relationship(self, start_node, end_node, edges, rel_type, rel_name):
        """Create a ``rel_type`` edge for every distinct ``[start, end]`` pair.

        Args:
            start_node: Label of the start nodes.
            end_node: Label of the end nodes.
            edges: Iterable of ``[start_name, end_name]`` pairs; duplicates
                are written only once.
            rel_type: Relationship type used in the Cypher pattern.
            rel_name: Value stored in the relationship's ``name`` property.

        A failing query is printed and skipped so that one bad edge does not
        abort the whole import.
        """
        # De-duplicate on the pair itself rather than on a joined string, so
        # names that happen to contain the separator are not mangled.
        unique_edges = {(edge[0], edge[1]) for edge in edges}
        total = len(unique_edges)

        # Labels and the relationship type cannot be Cypher parameters, so they
        # are interpolated (callers pass fixed literals). Entity names come from
        # the data file and are sent as parameters: a quote inside a name can no
        # longer break or alter the query.
        query = (
            "match(p:%s),(q:%s) where p.name=$p_name and q.name=$q_name "
            "create (p)-[rel:%s{name:$rel_name}]->(q)"
            % (start_node, end_node, rel_type)
        )

        count = 0
        for p, q in unique_edges:
            try:
                self.graph.run(query, p_name=p, q_name=q, rel_name=rel_name)
            except Exception as e:
                print(e)
            else:
                count += 1
                print(rel_type, count, total)
        return

    def export_data(self):
        """Write each entity set to a text file in the working directory.

        One name per line; the files are ``drug.txt``, ``food.txt``,
        ``check.txt``, ``department.txt``, ``producer.txt``, ``symptoms.txt``
        and ``disease.txt``.
        """
        (drugs, foods, checks, departments, producers, symptoms,
         diseases, *_unused) = self.read_nodes()
        exports = (
            ('drug.txt', drugs),
            ('food.txt', foods),
            ('check.txt', checks),
            ('department.txt', departments),
            ('producer.txt', producers),
            ('symptoms.txt', symptoms),
            ('disease.txt', diseases),
        )
        for filename, names in exports:
            # NOTE(review): the platform default encoding is kept on purpose so
            # existing readers of these files keep working; confirm whether
            # they should be switched to UTF-8 together.
            with open(filename, 'w+') as out_file:
                out_file.write('\n'.join(names))
        return
if __name__ == '__main__':
    # Script entry point: connect to Neo4j, then create the nodes followed by
    # the relationships between them.
    graph_builder = MedicalGraph()
    graph_builder.create_graphnodes()
    graph_builder.create_graphrels()
数据导入结果展示
json数据展示如下:
{ "_id" : { "$oid" : "5bb578b6831b973a137e3ee7" }, "name" : "百日咳", "desc" : "百日咳(pertussis,whoopingcough)是由百日咳杆菌所致的急性呼吸道传染病。其特征为阵发性痉挛性咳嗽,咳嗽末伴有特殊的鸡鸣样吸气吼声。病程较长,可达数周甚至3个月左右,故有百日咳之称。多见于5岁以下的小儿,幼婴患本病时易有窒息、肺炎,脑病等并发症,病死率高。百日咳患者,阴性感染者及带菌者为传染源。潜伏期末到病后2-3周传染性最强。百日咳经呼吸道飞沫传播。典型患者病程6-8周,临床病程可分3期:1.卡他期,从发病到开始出现咳嗽,一般1-2周。2,痉咳期,一般2-4周或更长,阵发性痉挛性咳嗽为本期特点。3,恢复期,一般1-2周,咳嗽发作的次数减少,程度减轻,不再出现阵发性痉咳。一般外周血白细胞计数明显增高,分类以淋巴细胞为主。在诊断本病时要注意与支气管异物及肺门淋巴结结核鉴别。近年来幼婴及成人发病有增多趋势。", "category" : [ "疾病百科", "儿科", "小儿内科" ], "prevent" : "1、控制传染源:在流行季节,若有前驱症状应及早抗生素治疗。\n2、切断传播途径:由于百日咳杆菌对外界抵抗力较弱,无需消毒处理,但应保持室内通风,衣物在阳光下曝晒,对痰液及口鼻分泌物则应进行消毒处理。", "cause" : "(一)发病原因\n病原菌是鲍特菌属(Bordetella)中的百日咳鲍特菌(B.pertussis),常称百日咳杆菌,已知鲍特菌属有四种杆菌,除百日咳鲍特菌外还有副百日咳鲍特菌(B.parapertussis),支气管败血鲍特菌(B.bronchiseptica)和鸟型鲍特菌(B.avium),鸟型鲍特菌一般不引起人类致病,仅引起鸟类感染,百日咳杆菌长约1.0~1.5μm,宽约0.3~0.5μm,有荚膜,不能运动,革兰染色阴性,需氧,无芽孢,无鞭毛,用甲苯胺蓝染色两端着色较深,细菌培养需要大量(15%~25%)鲜血才能繁殖良好,故常以鲍-金(Border-Gengous)培养基(即血液,甘油,马铃薯)分离菌落,百日咳杆菌生长缓慢,在35~37℃潮湿的环境中3~7天后,一种细小的,不透明的菌落生长,初次菌落隆起而光滑,为光滑(S)型,又称I相细菌,形态高低一致,有荚膜和较强的毒力及抗原性,致病力强,如将分离菌落在普通培养基中继续培养,菌落由光滑型变为粗糙(R)型,称Ⅳ相细菌,无荚膜,毒力及抗原性丢失,并失去致病力,Ⅱ相,Ⅲ相为中间过渡型,百日咳杆菌能产生许多毒性因子,已知有五种毒素:\n1、百日咳外毒素(PT):是存在百日咳杆菌细胞壁中一种蛋白质,过去称作为白细胞或淋巴细胞增多促进因子(leukocytosis or lymphocyte promoting factor,LPE),组胺致敏因子(histamin sensitizing factor,HSF),胰岛素分泌活性蛋白(insulin activating protein,IAP),百日咳外毒素由五种非共价链亚单位(S1~S5)所组成,亚单位(S2~S5)为无毒性单位,能与宿主细胞膜结合,通过具有酶活力的亚单位S1介导毒性作用,S1能通过腺苷二磷酸(ADP)-核糖转移酶的活力,催化部分ADP-核糖从烟酰胺腺嘌呤二核苷酸(NAD)中分离出来,转移至细胞膜抑制鸟苷三磷酸(CTP)结合即G蛋白合成,导致细胞变生,同时还能促使淋巴细胞增高,活化胰岛细胞及增强免疫应答。\n2、耐热的内毒素(endotoxin,ET),100℃60min只能部分破坏,180℃才能灭活,此毒素能引起机体发热及痉咳。\n3、不耐热毒素(HLT)这种毒素加热55℃30min后能破坏其毒性作用,此毒素抗体对百日咳杆菌感染无保护作用。\n4、气管细胞毒素(TCT):能损害宿主呼吸道纤毛上皮细胞,使之变性,坏死。\n5、腺苷环化酶毒素(ACT):存在百日咳杆菌细胞表面的一种酶,此酶进入吞噬细胞后被调钙蛋白所激活,催化cAMP的生成,干扰吞噬作用,并抑制中性粒细胞的趋化和吞噬细胞杀菌能力,使其能持续感染,ACT也是一种溶血素,能起溶血作用,百日咳的重要抗原是百日咳菌的两种血凝活性抗原,一种为丝状血凝素(filamentous 
hemagglutinin,FHA),因来自菌体表面菌毛故又称菌毛抗原,FHA在百日咳杆菌黏附于呼吸道上皮细胞的过程中起决定作用,为致病的主要原因。实验发现,FHA免疫小鼠能对抗百日咳杆菌致死性攻击,因此FHA为保护性抗原,另一种凝集原(aggluginogens,AGG)为百日咳杆菌外膜及菌毛中的一种蛋白质成分,主要含1,2,3三种血清型凝血因子,AGG-1具有种特异性;AGG-2,3具有型特异性,通过检测凝集原的型别来了解当地流行情况,目前认为这两种血凝素抗原相应抗体是保护性抗体,百日咳杆菌根据不耐热凝集原抗原性不同分为七型凝集原,1型凝集原为所有百日咳杆菌均具备,7型凝集原为鲍特菌属(包括副百日咳杆菌,支气管败血性杆菌)所共有,2~6型以不同的配合将百日咳杆菌分为不同血清型,测定血清型主要是研究流行时菌株的血清型和选择特殊血清型菌株生产菌苗,此外,副百日咳杆菌与百日咳杆菌无交叉免疫,亦可引起流行,百日咳杆菌对外界理化因素抵抗力弱,55℃经30min即被破坏,干燥数小时即可杀灭,对一般消毒剂敏感,对紫外线抵抗力弱,但在0~10℃存活较长。\n(二)发病机制\n1、发病机制:百日咳发病机制不甚清楚,很可能是百日咳毒素对机体综合作用的结果,当细菌随空气飞沫浸入易感者的呼吸道后,细菌的丝状血凝素黏附于咽喉至细支气管黏膜的纤毛上皮细胞表面;继之,细菌在局部繁殖并产生多种毒素如百日咳外毒素,腺苷环化酶等引起上皮细胞纤毛麻痹和细胞变性,使其蛋白合成降低,上皮细胞坏死脱落,以及全身反应,由于上皮细胞的病变发生和纤毛麻痹使小支气管中黏液及坏死上皮堆聚潴留,分泌物排出受阻,不断刺激呼吸道的周围神经,传入大脑皮质及延髓咳嗽中枢,反射性引起痉挛性咳嗽,由于长期刺激使咳嗽中枢形成兴奋灶,以致非特异性刺激,如进食,咽部检查,冷风,烟雾以及注射疼痛等,均可引起反射性的痉咳,恢复期间亦可因哭泣及其他感染,诱发百日咳样痉咳,近来研究表明百日咳发生机制与百日咳杆菌毒素类物质损害宿主细胞免疫功能有关,CD4+T细胞和Th1细胞分泌的细胞因子所介导的免疫反应,在百日咳杆菌感染中起重要作用。\n2、病理解剖:百日咳杆菌侵犯鼻咽,喉,气管,支气管黏膜,可见黏膜充血,上皮细胞的基底部有多核白细胞,单核细胞浸润及部分细胞坏死。支气管及肺泡周围间质除炎症浸润外,可见上皮细胞胞质空泡形成,甚至核膜破裂溶解,坏死,脱落,但极少波及肺泡。若分泌物阻塞可引起肺不张,支气管扩张,有继发感染者,易发生支气管肺炎,有时可有间质性肺炎;若发生百日咳脑病,镜检或肉眼可见脑组织充血水肿,点状出血,皮质萎缩,神经细胞变性,脑水肿等改变,此时常可见到肝脏脂肪浸润等变化。", "symptom" : [ "吸气时有蝉鸣音", "痉挛性咳嗽", "胸闷", "肺阴虚", "抽搐", "低热", "闫鹏辉", "惊厥" ], "yibao_status" : "否", "get_prob" : "0.5%", "easy_get" : "多见于小儿", "get_way" : "呼吸道传播", "acompany" : [ "肺不张" ], "cure_department" : [ "儿科", "小儿内科" ], "cure_way" : [ "药物治疗", "支持性治疗" ], "cure_lasttime" : "1-2个月", "cured_prob" : "98%", "common_drug" : [ "穿心莲内酯片", "百咳静糖浆" ], "cost_money" : "根据不同医院,收费标准不一致,市三甲医院约(1000-4000元)", "check" : [ "耳、鼻、咽拭子细菌培养", "周围血白细胞计数及分类检验", "血常规", "酶联免疫吸附试验", "白细胞分类计数" ], "do_eat" : [ "南瓜子仁", "圆白菜", "樱桃番茄", "小白菜" ], "not_eat" : [ "螃蟹", "海蟹", "海虾", "海螺" ], "recommand_eat" : [ "清蒸鸡蛋羹", "百合双耳鸡蛋羹", "排骨汤", "罗汉果雪耳鸡汤", "小黄瓜凉拌面", "黄瓜三丝汤", "黄瓜拌兔丝", "黄瓜拌皮丝" ], "recommand_drug" : [ "琥乙红霉素片", "琥乙红霉素颗粒", "百咳静糖浆", "穿心莲内酯片", "红霉素肠溶片", "环酯红霉素片" ], "drug_detail" : [ "惠普森穿心莲内酯片(穿心莲内酯片)", "北京同仁堂百咳静糖浆(百咳静糖浆)", "邦琪药业百咳静糖浆(百咳静糖浆)", "东新药业百咳静糖浆(百咳静糖浆)", "达发新(环酯红霉素片)", "康美药业红霉素肠溶片(红霉素肠溶片)", 
"旺龙药业琥乙红霉素颗粒(琥乙红霉素颗粒)", "白云山医药琥乙红霉素片(琥乙红霉素片)", "国瑞琥乙红霉素片(琥乙红霉素片)", "利君制药红霉素肠溶片(红霉素肠溶片)", "东信药业琥乙红霉素颗粒(琥乙红霉素颗粒)", "石药欧意红霉素肠溶片(红霉素肠溶片)", "平光制药红霉素肠溶片(红霉素肠溶片)", "北京曙光药业红霉素肠溶片(红霉素肠溶片)", "迪瑞制药琥乙红霉素颗粒(琥乙红霉素颗粒)", "永定制药百咳静糖浆(百咳静糖浆)", "东信药业琥乙红霉素片(琥乙红霉素片)", "利君制药琥乙红霉素片(琥乙红霉素片)", "北京中新制药琥乙红霉素片(琥乙红霉素片)", "华南药业红霉素肠溶片(红霉素肠溶片)", "佐今明百咳静糖浆(百咳静糖浆)", "恒益药业琥乙红霉素颗粒(琥乙红霉素颗粒)", "利君沙(琥乙红霉素颗粒)" ] }
其中我们创建了7类节点,分别是:药品、食物、检查、科室、药品大类、疾病、症状。下面以疾病和食物为例进行讲解:我们的疾病是"百日咳",因此会创建一个"百日咳"节点,cause就作为"百日咳"节点的一个属性,其他属性不一一介绍。再看do_eat和not_eat中的食物,一共建立八个节点,分别是:南瓜子仁、圆白菜、樱桃番茄、小白菜、螃蟹、海蟹、海虾、海螺。建立好上面的节点之后,我们用关系把疾病节点和食物节点关联起来。以小白菜为例:百日咳 ---do_eat---> 小白菜,其他关系的建立方式是一样的。
在整个构建过程中我们一共构建了:44112个节点和291165个关系。
- 实体类型:
- 关系类型:
- 属性类型
效果如下:
命令:
Match (n:Disease {name:"百日咳"}) return n
单独查询关系:
MATCH (a:Disease {name:"百日咳"})-[:do_eat]->(b) RETURN a,b
在最后的位置有个export_data,我们的dict中有数据就不用运行,若不存在数据则必须运行。