【大数据】Neo4j图数据库导入漏洞数据

318 阅读4分钟

image.png

持续创作,加速成长!这是我参与「掘金日新计划 · 10 月更文挑战」的第18天,点击查看活动详情


前言

昨天的文章中,使用python程序进行了数据库的连接以及基本操作,分别测试了通过python脚本进行数据库的连接、节点的创建、关系的创建、节点的删除、修改等,并且完成了一个小的工具类。而在今天的文章中,将基于之前采集的漏洞数据构建知识图谱。

数据处理

之前采集的数据格式如下:

{
  'content': 'a null pointer dereference issue was found in kvm when releasing a vcpu with dirty ring support enabled. this flaw allows an unprivileged local attacker on the host to issue specific ioctl calls, causing a kernel oops condition that results in a denial of service.',
  'company': [
    'amazon_2',
    'ubuntu_22.04'
  ],
  'product': [
    'kernel',
    'linux'
  ],
  'version': [
    '*',
    '*'
  ],
  'influence': [
    '5.15.43',
    '20.123.amzn2',
    '5.15.0',
    '35.36'
  ],
  'type': '系统',
  'cve_number': 'CVE-2022-1263',
  'title': '空标题',
  'href': 'https://avd.aliyun.com/detail?id=AVD-2022-1263'
}

其中需要构建的节点包括公司【company】、产品【product】、版本【version】、影响版本【influence】、CVE编号【cve_number】、漏洞类型【type】。

本着先构建节点再连线建关系的原则,首先要做的就是处理数据,将节点剥离出来。

查看之前的数据格式,发现产品、应用、与版本对应不上,因此重新改变了一下数据格式,如下:

{
  'content': '暂无',
  'data': [
    [
      '系统',
      'amazon_2',
      'kernel',
      '*',
      'Up',
      'to',
      '(excluding)',
      '5.15.50-23.125.amzn2'
    ]
  ],
  'solution': '建议您更新当前系统或软件至最新版,完成漏洞的修复。',
  'cve_number': 'CVE-2022-3449',
  'title': 'Browsing',
  'href': 'https://avd.aliyun.com/detail?id=AVD-2022-3449'
}

数据处理代码以及插入代码如下:

import codecs
from connectUtils import connect2Neo4J

def cleanData(tempList):
    """Turn raw spider rows into dictionaries with named fields.

    Each row is read positionally: index 0 is the vulnerability type,
    1 the company, 2 the product, 3 the version, and everything from
    index 4 onwards is the affected-version description.  That description
    is joined into one string, the markers "From", "(including)",
    "(excluding)", "Up to" and "-" are replaced by spaces, and the
    remaining words are re-joined with "-".

    Returns a list containing one dict per input row, in input order.
    """
    # Replaced one after another, in exactly this order.
    markers = ("From", "(including)", "(excluding)", "Up to", "-")
    cleaned = []
    for row in tempList:
        record = {
            "type": row[0],
            "company": row[1],
            "product": row[2],
            "version": row[3],
        }
        rangeText = ' '.join(row[4:])
        for marker in markers:
            rangeText = rangeText.replace(marker, " ")
        record["influence"] = '-'.join(
            part for part in rangeText.split() if part != ""
        )
        cleaned.append(record)
    return cleaned


def loadData() -> None:
    """Read ``aliyunSpider.txt`` line by line and write each record to Neo4j.

    Every line is evaluated into a dict, its ``data`` rows are normalised by
    ``cleanData`` and, for each row, company / product / CVE / version / type
    nodes plus the relationships between them are created through the
    module-level ``connect`` object.  Any exception raised while handling a
    line is printed and processing continues with the next line.
    """
    # The handle returned by codecs.open() is never closed explicitly here.
    lines = codecs.open("aliyunSpider.txt",'r','utf-8').readlines()
    for line in lines:
        try:
            # SECURITY: eval() executes whatever expression the line contains;
            # only safe if the file is fully trusted (ast.literal_eval would
            # restrict parsing to plain literals).
            json_line = eval(line.strip())
            print(json_line["solution"])
            print(cleanData(json_line["data"]))
            cve_number = (json_line["cve_number"])
            print(json_line["href"])

            # cleanData() is invoked a second time here; the call above is
            # only used for printing.
            for item in cleanData(json_line["data"]):
                # NOTE: `type` shadows the builtin for the rest of this function.
                type = item["type"]
                company = item["company"]
                product = item["product"]
                version = item["version"]
                influence = item["influence"]

                print(company)

                # NOTE(review): in py2neo the positional arguments of
                # nodes.match() appear to be labels, so `company` is probably
                # treated as a second label rather than a filter on the `name`
                # property -- confirm against the py2neo docs. The same applies
                # to the product and CVE lookups below.
                if (connect.conn.nodes.match("company",company).first()):
                    companyNode = connect.conn.nodes.match("company",company).first()
                    # Reuse the product node if one was found, otherwise create it.
                    if (connect.conn.nodes.match("product",product).first()):
                        productNode = connect.conn.nodes.match("product",product).first()
                        connect.creatRelationship(companyNode, productNode, "拥有")
                    else:
                        productNode = connect.creatNode("product", product)
                        connect.creatRelationship(companyNode, productNode, "拥有")


                    # Both branches below do the same work; they differ only in
                    # whether the CVE node is looked up or newly created.
                    # Version and type nodes are always created anew.
                    if (connect.conn.nodes.match("cvenumberNode",cve_number).first()):
                        cvenumberNode = connect.conn.nodes.match("cvenumberNode",cve_number).first()
                        connect.creatRelationship(productNode,cvenumberNode,"暴露漏洞")
                        versionNode = connect.creatNode("versionNode", influence)
                        typeNode = connect.creatNode("type", type)
                        connect.creatRelationship(cvenumberNode, typeNode, "漏洞类型")
                        connect.creatRelationship(cvenumberNode, versionNode, "影响版本")
                    else:
                        cvenumberNode = connect.creatNode("cvenumberNode", cve_number)
                        connect.creatRelationship(productNode,cvenumberNode,"暴露漏洞")
                        versionNode = connect.creatNode("versionNode", influence)
                        typeNode = connect.creatNode("type", type)
                        connect.creatRelationship(cvenumberNode, typeNode, "漏洞类型")
                        connect.creatRelationship(cvenumberNode, versionNode, "影响版本")
                else:
                    # No company node found: create the whole chain from scratch,
                    # without checking whether product / CVE nodes already exist.
                    companyNode = connect.creatNode("company",company)
                    productNode = connect.creatNode("product", product)
                    cvenumberNode = connect.creatNode("cvenumberNode", cve_number)
                    versionNode = connect.creatNode("versionNode", influence)
                    typeNode = connect.creatNode("type",type)
                    connect.creatRelationship(companyNode,productNode,"拥有")
                    connect.creatRelationship(productNode,cvenumberNode,"暴露漏洞")
                    connect.creatRelationship(cvenumberNode, typeNode, "漏洞类型")
                    connect.creatRelationship(cvenumberNode, versionNode, "影响版本")
                print(type)
                print(product)
                print(version)
                print(influence)
        except Exception as e:
            # Best effort: report the problem and carry on with the next line;
            # the remaining rows of the failing line are skipped.
            print(e)


if __name__ == '__main__':
    # `connect` must be bound at module level before loadData() runs:
    # loadData() reads it as a global instead of taking it as a parameter.
    connect = connect2Neo4J()
    loadData()

connectUtils.py neo4J数据库连接的公共类如下:

from py2neo import Graph, Node, Relationship, NodeMatcher, NodeMatch,RelationshipMatcher


class connect2Neo4J():
    """Small convenience wrapper around a py2neo ``Graph`` connection.

    The connection is opened in the constructor and exposed as ``self.conn``;
    the helper methods create nodes/relationships and run simple matches.
    """

    def __init__(self, userName="neo4j", password="", neo4jUrl='http://127.0.0.1:17474'):
        """Open the graph connection.

        All parameters default to the values that used to be hard-coded, so
        ``connect2Neo4J()`` keeps working unchanged.

        :param userName: Neo4j user name.
        :param password: Neo4j password.
        :param neo4jUrl: URL of the Neo4j server.
        """
        self.userName = userName
        self.password = password
        self.neo4jUrl = neo4jUrl
        self.conn = Graph(self.neo4jUrl, auth=(self.userName, self.password))

    def creatNode(self, labels, properties):
        """Create a node and return it.

        :param labels: label given to the new node.
        :param properties: value stored in the node's ``name`` property
            (despite the parameter name, it is a single value, not a mapping).
        :return: the ``Node`` object that was passed to ``conn.create``.
        """
        insertNode = Node(labels, name=properties)
        self.conn.create(insertNode)
        return insertNode

    def creatRelationship(self, nodeA, nodeB, properties):
        """Create a relationship from ``nodeA`` to ``nodeB``.

        :param nodeA: start node.
        :param nodeB: end node.
        :param properties: passed as the middle argument of ``Relationship``,
            i.e. it acts as the relationship type (callers pass strings such
            as "拥有").
        """
        relation = Relationship(nodeA, properties, nodeB)
        self.conn.create(relation)

    def searchNode(self, labels, limit=None):
        """Return a lazy match of nodes carrying ``labels``.

        :param labels: label to match.
        :param limit: optional maximum number of nodes; a falsy value
            (``None`` or ``0``) means "no limit".
        :return: a py2neo match object; iterate it to fetch the nodes.
        """
        if limit:
            data = NodeMatch(self.conn, labels=frozenset({'{}'.format(labels)})).limit(limit)
        else:
            node_matcher = NodeMatcher(self.conn)
            data = node_matcher.match(labels)
        return data


    def getRelations(self,nodes = None,r_type = None,limit = None):
        """Return a lazy match of relationships.

        :param nodes: optional nodes to match on, forwarded to
            ``RelationshipMatcher.match``.
        :param r_type: optional relationship type; ``None`` means any type.
        :param limit: optional maximum number of relationships.
        :return: a py2neo match object; iterate it to fetch the relationships.
        """
        relation = RelationshipMatcher(self.conn)
        # Only stringify a real type: formatting None would produce the
        # literal type name 'None' and therefore match nothing useful.
        if r_type is not None:
            r_type = '{}'.format(r_type)
        relationData = relation.match(nodes=nodes, r_type=r_type).limit(limit)
        return relationData

其中connect = connect2Neo4J()为初始化公共类,加载我们抓取的数据,使用eval()方法将数据转为dict格式,根据每一条样本,分别提取其中的公司【company】、产品【product】、版本【version】、影响版本【influence】、CVE编号【cve_number】、漏洞类型【type】等信息,并存储于neo4j数据库。大致思路为:首先判断公司节点是否存在于数据库内,如果存在,便加载该节点,生成公司与软件之间的关系、软件与CVE编号的关系、CVE编号与软件版本、漏洞类型的关系,最后以不同公司为核心构建关系网络。但是实际使用发现,如果节点已存在,并没有在原有节点基础上建立新的联系,而是创建了新的节点,如下图所示:

image.png

我们发现同一个公司有两个节点,这个不符合我们的预期

image.png

因此清空数据库重建:

match (n) detach delete n

image.png

程序中打断点发现:

connect.conn.nodes.match("company",company).first()

这个好像没有生效,都是None,因此改用NodeMatcher,代码如下:

# Filter on the `name` property explicitly via where(), instead of passing
# the company value as a second positional argument to match().
node_matcher = NodeMatcher(connect.conn)
data = node_matcher.match("company").where(name=company)
# Materialise the lazy match so the matched nodes can be inspected.
print(list(data))

修改后的代码如下:

import codecs

from py2neo import NodeMatcher

from connectUtils import connect2Neo4J

def cleanData(tempList):
    """Turn raw spider rows into dictionaries with named fields.

    Each row is read positionally: index 0 is the vulnerability type,
    1 the company, 2 the product, 3 the version, and everything from
    index 4 onwards is the affected-version description.  That description
    is joined into one string, the markers "From", "(including)",
    "(excluding)", "Up to" and "-" are replaced by spaces, and the
    remaining words are re-joined with "-".

    Returns a list containing one dict per input row, in input order.
    """
    # Replaced one after another, in exactly this order.
    markers = ("From", "(including)", "(excluding)", "Up to", "-")
    cleaned = []
    for row in tempList:
        record = {
            "type": row[0],
            "company": row[1],
            "product": row[2],
            "version": row[3],
        }
        rangeText = ' '.join(row[4:])
        for marker in markers:
            rangeText = rangeText.replace(marker, " ")
        record["influence"] = '-'.join(
            part for part in rangeText.split() if part != ""
        )
        cleaned.append(record)
    return cleaned


def loadData():
    """Import every record of ``aliyunSpider.txt`` into Neo4j.

    Each non-blank line is evaluated into a dict.  Its ``data`` rows are
    normalised with ``cleanData`` and, per row, the company / product / CVE /
    version / type nodes and the relationships between them are written
    through the module-level ``connect`` object.

    Existing company, product and CVE nodes are looked up by their ``name``
    property and reused when the company already exists.  Version and type
    nodes are always created, and when the company is new the product and CVE
    nodes are created without a lookup -- this is where duplicate nodes can
    still come from.

    Any exception raised while handling a line is printed and processing
    continues with the next line.
    """
    # Read everything up front so the file is closed before the DB work starts.
    with codecs.open("aliyunSpider.txt", 'r', 'utf-8') as spiderFile:
        lines = spiderFile.readlines()

    node_matcher = NodeMatcher(connect.conn)

    def findNode(label, name):
        # Single round trip: first() yields the matching node or None.
        return node_matcher.match(label).where(name=name).first()

    for line in lines:
        line = line.strip()
        if not line:
            # Nothing to parse; avoids a pointless error message per blank line.
            continue
        try:
            # SECURITY: eval() executes whatever expression the line contains.
            # Only acceptable because the file is produced by our own spider;
            # ast.literal_eval would restrict parsing to plain literals.
            json_line = eval(line)
            print(json_line["solution"])
            rows = cleanData(json_line["data"])
            print(rows)
            cve_number = json_line["cve_number"]
            print(json_line["href"])

            for item in rows:
                vulnType = item["type"]
                company = item["company"]
                product = item["product"]
                version = item["version"]
                # An empty range would give an unnamed version node.
                influence = item["influence"] or "未知"

                print(company)
                companyData = node_matcher.match("company").where(name=company)
                print(list(companyData))
                companyNode = companyData.first()

                if companyNode is not None:
                    # Known company: reuse product / CVE nodes where they exist.
                    productNode = findNode("product", product)
                    if productNode is None:
                        productNode = connect.creatNode("product", product)
                    connect.creatRelationship(companyNode, productNode, "拥有")

                    cvenumberNode = findNode("cvenumberNode", cve_number)
                    if cvenumberNode is None:
                        cvenumberNode = connect.creatNode("cvenumberNode", cve_number)
                    connect.creatRelationship(productNode, cvenumberNode, "暴露漏洞")
                    versionNode = connect.creatNode("versionNode", influence)
                    typeNode = connect.creatNode("type", vulnType)
                    connect.creatRelationship(cvenumberNode, typeNode, "漏洞类型")
                    connect.creatRelationship(cvenumberNode, versionNode, "影响版本")
                else:
                    # New company: build the whole chain from scratch.
                    companyNode = connect.creatNode("company", company)
                    productNode = connect.creatNode("product", product)
                    cvenumberNode = connect.creatNode("cvenumberNode", cve_number)
                    versionNode = connect.creatNode("versionNode", influence)
                    typeNode = connect.creatNode("type", vulnType)
                    connect.creatRelationship(companyNode, productNode, "拥有")
                    connect.creatRelationship(productNode, cvenumberNode, "暴露漏洞")
                    connect.creatRelationship(cvenumberNode, typeNode, "漏洞类型")
                    connect.creatRelationship(cvenumberNode, versionNode, "影响版本")
                print(vulnType)
                print(product)
                print(version)
                print(influence)
        except Exception as e:
            # Deliberately best effort: report and move on to the next line.
            print(e)


if __name__ == '__main__':
    # `connect` must be bound at module level before loadData() runs:
    # loadData() reads it as a global instead of taking it as a parameter.
    connect = connect2Neo4J()
    loadData()

主要就是修改了查询节点的代码,修改之后看效果还可以,但是还是有冗余节点,之后再进行修改

image.png

Thanks♪(・ω・)ノ