【NLP】漏洞类情报信息抽取-- 模型预测代码

344 阅读9分钟

image.png

持续创作,加速成长!这是我参与「掘金日新计划 · 10 月更文挑战」的第6天,点击查看活动详情

前言

在昨天的记录中,更新了训练部分的代码以及模型训练过程,经过训练获得了收敛的模型,今天的记录中将会记录模型的测试代码以及测试代码解读

模型测试

使用tensorflow训练模型后,会根据用户设定的步长存储checkpoint,例如每一个epoch后,或者每3000个step等,checkpoint即模型保存时刻的全部参数值的集合,使用模型的时候,首先构建模型graph,之后加载参数,并且启动session,当有新的预测数据送入模型的时候,需要经过同样的数据处理流程,方可被模型读取并预测结果,代码如下:

import codecs
import jieba
import tensorflow as tf
import numpy as np
import os, argparse, time, random
from model import BiLSTM_CRF
from utils import str2bool, get_logger
from data_helper import read_dictionary, random_embedding, read_files ,read_tag_id
from data_helper import pickle_reader
config = tf.ConfigProto()

# Let TensorFlow grow GPU memory on demand instead of claiming the whole
# device, so the GPU can be shared with other processes.
config.gpu_options.allow_growth = True

parser = argparse.ArgumentParser(description='BiLSTM-CRF for Chinese NER task')
parser.add_argument('--data_path', type=str,
                    default=r'E:\project\Vlun_NER_LSTM\NerData\train.txt',
                    help='train data source')
parser.add_argument('--word2id', type=str, default=r'E:\project\Vlun_NER_LSTM\NerData\word_dic.pkl',
                    help='word2id source')
parser.add_argument('--tag2id', type=str, default=r'E:\project\Vlun_NER_LSTM\NerData\label_dic.pkl',
                    help='tag2id source')
parser.add_argument('--save_path', type=str,
                    default=r'E:\project\Vlun_NER_LSTM\data_path_save',
                    help='root directory for checkpoints, summaries and results')

parser.add_argument('--batch_size', type=int, default=32, help='#sample of each minibatch')
parser.add_argument('--epoch', type=int, default=30, help='#epoch of training')
parser.add_argument('--hidden_dim', type=int, default=300, help='#dim of hidden state')
parser.add_argument('--optimizer', type=str, default='Adam',
                    help='Adam/Adadelta/Adagrad/RMSProp/Momentum/SGD')
parser.add_argument('--CRF', type=str2bool, default=True,
                    help='use CRF at the top layer. if False, use Softmax')
parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')
parser.add_argument('--dropout', type=float, default=0.5, help='dropout keep_prob')
parser.add_argument('--update_embedding', type=str2bool, default=True,
                    help='update embedding during training')
parser.add_argument('--pretrain_embedding', type=str, default='random',
                    help='use pretrained char embedding or init it randomly')
parser.add_argument('--embedding_dim', type=int, default=300,
                    help='random init char embedding_dim')
parser.add_argument('--shuffle', type=str2bool, default=True,
                    help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='test', help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='1662621164',
                    help='model for test and demo')
args = parser.parse_args()

# Token -> id and tag -> id dictionaries produced by the preprocessing step.
word2id = read_dictionary(args.word2id)
tag2label = read_tag_id(args.tag2id)


# The embedding matrix only fixes the variable's shape here; its values are
# overwritten when the checkpoint is restored.
if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')


# Output layout: <save_path>/<timestamp>/{summaries,checkpoints/,results}.
paths = {}
timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
output_path = os.path.join(args.save_path, timestamp)
summary_path = os.path.join(output_path, "summaries")
model_path = os.path.join(output_path, "checkpoints/")
result_path = os.path.join(output_path, "results")
# exist_ok avoids the check-then-create race of `if not exists: makedirs`.
for directory in (output_path, summary_path, model_path, result_path):
    os.makedirs(directory, exist_ok=True)
paths['summary_path'] = summary_path
ckpt_prefix = os.path.join(model_path, "model")
paths['model_path'] = ckpt_prefix
paths['result_path'] = result_path
log_path = os.path.join(result_path, "log.txt")
paths['log_path'] = log_path
get_logger(log_path).info(str(args))

# latest_checkpoint takes the checkpoint *directory* and returns None when
# nothing has been saved there; fail early with a readable message instead of
# letting saver.restore() fail later.
ckpt_file = tf.train.latest_checkpoint(model_path)
print(ckpt_file)
if ckpt_file is None:
    raise FileNotFoundError('No checkpoint found under {}'.format(model_path))
paths['model_path'] = ckpt_file
model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
model.build_graph()
saver = tf.train.Saver()

# Inverse of tag2label (label id -> tag), derived from the dictionary selected
# by --tag2id rather than from a second hard-coded path.
label2tag = {v: k for k, v in tag2label.items()}

with tf.Session(config=config) as sess:
    # Variables only hold values inside a session, so restore into this one.
    saver.restore(sess, ckpt_file)
    total = []
    print("请输入原始文本 ---  >>>>>>>>>>>>>>>>>>>>>>")
    sentence = input().lower()
    print(len(total))
    total.append([list(jieba.cut(sentence))])
    for sentence in total:
        demo_sent = sentence[0]
        # batch_yield reads data[0] (tokens) and data[1] (tags); the 'O' tags
        # are placeholders because no gold labels exist at prediction time.
        demo_data = [[demo_sent], [['O'] * len(demo_sent)], [len(demo_sent)]]
        tag = model.demo_one(sess, demo_data)
        # demo_one returns the integer 0 for label id 0; present it as "O".
        res = ["O" if t == 0 else t for t in tag]
        content = demo_sent
        resDic = {"company": [], "product": [], "version": [], "cve": []}
        for index, value in enumerate(res):
            if not str(value).startswith("B"):
                continue
            entity_type = value.split("_")[1]
            # Extend the span over the following tags of the same type. Stop
            # at "O", at a different type, or at the next "B" tag, which
            # starts its own entity.
            pointer = index + 1
            while pointer < len(res):
                following = res[pointer]
                if (following == "O"
                        or str(following).startswith("B")
                        or following.split("_")[1] != entity_type):
                    break
                pointer += 1
            # Appending after the scan also keeps entities that run to the
            # end of the sentence. setdefault tolerates entity types that are
            # not pre-declared in resDic.
            resDic.setdefault(entity_type, []).append(
                [' '.join(content[index:pointer]), (index, pointer)])
        print(tag)
        print(resDic)
        print(sentence[0])
        print("++++++++++++++++++++++")

预测代码解读

config.gpu_options.allow_growth = True

表示不会占满显存,目的在于多用户共用GPU资源的时候,避免单一代码占满GPU资源导致其他用户无资源可用。

其他参数解读:

  • data_path 表示训练数据路径
  • word2id 表示字符的索引值,为之前代码生成的字符-索引文件,即word_dic.pkl
  • tag2id 表示标签-索引文件
  • save_path 表示模型保存路径
  • batch_size 批次大小,训练的每一个step馈送入模型的数量条目
  • epoch 模型训练轮数,模型使用全部的训练数据全部训练完毕为一轮,一共训练多少轮
  • hidden_dim 隐层大小,LSTM模型使用的隐层单元数量,数量越多,模型参数量越大
  • optimizer 优化器选择,默认使用adam
  • CRF 表示是否使用CRF作为序列预测
  • lr 表示使用的学习率,一般使用0.001等浮点数作为学习率,用于梯度计算,反向传播更新参数
  • clip 梯度阈值 避免反向传播过程中的梯度爆炸 当梯度超过梯度阈值的时候直接进行剪裁
  • dropout 神经元保留概率(keep_prob,即tf.nn.dropout中每个神经元被保留的比例,而非失活比例),用于减少过拟合
  • update_embedding 是否更新embedding矩阵
  • pretrain_embedding 是否使用预训练词向量
  • embedding_dim embedding矩阵维度
  • shuffle 是否随机打乱数据集
  • mode 选择训练、测试还是预测
  • demo_model 测试模型的checkpoint

同样需要加载数据处理过程中的token词典和label词典,两个词典的作用分别是将对应的句子分词好的token映射为token index,用于embedding过程中将字符token index转为词向量用于模型计算,而标签词典则用于模型预测后将对应的标签索引转换为具体的标签值,如B_company等。embedding虽然会重新随机初始化一个并以参数的形式传入模型,但是当加载checkpoint后,embedding值会更新为训练后的浮点值,并不是使用随机初始化的值。

代码如下:

word2id = read_dictionary(args.word2id)
tag2label = read_tag_id(args.tag2id)


if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')

以下部分是明确了部分路径:

paths = {}
timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
output_path = os.path.join(args.save_path, timestamp)
if not os.path.exists(output_path): os.makedirs(output_path)
summary_path = os.path.join(output_path, "summaries")
paths['summary_path'] = summary_path
if not os.path.exists(summary_path): os.makedirs(summary_path)
model_path = os.path.join(output_path, "checkpoints/")
if not os.path.exists(model_path): os.makedirs(model_path)
ckpt_prefix = os.path.join(model_path, "model")
paths['model_path'] = ckpt_prefix
result_path = os.path.join(output_path, "results")
paths['result_path'] = result_path
if not os.path.exists(result_path): os.makedirs(result_path)
log_path = os.path.join(result_path, "log.txt")
paths['log_path'] = log_path
get_logger(log_path).info(str(args))

ckpt_file = tf.train.latest_checkpoint(model_path)
print(ckpt_file)
paths['model_path'] = ckpt_file

其中tf.train.latest_checkpoint()函数的作用是查找最新保存的checkpoint文件的文件名(Finds the filename of latest saved checkpoint file.)。

def latest_checkpoint(checkpoint_dir, latest_filename=None):
  """Finds the filename of latest saved checkpoint file.

  Args:
    checkpoint_dir: Directory where the variables were saved.
    latest_filename: Optional name for the protocol buffer file that
      contains the list of most recent checkpoint filenames.
      See the corresponding argument to `Saver.save()`.

  Returns:
    The full path to the latest checkpoint or `None` if no checkpoint was found.
  """
  # Pick the latest checkpoint based on checkpoint state.
  ckpt = get_checkpoint_state(checkpoint_dir, latest_filename)
  if ckpt and ckpt.model_checkpoint_path:
    # Look for either a V2 path or a V1 path, with priority for V2.
    v2_path = _prefix_to_checkpoint_path(ckpt.model_checkpoint_path,
                                         saver_pb2.SaverDef.V2)
    v1_path = _prefix_to_checkpoint_path(ckpt.model_checkpoint_path,
                                         saver_pb2.SaverDef.V1)
    # The recorded prefix is only returned if matching files exist on disk.
    if file_io.get_matching_files(v2_path) or file_io.get_matching_files(
        v1_path):
      return ckpt.model_checkpoint_path
    else:
      logging.error("Couldn't match files for checkpoint %s",
                    ckpt.model_checkpoint_path)
  # Reached when there is no checkpoint state, no recorded path, or the
  # recorded files are missing.
  return None

注意该函数包含两个参数,一个是checkpoint_dir,一个是latest_filename,注意传入的是checkpoint所在的文件夹路径,而不是具体的checkpoint文件名称。使用tensorflow保存模型后,会生成如下文件:

image.png

第一个文件model.ckpt.data-00000-of-00001与model.ckpt.index保存了所有变量的取值,model.ckpt.meta,保存了 Tensorflow 计算图的结构,即神经网络的结构。而checkpoint文件中保存了如下内容:

model_checkpoint_path: "E:\project\Vlun_NER_LSTM\data_path_save\1662621164\checkpoints/model-14236"
all_model_checkpoint_paths: "E:\project\Vlun_NER_LSTM\data_path_save\1662621164\checkpoints/model-13036"
all_model_checkpoint_paths: "E:\project\Vlun_NER_LSTM\data_path_save\1662621164\checkpoints/model-13336"
all_model_checkpoint_paths: "E:\project\Vlun_NER_LSTM\data_path_save\1662621164\checkpoints/model-13636"
all_model_checkpoint_paths: "E:\project\Vlun_NER_LSTM\data_path_save\1662621164\checkpoints/model-13936"
all_model_checkpoint_paths: "E:\project\Vlun_NER_LSTM\data_path_save\1662621164\checkpoints/model-14236"

使用tf.train.latest_checkpoint()时,返回的是checkpoint文件中model_checkpoint_path字段记录的路径,即最近一次保存的checkpoint(本例中与all_model_checkpoint_paths的最后一条相同),即:model-14236。

model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
model.build_graph()
saver = tf.train.Saver()

model = BiLSTM_CRF()与model.build_graph()重新构建模型的graph。 saver = tf.train.Saver()创建saver对象。

label2tag = {v:k for k,v in  pickle_reader(r"E:\project\Vlun_NER_LSTM\NerData\label_dic.pkl").items()}

构建一个label -> tag的map,用于将预测的label_index -> label.

with tf.Session(config=config) as sess:
    saver.restore(sess, ckpt_file)

在tensorflow中,变量是存在于Session环境中,因此只有在Session环境下才会有变量的值,因此恢复模型的时候需要传入session。

total = []
print("请输入原始文本 ---  >>>>>>>>>>>>>>>>>>>>>>")
sentence = input().lower()
print(len(total))
total.append([list(jieba.cut(sentence))])
for sentence in total:
    demo_sent = sentence[0]
    # The 'O' tags are placeholders: no gold labels exist at prediction time.
    demo_data = [[demo_sent], [['O'] * len(demo_sent)], [len(demo_sent)]]
    tag = model.demo_one(sess, demo_data)
    # demo_one returns the integer 0 for label id 0; present it as "O".
    res = ["O" if t == 0 else t for t in tag]
    content = demo_sent
    resDic = {"company": [], "product": [], "version": [], "cve": []}
    for index, value in enumerate(res):
        if not str(value).startswith("B"):
            continue
        entity_type = value.split("_")[1]
        # Extend the span over the following tags of the same type. Stop at
        # "O", at a different type, or at the next "B" tag.
        pointer = index + 1
        while pointer < len(res):
            following = res[pointer]
            if (following == "O"
                    or str(following).startswith("B")
                    or following.split("_")[1] != entity_type):
                break
            pointer += 1
        # Appending after the scan also keeps entities that run to the end
        # of the sentence.
        resDic.setdefault(entity_type, []).append(
            [' '.join(content[index:pointer]), (index, pointer)])
    print(tag)
    print(resDic)
    print(sentence[0])
    print("++++++++++++++++++++++")

后续代码则是对于输入的一句话,先通过jieba进行分词,再通过模型预测序列标签,并打印预测结果,至此模型的预测代码结束。

以上是漏洞类情报抽取的全部内容,附上全部代码:

data_helper.py

import pickle
import numpy as np
import codecs
import random

def pickle_writer(inputs, name):
    """Serialize ``inputs`` to the file ``name`` using pickle protocol 2.

    :param inputs: any picklable object.
    :param name: path of the file to create or overwrite.
    """
    # The context manager closes the file even if pickle.dump raises.
    with open(name, 'wb') as output:
        pickle.dump(inputs, output, protocol=2)
    print("Finish save {}".format(name))


def pickle_reader(inputs):
    """Load and return the object pickled in the file ``inputs``.

    :param inputs: path of a pickle file.
    :return: the unpickled object.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # at files produced by this project.
    # The context manager closes the file even if pickle.load raises.
    with open(inputs, 'rb') as f:
        lines = pickle.load(f)
    print("Finish load {}".format(inputs))
    return lines


def read_dictionary(path):
    """Load the pickled dictionary stored at ``path`` via ``pickle_reader``."""
    dictionary = pickle_reader(path)
    return dictionary


def read_tag_id(path):
    """Load the pickled tag dictionary stored at ``path`` via ``pickle_reader``."""
    tag_dictionary = pickle_reader(path)
    return tag_dictionary


def random_embedding(vocab, embedding_dim):
    """Create a randomly initialised float32 embedding matrix.

    The matrix has ``len(vocab)`` rows and ``embedding_dim`` columns, drawn
    uniformly between -0.25 and 0.25.
    """
    shape = (len(vocab), embedding_dim)
    return np.random.uniform(-0.25, 0.25, shape).astype(np.float32)


def read_files(data_path):
    """Read a tab-separated tagging file into sentences and label sequences.

    Each data line is ``token<TAB>tag``. Any line with fewer than two
    tab-separated fields (typically a blank line) ends the current sentence.

    :param data_path: path of the UTF-8 encoded data file.
    :return: ``(lines, labels, seq_length)`` where ``lines`` is a list of
        token lists, ``labels`` the parallel list of tag lists, and
        ``seq_length`` the length of the longest sentence (0 when the file
        contains no sentence).
    """
    lines, labels = [], []
    tokens, tags = [], []
    # The context manager closes the file handle the original left open.
    with codecs.open(data_path, 'r', 'UTF-8') as handle:
        for raw in handle:
            fields = raw.strip().split('\t')
            if len(fields) < 2:
                if tokens and tags:
                    lines.append(tokens)
                    labels.append(tags)
                tokens, tags = [], []
            else:
                tokens.append(fields[0])
                tags.append(fields[1])
    # Keep the final sentence when the file does not end with a blank line.
    if tokens and tags:
        lines.append(tokens)
        labels.append(tags)
    # default=0 avoids a ValueError on a file without any sentence.
    seq_length = max((len(line) for line in lines), default=0)
    return lines, labels, seq_length



def sentence2id(sent, word2id):
    """Map every token of ``sent`` to its id in ``word2id``.

    Tokens missing from ``word2id`` are looked up under the key ``'[UNK]'``.
    """
    return [word2id[token if token in word2id else '[UNK]'] for token in sent]


def batch_yield(data, batch_size, vocab, tag2label, shuffle=False):
    """Yield mini-batches of id-encoded sentences and label sequences.

    :param data: sequence whose first item is the list of token lists and
        whose second item is the parallel list of tag lists; further items
        are ignored.
    :param batch_size: maximum number of sentences per batch.
    :param vocab: token -> id mapping passed to ``sentence2id``.
    :param tag2label: tag -> label id mapping.
    :param shuffle: when true, shuffle the sentence order before batching.
    :return: generator of ``(seqs, labels)`` pairs; the last batch may hold
        fewer than ``batch_size`` sentences.
    """
    # Keep the pairs in a plain list. Converting ragged sentences with
    # np.array fails on recent NumPy, and random.shuffle on a multi-dimensional
    # ndarray swaps row *views*, which duplicates samples.
    pairs = list(zip(data[0], data[1]))
    if shuffle:
        random.shuffle(pairs)
    seqs, labels = [], []
    for sent_, tag_ in pairs:
        sent_ = sentence2id(sent_, vocab)
        label_ = [tag2label[tag] for tag in tag_]
        if len(seqs) == batch_size:
            yield seqs, labels
            seqs, labels = [], []
        seqs.append(sent_)
        labels.append(label_)
    if seqs:
        yield seqs, labels



def pad_sequences(sequences, pad_mark=0):
    """Right-pad every sequence with ``pad_mark`` to the longest length.

    :return: ``(seq_list, seq_len_list)``: the padded sequences as lists and
        the original length of each sequence.
    """
    max_len = max(len(seq) for seq in sequences)
    seq_list, seq_len_list = [], []
    for seq in sequences:
        items = list(seq)
        filler = [pad_mark] * (max_len - len(items))
        seq_list.append(items + filler)
        seq_len_list.append(len(items))
    return seq_list, seq_len_list

model.py

import numpy as np
import time
import tensorflow as tf
from tensorflow.contrib.rnn import LSTMCell
from tensorflow.contrib.crf import crf_log_likelihood
from tensorflow.contrib.crf import viterbi_decode
from data_helper import batch_yield ,pad_sequences
from utils import get_logger


class BiLSTM_CRF(object):
    """BiLSTM sequence tagger with an optional CRF layer (TensorFlow 1.x graph).

    Call ``build_graph()`` once after construction, then use ``train()``,
    ``test()`` or ``demo_one()``.
    """

    def __init__(self, args, embeddings, tag2label, vocab, paths, config):
        """Copy hyper-parameters, vocabularies and output paths onto the instance.

        :param args: parsed command-line namespace; batch_size, epoch,
            hidden_dim, CRF, update_embedding, dropout, optimizer, lr, clip
            and shuffle are read.
        :param embeddings: initial value of the embedding matrix.
        :param tag2label: mapping from tag string to integer label id.
        :param vocab: mapping from token to integer id.
        :param paths: dict with the keys 'model_path', 'summary_path',
            'log_path' and 'result_path'.
        :param config: session configuration passed to ``tf.Session``.
        """
        self.batch_size = args.batch_size
        self.epoch_num = args.epoch
        self.hidden_dim = args.hidden_dim
        self.embeddings = embeddings
        self.CRF = args.CRF
        self.update_embedding = args.update_embedding
        self.dropout_keep_prob = args.dropout
        self.optimizer = args.optimizer
        self.lr = args.lr
        self.clip_grad = args.clip
        self.tag2label = tag2label
        self.num_tags = len(tag2label)
        self.vocab = vocab
        self.shuffle = args.shuffle
        self.model_path = paths['model_path']
        self.summary_path = paths['summary_path']
        self.logger = get_logger(paths['log_path'])
        self.result_path = paths['result_path']
        self.config = config

    def build_graph(self):
        """Build the whole computation graph, in dependency order."""
        self.add_placeholders()  # input placeholders
        self.lookup_layer_op()   # token id -> embedding lookup
        self.biLSTM_layer_op()   # BiLSTM encoder + projection to tag logits
        self.softmax_pred_op()   # argmax prediction, only when CRF is disabled
        self.loss_op()           # CRF or softmax loss
        self.trainstep_op()      # optimizer and clipped training step
        self.init_op()           # variable initializer

    def add_placeholders(self):
        """Create the placeholders fed by ``get_feed_dict``."""
        self.word_ids = tf.placeholder(tf.int32, shape=[None, None], name="word_ids")
        self.labels = tf.placeholder(tf.int32, shape=[None, None], name="labels")
        self.sequence_lengths = tf.placeholder(tf.int32, shape=[None], name="sequence_lengths")

        self.dropout_pl = tf.placeholder(dtype=tf.float32, shape=[], name="dropout")
        self.lr_pl = tf.placeholder(dtype=tf.float32, shape=[], name="lr")

    def lookup_layer_op(self):
        """Look up embeddings for ``word_ids`` and apply dropout."""
        with tf.variable_scope("words"):
            _word_embeddings = tf.Variable(self.embeddings,
                                           dtype=tf.float32,
                                           trainable=self.update_embedding,
                                           name="_word_embeddings")
            word_embeddings = tf.nn.embedding_lookup(params=_word_embeddings,
                                                     ids=self.word_ids,
                                                     name="word_embeddings")
        self.word_embeddings = tf.nn.dropout(word_embeddings, self.dropout_pl)

    def biLSTM_layer_op(self):
        """Encode the sentence with a BiLSTM and project to per-tag logits."""
        with tf.variable_scope("bi-lstm"):
            cell_fw = LSTMCell(self.hidden_dim)
            cell_bw = LSTMCell(self.hidden_dim)
            # Only the per-step outputs are used; the final states are dropped.
            (output_fw_seq, output_bw_seq), _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=cell_fw,
                cell_bw=cell_bw,
                inputs=self.word_embeddings,
                sequence_length=self.sequence_lengths,
                dtype=tf.float32)
            # Concatenate forward and backward outputs on the feature axis.
            output = tf.concat([output_fw_seq, output_bw_seq], axis=-1)
            output = tf.nn.dropout(output, self.dropout_pl)
        # Fully connected projection from 2 * hidden_dim to num_tags.
        with tf.variable_scope("proj"):
            W = tf.get_variable(name="W",
                                shape=[2 * self.hidden_dim, self.num_tags],
                                initializer=tf.contrib.layers.xavier_initializer(),
                                dtype=tf.float32)

            b = tf.get_variable(name="b",
                                shape=[self.num_tags],
                                initializer=tf.zeros_initializer(),
                                dtype=tf.float32)

            s = tf.shape(output)
            # Flatten to 2-D for the matmul, then restore the time axis.
            output = tf.reshape(output, [-1, 2 * self.hidden_dim])
            pred = tf.matmul(output, W) + b

            self.logits = tf.reshape(pred, [-1, s[1], self.num_tags])

    def loss_op(self):
        """Define ``self.loss``: CRF negative log-likelihood or masked softmax loss."""
        if self.CRF:
            log_likelihood, self.transition_params = crf_log_likelihood(inputs=self.logits,
                                                                        tag_indices=self.labels,
                                                                        sequence_lengths=self.sequence_lengths)
            self.loss = -tf.reduce_mean(log_likelihood)

        else:
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits,
                                                                    labels=self.labels)
            # Exclude padded positions from the mean.
            mask = tf.sequence_mask(self.sequence_lengths)
            losses = tf.boolean_mask(losses, mask)
            self.loss = tf.reduce_mean(losses)

        tf.summary.scalar("loss", self.loss)

    def softmax_pred_op(self):
        """Define the argmax prediction op used when the CRF layer is disabled."""
        if not self.CRF:
            self.labels_softmax_ = tf.argmax(self.logits, axis=-1)
            self.labels_softmax_ = tf.cast(self.labels_softmax_, tf.int32)

    def trainstep_op(self):
        """Create the optimizer and the value-clipped training op."""
        with tf.variable_scope("train_step"):
            self.global_step = tf.Variable(0, name="global_step", trainable=False)
            if self.optimizer == 'Adam':
                optim = tf.train.AdamOptimizer(learning_rate=self.lr_pl)
            elif self.optimizer == 'Adadelta':
                optim = tf.train.AdadeltaOptimizer(learning_rate=self.lr_pl)
            elif self.optimizer == 'Adagrad':
                optim = tf.train.AdagradOptimizer(learning_rate=self.lr_pl)
            elif self.optimizer == 'RMSProp':
                optim = tf.train.RMSPropOptimizer(learning_rate=self.lr_pl)
            elif self.optimizer == 'Momentum':
                optim = tf.train.MomentumOptimizer(learning_rate=self.lr_pl, momentum=0.9)
            else:
                # 'SGD' and any unrecognised name use plain gradient descent.
                optim = tf.train.GradientDescentOptimizer(learning_rate=self.lr_pl)

            grads_and_vars = optim.compute_gradients(self.loss)
            # compute_gradients may pair a variable with a None gradient;
            # clip_by_value cannot take None, so those pairs are skipped.
            grads_and_vars_clip = [
                [tf.clip_by_value(g, -self.clip_grad, self.clip_grad), v]
                for g, v in grads_and_vars
                if g is not None
            ]
            self.train_op = optim.apply_gradients(grads_and_vars_clip, global_step=self.global_step)

    def init_op(self):
        """Create the global variable initializer.

        The assignment rebinds ``self.init_op`` from this method to the
        initializer op, so this method can only be called once per instance.
        """
        self.init_op = tf.global_variables_initializer()

    def add_summary(self, sess):
        """Merge all summaries and open a summary writer on ``summary_path``.

        :param sess: active session whose graph is written with the summaries.
        """
        self.merged = tf.summary.merge_all()
        self.file_writer = tf.summary.FileWriter(self.summary_path, sess.graph)

    def train(self, train_data, dev_data, train_label, dev_label):
        """Train for ``epoch_num`` epochs, evaluating on the dev split.

        :param train_data: list of token lists used for training.
        :param dev_data: list of token lists used for validation.
        :param train_label: tag lists parallel to ``train_data``.
        :param dev_label: tag lists parallel to ``dev_data``.
        """
        saver = tf.train.Saver(tf.global_variables())
        with tf.Session(config=self.config) as sess:
            sess.run(self.init_op)
            self.add_summary(sess)
            for epoch in range(self.epoch_num):
                self.run_one_epoch(sess, [train_data, train_label], [dev_data, dev_label], epoch, saver)

    def test(self, test):
        """Restore the checkpoint at ``model_path`` and evaluate on ``test``.

        :param test: sequence whose first item is the list of token lists and
            whose second item is the parallel list of tag lists.
        """
        saver = tf.train.Saver()
        with tf.Session(config=self.config) as sess:
            self.logger.info('=========== testing ===========')
            saver.restore(sess, self.model_path)
            label_list, seq_len_list = self.dev_one_epoch(sess, test)
            self.evaluate(label_list, seq_len_list, test)

    def demo_one(self, sess, sent):
        """Predict the tags of the first sentence in ``sent``.

        :param sess: session holding the restored variables.
        :param sent: data in the layout expected by ``batch_yield``.
        :return: one entry per token: the tag string, except that label id 0
            is returned as the integer 0.
        """
        label_list = []
        for seqs, labels in batch_yield(sent, 1, self.vocab, self.tag2label, shuffle=False):
            label_list_, _ = self.predict_one_batch(sess, seqs)
            label_list.extend(label_list_)
        label2tag = {}
        for tag, label in self.tag2label.items():
            # Label id 0 is deliberately kept as 0; callers map it themselves.
            label2tag[label] = tag if label != 0 else label
        tag = [label2tag[label] for label in label_list[0]]
        return tag

    def run_one_epoch(self, sess, train, dev, epoch, saver):
        """Run one training epoch with periodic checkpointing and evaluation.

        :param sess: active training session.
        :param train: ``[sentences, labels]`` training data.
        :param dev: ``[sentences, labels]`` validation data.
        :param epoch: zero-based epoch index.
        :param saver: ``tf.train.Saver`` used to write checkpoints.
        """
        # Count sentences directly; np.array(train) fails on ragged sentence
        # lists with recent NumPy and was only used to read this number.
        train_length = len(train[0])
        num_batches = (train_length + self.batch_size - 1) // self.batch_size
        print('num_batches :{}'.format(num_batches))
        start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        batches = batch_yield(train, self.batch_size, self.vocab, self.tag2label, shuffle=self.shuffle)
        for step, (seqs, labels) in enumerate(batches):
            step_num = epoch * num_batches + step + 1
            feed_dict, _ = self.get_feed_dict(seqs, labels, self.lr, self.dropout_keep_prob)
            _, loss_train, summary, _ = sess.run(
                [self.train_op, self.loss, self.merged, self.global_step],
                feed_dict=feed_dict)
            # Log, checkpoint and validate on the first step, every 300 steps
            # after it, and on the last step of the epoch.
            if step + 1 == 1 or (step + 1) % 300 == 1 or step + 1 == num_batches:
                print('{} epoch {}, step {}, loss: {:.4}, global_step: {}'.format(start_time, epoch + 1, step + 1, loss_train, step_num))
                self.file_writer.add_summary(summary, step_num)
                saver.save(sess, self.model_path, global_step=step_num)
                print('===========validation / test===========')
                label_list_dev, seq_len_list_dev = self.dev_one_epoch(sess, dev)
                self.evaluate(label_list_dev, seq_len_list_dev, dev, epoch)

    def get_feed_dict(self, seqs, labels=None, lr=None, dropout=None):
        """Build the feed dict for one batch.

        :param seqs: list of id sequences; padded with 0 to a common length.
        :param labels: optional list of label id sequences, padded with 0.
        :param lr: optional learning rate.
        :param dropout: optional dropout keep probability.
        :return: ``(feed_dict, seq_len_list)`` with the unpadded lengths.
        """
        word_ids, seq_len_list = pad_sequences(seqs, pad_mark=0)

        feed_dict = {self.word_ids: word_ids,
                     self.sequence_lengths: seq_len_list}
        if labels is not None:
            labels_, _ = pad_sequences(labels, pad_mark=0)
            feed_dict[self.labels] = labels_
        if lr is not None:
            feed_dict[self.lr_pl] = lr
        if dropout is not None:
            feed_dict[self.dropout_pl] = dropout

        return feed_dict, seq_len_list

    def dev_one_epoch(self, sess, dev):
        """Predict labels for every batch of ``dev``.

        :param sess: active session.
        :param dev: ``[sentences, labels]`` data.
        :return: ``(label_list, seq_len_list)`` over all sentences.
        """
        label_list, seq_len_list = [], []
        for seqs, labels in batch_yield(dev, self.batch_size, self.vocab, self.tag2label, shuffle=False):
            label_list_, seq_len_list_ = self.predict_one_batch(sess, seqs)
            label_list.extend(label_list_)
            seq_len_list.extend(seq_len_list_)
        return label_list, seq_len_list

    def predict_one_batch(self, sess, seqs):
        """Predict label ids for one batch.

        :param sess: active session.
        :param seqs: list of id sequences.
        :return: ``(label_list, seq_len_list)``. With CRF the labels are
            Viterbi-decoded and trimmed to each sequence length; without CRF
            they are the argmax over the padded batch.
        """
        # Keep probability 1.0 disables dropout at prediction time.
        feed_dict, seq_len_list = self.get_feed_dict(seqs, dropout=1.0)

        if self.CRF:
            logits, transition_params = sess.run([self.logits, self.transition_params],
                                                 feed_dict=feed_dict)
            label_list = []
            for logit, seq_len in zip(logits, seq_len_list):
                viterbi_seq, _ = viterbi_decode(logit[:seq_len], transition_params)
                label_list.append(viterbi_seq)
            return label_list, seq_len_list

        else:
            label_list = sess.run(self.labels_softmax_, feed_dict=feed_dict)
            return label_list, seq_len_list

    def evaluate(self, label_list, seq_len_list, data, epoch=None):
        """Print and return the token-level tagging accuracy.

        :param label_list: predicted label id sequences, one per sentence.
        :param seq_len_list: true length of each sentence, parallel to
            ``label_list``.
        :param data: sequence whose second item holds the gold tag lists.
        :param epoch: unused; accepted for call-site compatibility.
        :return: fraction of correctly tagged tokens, 0.0 when there are none.
        """
        label2tag = {label: tag for tag, label in self.tag2label.items()}
        gold_sequences = data[1]
        total = 0
        correct = 0
        for index, predicted_ids in enumerate(label_list):
            # Softmax predictions cover the padded batch length; trim them to
            # the real sentence length before comparing.
            seq_len = seq_len_list[index]
            predict_result = [label2tag[label_] for label_ in predicted_ids[:seq_len]]
            ground_truth = gold_sequences[index]
            assert len(predict_result) == len(ground_truth)
            total += len(predict_result)
            correct += sum(1 for gold, pred in zip(ground_truth, predict_result) if gold == pred)
        # Guard against an empty evaluation set instead of dividing by zero.
        accuracy = correct / total if total else 0.0
        print('Evaluate accuracy is :{}'.format(accuracy))
        return accuracy

main.py

import tensorflow as tf
import numpy as np
import os, argparse, time, random
from model import BiLSTM_CRF
from utils import str2bool, get_logger
from data_helper import read_dictionary, random_embedding, read_files ,read_tag_id
config = tf.ConfigProto()

# Let TensorFlow grow GPU memory on demand instead of claiming the whole
# device, so the GPU can be shared with other processes.
config.gpu_options.allow_growth = True

parser = argparse.ArgumentParser(description='BiLSTM-CRF for Chinese NER task')
parser.add_argument('--data_path', type=str,
                    default=r'E:\project\Vlun_NER_LSTM\NerData\train.txt',
                    help='train data source')
parser.add_argument('--word2id', type=str, default=r'E:\project\Vlun_NER_LSTM\NerData\word_dic.pkl',
                    help='word2id source')
parser.add_argument('--tag2id', type=str, default=r'E:\project\Vlun_NER_LSTM\NerData\label_dic.pkl',
                    help='tag2id source')
parser.add_argument('--save_path', type=str,
                    default=r'E:\project\Vlun_NER_LSTM\data_path_save',
                    help='root directory for checkpoints, summaries and results')

parser.add_argument('--batch_size', type=int, default=32, help='#sample of each minibatch')
parser.add_argument('--epoch', type=int, default=30, help='#epoch of training')
parser.add_argument('--hidden_dim', type=int, default=300, help='#dim of hidden state')
parser.add_argument('--optimizer', type=str, default='Adam',
                    help='Adam/Adadelta/Adagrad/RMSProp/Momentum/SGD')
parser.add_argument('--CRF', type=str2bool, default=True,
                    help='use CRF at the top layer. if False, use Softmax')
parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')
parser.add_argument('--dropout', type=float, default=0.5, help='dropout keep_prob')
parser.add_argument('--update_embedding', type=str2bool, default=True,
                    help='update embedding during training')
parser.add_argument('--pretrain_embedding', type=str, default='random',
                    help='use pretrained char embedding or init it randomly')
parser.add_argument('--embedding_dim', type=int, default=300,
                    help='random init char embedding_dim')
parser.add_argument('--shuffle', type=str2bool, default=True,
                    help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='test', help='train/test/demo')
parser.add_argument('--demo_model', type=str, default='1662621164',
                    help='model for test and demo')
args = parser.parse_args()

# Token -> id and tag -> id dictionaries produced by the preprocessing step.
word2id = read_dictionary(args.word2id)
tag2label = read_tag_id(args.tag2id)


if args.pretrain_embedding == 'random':
    embeddings = random_embedding(word2id, args.embedding_dim)
else:
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')


# -----------------read data--------------------
lines, label, seq_length = read_files(args.data_path)
assert len(lines) == len(label)
# First 90% of the sentences for training, the rest for validation.
index = int(len(lines) * 0.9)
train_data, dev_data = lines[:index], lines[index:]
train_label, dev_label = label[:index], label[index:]


# Output layout: <save_path>/<timestamp>/{summaries,checkpoints/,results}.
paths = {}
timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
output_path = os.path.join(args.save_path, timestamp)
summary_path = os.path.join(output_path, "summaries")
model_path = os.path.join(output_path, "checkpoints/")
result_path = os.path.join(output_path, "results")
# exist_ok avoids the check-then-create race of `if not exists: makedirs`.
for directory in (output_path, summary_path, model_path, result_path):
    os.makedirs(directory, exist_ok=True)
paths['summary_path'] = summary_path
ckpt_prefix = os.path.join(model_path, "model")
paths['model_path'] = ckpt_prefix
paths['result_path'] = result_path
log_path = os.path.join(result_path, "log.txt")
paths['log_path'] = log_path
get_logger(log_path).info(str(args))

if args.mode == 'train':
    model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
    model.build_graph()
    print("train data: {}".format(len(train_data)))
    model.train(train_data, dev_data, train_label, dev_label)

## testing model
elif args.mode == 'test':
    test_data = read_files(r"E:\project\Vlun_NER_LSTM\NerData\test.txt")
    # latest_checkpoint returns None when the directory holds no checkpoint;
    # fail early with a readable message instead of a restore error.
    ckpt_file = tf.train.latest_checkpoint(model_path)
    print(ckpt_file)
    if ckpt_file is None:
        raise FileNotFoundError('No checkpoint found under {}'.format(model_path))
    paths['model_path'] = ckpt_file
    model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
    model.build_graph()
    # Report the number of test sentences, mirroring the train branch,
    # instead of dumping the whole dataset to stdout.
    print("test data: {}".format(len(test_data[0])))
    model.test(test_data)

utils.py

import logging, sys, argparse


def str2bool(v):
    """Convert a textual flag value to ``bool`` for use as an argparse type.

    Matching is case-insensitive. Raises ``argparse.ArgumentTypeError`` for
    any value outside the accepted spellings.
    """
    lowered = v.lower()
    if lowered in ('yes', 'true', 't', 'y', '1'):
        return True
    if lowered in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')


def get_entity(tag_seq, char_seq):
    """Return the ``(PER, LOC, ORG)`` entity lists found in a tagged sequence."""
    return (
        get_PER_entity(tag_seq, char_seq),
        get_LOC_entity(tag_seq, char_seq),
        get_ORG_entity(tag_seq, char_seq),
    )


def get_PER_entity(tag_seq, char_seq):
    length = len(char_seq)
    PER = []
    for i, (char, tag) in enumerate(zip(char_seq, tag_seq)):
        if tag == 'B-PER':
            if 'per' in locals().keys():
                PER.append(per)
                del per
            per = char
            if i+1 == length:
                PER.append(per)
        if tag == 'I-PER':
            per += char
            if i+1 == length:
                PER.append(per)
        if tag not in ['I-PER', 'B-PER']:
            if 'per' in locals().keys():
                PER.append(per)
                del per
            continue
    return PER


def get_LOC_entity(tag_seq, char_seq):
    length = len(char_seq)
    LOC = []
    for i, (char, tag) in enumerate(zip(char_seq, tag_seq)):
        if tag == 'B-LOC':
            if 'loc' in locals().keys():
                LOC.append(loc)
                del loc
            loc = char
            if i+1 == length:
                LOC.append(loc)
        if tag == 'I-LOC':
            loc += char
            if i+1 == length:
                LOC.append(loc)
        if tag not in ['I-LOC', 'B-LOC']:
            if 'loc' in locals().keys():
                LOC.append(loc)
                del loc
            continue
    return LOC


def get_ORG_entity(tag_seq, char_seq):
    length = len(char_seq)
    ORG = []
    for i, (char, tag) in enumerate(zip(char_seq, tag_seq)):
        if tag == 'B-ORG':
            if 'org' in locals().keys():
                ORG.append(org)
                del org
            org = char
            if i+1 == length:
                ORG.append(org)
        if tag == 'I-ORG':
            org += char
            if i+1 == length:
                ORG.append(org)
        if tag not in ['I-ORG', 'B-ORG']:
            if 'org' in locals().keys():
                ORG.append(org)
                del org
            continue
    return ORG


def get_logger(filename):
    """Return the shared ``'logger'`` logger, also writing records to ``filename``.

    The named logger is set to DEBUG. ``logging.basicConfig`` is invoked with a
    message-only format, and a DEBUG-level ``FileHandler`` with a
    timestamp/level prefix is attached to the root logger. Calling this
    function again with the same file does not attach a second handler, so
    each record is written to the file once.

    Args:
        filename: Path of the log file.

    Returns:
        The ``logging.Logger`` named ``'logger'``.
    """
    import os  # local import: only ``logging, sys, argparse`` are imported at module level

    logger = logging.getLogger('logger')
    logger.setLevel(logging.DEBUG)
    logging.basicConfig(format='%(message)s', level=logging.DEBUG)

    root = logging.getLogger()
    # FileHandler stores the absolute path in ``baseFilename``; compare against
    # it to detect a handler already attached for this file.
    target = os.path.abspath(filename)
    already_attached = any(
        isinstance(existing, logging.FileHandler) and existing.baseFilename == target
        for existing in root.handlers
    )
    if not already_attached:
        handler = logging.FileHandler(filename)
        handler.setLevel(logging.DEBUG)
        handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
        root.addHandler(handler)
    return logger

inference_input.py

import codecs
import jieba
import tensorflow as tf
import numpy as np
import os, argparse, time, random
from model import BiLSTM_CRF
from utils import str2bool, get_logger
from data_helper import read_dictionary, random_embedding, read_files ,read_tag_id
from data_helper import pickle_reader
# TensorFlow session configuration; allow_growth is enabled on the GPU options.
config = tf.ConfigProto()

config.gpu_options.allow_growth = True

# Command-line options for the inference script. The help texts of --tag2id
# and --save_path now describe those options (they previously repeated the
# text of other options).
parser = argparse.ArgumentParser(description='BiLSTM-CRF for Chinese NER task')  # argument description
parser.add_argument('--data_path', type=str,
                    default=r'E:\project\Vlun_NER_LSTM\NerData\train.txt',
                    help='train data source')
parser.add_argument('--word2id', type=str, default=r'E:\project\Vlun_NER_LSTM\NerData\word_dic.pkl',
                    help='word2id source')
parser.add_argument('--tag2id', type=str, default=r'E:\project\Vlun_NER_LSTM\NerData\label_dic.pkl',
                    help='tag2id source')
parser.add_argument('--save_path', type=str,
                    default=r'E:\project\Vlun_NER_LSTM\data_path_save',
                    help='directory where model runs are saved')

parser.add_argument('--batch_size', type=int, default=32, help='#sample of each minibatch')
parser.add_argument('--epoch', type=int, default=30, help='#epoch of training')
parser.add_argument('--hidden_dim', type=int, default=300, help='#dim of hidden state')
parser.add_argument('--optimizer', type=str, default='Adam',
                    help='Adam/Adadelta/Adagrad/RMSProp/Momentum/SGD')
parser.add_argument('--CRF', type=str2bool, default=True,
                    help='use CRF at the top layer. if False, use Softmax')
parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')
parser.add_argument('--dropout', type=float, default=0.5, help='dropout keep_prob')
parser.add_argument('--update_embedding', type=str2bool, default=True,
                    help='update embedding during training')
parser.add_argument('--pretrain_embedding', type=str, default='random',
                    help='use pretrained char embedding or init it randomly')
parser.add_argument('--embedding_dim', type=int, default=300,
                    help='random init char embedding_dim')
parser.add_argument('--shuffle', type=str2bool, default=True,
                    help='shuffle training data before each epoch')
parser.add_argument('--mode', type=str, default='test', help='train/test/demo')
# Name of the run directory under --save_path whose checkpoint is loaded.
parser.add_argument('--demo_model', type=str, default='1662621164',
                    help='model for test and demo')
args = parser.parse_args()

# Vocabulary and label mappings.
word2id = read_dictionary(args.word2id)
tag2label = read_tag_id(args.tag2id)


# Embedding matrix: loaded from 'pretrain_embedding.npy' unless random
# initialisation is requested.
if args.pretrain_embedding != 'random':
    embedding_path = 'pretrain_embedding.npy'
    embeddings = np.array(np.load(embedding_path), dtype='float32')
else:
    embeddings = random_embedding(word2id, args.embedding_dim)


# Run directory: timestamped for training, otherwise the run named by --demo_model.
if args.mode == 'train':
    timestamp = str(int(time.time()))
else:
    timestamp = args.demo_model

output_path = os.path.join(args.save_path, timestamp)
summary_path = os.path.join(output_path, "summaries")
model_path = os.path.join(output_path, "checkpoints/")
result_path = os.path.join(output_path, "results")

# Create the run directory and its sub-directories if they are missing.
for directory in (output_path, summary_path, model_path, result_path):
    if not os.path.exists(directory):
        os.makedirs(directory)

ckpt_prefix = os.path.join(model_path, "model")
log_path = os.path.join(result_path, "log.txt")

paths = {
    'summary_path': summary_path,
    'model_path': ckpt_prefix,
    'result_path': result_path,
    'log_path': log_path,
}
get_logger(log_path).info(str(args))

# Replace the checkpoint prefix with the latest checkpoint of the run, then
# build the graph and a saver for restoring the weights.
ckpt_file = tf.train.latest_checkpoint(model_path)
print(ckpt_file)
paths['model_path'] = ckpt_file
model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
model.build_graph()
saver = tf.train.Saver()

# Inverse label mapping (id -> tag) built from the pickled label dictionary.
# NOTE(review): not referenced again in this script.
label2tag = {v:k for k,v in  pickle_reader(r"E:\project\Vlun_NER_LSTM\NerData\label_dic.pkl").items()}

with tf.Session(config=config) as sess:
    # Load the trained weights into the freshly built graph.
    saver.restore(sess, ckpt_file)
    total = []
    print("请输入原始文本 ---  >>>>>>>>>>>>>>>>>>>>>>")
    # Read one line of raw text, lower-case it and segment it with jieba.
    sentence = input().lower()
    print(len(total))  # always 0 at this point
    total.append([list(jieba.cut(sentence))])
    for sentence in total:
        demo_sent = sentence[0]
        # Model input: tokens, placeholder 'O' labels, and the token count.
        demo_data = [[demo_sent], [['O'] * len(demo_sent)],[len(demo_sent)]]
        # presumably one predicted tag per token, with 0 for the outside
        # label -- confirm against BiLSTM_CRF.demo_one
        tag = model.demo_one(sess, demo_data)
        res = ["O" if t == 0 else t for t in tag]
        content = sentence[0]
        # Each entry is [joined tokens, (start, end)] with an exclusive end index.
        resDic = {"company":[],"product":[],"version":[],"cve":[]}
        for index, value in enumerate(res):
            if not str(value).startswith("B"):
                continue
            # Tags look like "B_company"; the part after the first underscore
            # selects the result bucket.
            entity_type = value.split("_")[1]
            pointer = index + 1
            # Extend the span while the following tags continue this entity.
            # It ends at an "O" tag, at a new "B" tag (a separate entity), at
            # a tag of another type, or at the end of the sentence.
            while pointer < len(res):
                current = res[pointer]
                if current == "O" or str(current).startswith("B"):
                    break
                if current.split("_")[1] != entity_type:
                    break
                pointer += 1
            # Appending after the loop also emits an entity that runs to the
            # end of the sentence; setdefault keeps unexpected types from
            # raising KeyError.
            resDic.setdefault(entity_type, []).append(
                [' '.join(content[index:pointer]), (index, pointer)])
        print(tag)
        print(resDic)
        print(sentence[0])
        print("++++++++++++++++++++++")



        # Sample inputs, kept as an unused string expression.
        """
        CVE-2022-24735 Redis Labs Redis是美国Redis Labs公司的一套开源的使用ANSI C编写、支持网络、可基于内存亦可持久化的日志型、键值(Key-Value)存储数据库,并提供多种语言的API。Redis 6.2.7 和 7.0.0 之前版本存在代码注入漏洞,该漏洞源于Lua 脚本执行环境存在问题。攻击者利用该漏洞可以实现以另一个Redis用户的权限执行脚本。
        
        
        FPT G-97RG6M和FPT G-97RG3都是越南FPT公司的一款调制解调器。FPT G-97RG6M R4.2.98.035版本、G-97RG3 R4.2.43.078版本存在安全漏洞,该漏洞源于在 ping 函数中容易受到远程命令执行的影响。
        
        
        linked-list-allocator是Rust OSDev开源的一个链表分配器代码库。linked-list-allocator 0.10.2 之前版本存在缓冲区错误漏洞,该漏洞源于堆初始化方法缺少对给定堆大小参数的最小大小检查,由于元数据写入操作,这可能会导致越界写入。
        
        
        Nagios XI是美国Nagios公司的一套IT基础设施监控解决方案。该方案支持对应用、服务、操作系统等进行监控和预警。Nagios XI v5.8.6版本存在安全漏洞,该漏洞源于通过Manage MIBs页面上的mib_name参数发现包含SQL注入漏洞。
        """

data_preprocessing.py

import codecs
import jieba

# Convert the records in aliyunSpider.txt (one dict literal per line) into
# token/label lines in NER_data.txt. Each record becomes a run of
# "token<TAB>label" lines followed by a "<sentence split>" marker line.
# NOTE(review): train_writer is flushed per record but never closed here.
train_writer = codecs.open("NER_data.txt", 'w', 'UTF-8')

lines = codecs.open("aliyunSpider.txt", 'r', 'UTF-8').readlines()
for line in lines:
    try:
        # NOTE(review): eval() executes whatever the line contains; if the
        # lines are plain literals, ast.literal_eval would be a safer parser.
        line_dict = (eval(line.strip()))
        content = line_dict["content"]
        # NOTE(review): shadows the builtin `type` and is not used below.
        type = line_dict["type"]
        company = list(set(line_dict["company"]))
        cve_number = line_dict["cve_number"].lower()
        product = list(set(line_dict["product"]))
        version = list(set(line_dict["version"]))
        influence = list(set(line_dict["influence"]))
        title = line_dict["title"]
        # Maps an entity surface string to its label type.
        tempDict = {}
        # Company names: lower-cased; for names containing "_", the
        # "_firmware" part is removed and underscores become spaces.
        # NOTE(review): the lower-cased name is searched in `content`, which
        # is not lower-cased at this point.
        for company_token in company:
            company_token = str(company_token).lower()
            if "_" in company_token:
                if company_token.replace("_firmware", "").replace("_", " ") in content:
                    tempDict[company_token.replace("_firmware", "").replace("_", " ")] = "company"
                else:
                    pass
            else:
                if company_token in content:
                    tempDict[company_token] = "company"

        # Product names: same normalisation as company names.
        for product_token in product:
            product_token = product_token.lower()
            if "_" in product_token:
                if product_token.replace("_firmware", "").replace("_", " ") in content:
                    tempDict[product_token.replace("_firmware", "").replace("_", " ")] = "product"
                else:
                    pass
            else:
                if product_token in content:
                    tempDict[product_token] = "product"

        # Both "version" and "influence" entries are labelled as "version".
        for version_token in version + influence:
            version_token = version_token.lower()
            if "_" in version_token:
                if version_token.replace("_firmware", "").replace("_", " ") in content:
                    tempDict[version_token.replace("_firmware", "").replace("_", " ")] = "version"
                else:
                    pass
            else:
                if version_token in content:
                    tempDict[version_token] = "version"
        # The text that is tokenised and labelled: lower-cased title + content.
        finalContent = title.lower() + " " + content.lower()
        if cve_number in finalContent:
            tempDict[cve_number] = "cve_number"
        content_list = list(jieba.cut(finalContent))
        # Every token starts as "O"; matched entities are overwritten with
        # B_<type> / I_<type> / E_<type> labels.
        label_list = ["O" for _ in range(len(content_list))]
        for item, types in tempDict.items():
            counts = (str(finalContent).count(item))
            if counts > 1:
                item_list = (list(jieba.cut(item)))
                if len(item_list) == 1:

                    # Single-token entity that occurs several times.
                    # NOTE(review): each search starts at start + 1, so a match
                    # at token index 0 is not found. list.index raises
                    # ValueError when no match remains, which the outer
                    # `except` swallows, skipping the whole record.
                    start = 0
                    for i in range(counts):
                        if label_list[content_list.index(item, start + 1)] == "O":
                            label_list[content_list.index(item, start + 1)] = "B_{}".format(types)
                            start = content_list.index(item, start + 1)
                        else:
                            # Stops labelling further occurrences of this item.
                            break
                else:
                    # Multi-token entity that occurs several times: find
                    # windows of tokens whose concatenation equals the entity.
                    start = 0
                    for i in range(counts):
                        flag = True
                        index_num = start
                        while flag:
                            index_num = content_list.index(item_list[0], index_num)
                            if "".join(content_list[index_num:index_num + len(item_list)]) == "".join(item_list):
                                if label_list[index_num] == "O":
                                    label_list[index_num] = "B_{}".format(types)
                                    # NOTE(review): reuses the name `i` of the
                                    # enclosing `for i in range(counts)` loop.
                                    for i in range(index_num + 1, index_num + len(item_list) - 1):
                                        label_list[i] = "I_{}".format(types)
                                    label_list[index_num + len(item_list) - 1] = "E_{}".format(types)
                                    flag = False
                                else:
                                    # Already labelled: leaves the `while` only.
                                    break
                            else:
                                index_num += 1
                        # Continue the next search from the last token of this window.
                        start = index_num + len(item_list) - 1
            else:
                item_list = (list(jieba.cut(item)))
                if len(item_list) == 1:
                    # Single-token entity that occurs at most once.
                    if label_list[content_list.index(item)] == "O":
                        label_list[content_list.index(item)] = "B_{}".format(types)
                    else:
                        # NOTE(review): this `break` leaves the
                        # `for item, types in tempDict.items()` loop, so the
                        # remaining entities of the record are not labelled.
                        break
                else:
                    # Multi-token entity that occurs at most once.
                    flag = True
                    index_num = 0
                    while flag:
                        index_num = content_list.index(item_list[0], index_num)
                        if "".join(content_list[index_num:index_num + len(item_list)]) == "".join(item_list):
                            if label_list[index_num] == "O":
                                label_list[index_num] = "B_{}".format(types)
                                for i in range(index_num + 1, index_num + len(item_list) - 1):
                                    label_list[i] = "I_{}".format(types)

                                label_list[index_num + len(item_list) - 1] = "E_{}".format(types)
                                flag = False
                            else:
                                # Already labelled: leaves the `while` only.
                                break
                        else:
                            index_num += 1

        # Drop single-space tokens together with their labels.
        # NOTE(review): deleting while enumerating skips the element that
        # follows each deletion, so consecutive spaces are not all removed.
        for index, item in enumerate(content_list):
            if item == " ":
                del content_list[index]
                del label_list[index]
        # Write the record only when tokens and labels are still aligned.
        if (len(content_list) == len(label_list)):
            for index, token in enumerate(content_list):
                train_writer.write("{}\t{}\n".format(token, label_list[index]))
            train_writer.write("{}\n".format("<sentence split>"))
            train_writer.flush()
    except Exception as e:
        # Any failure (missing key, failed index search, ...) silently skips
        # the record.
        pass

data_processing.py

import codecs
import pickle
import random
from tqdm import tqdm

def write_pickle(fileName, obj):
    """Serialize ``obj`` to the file ``fileName`` using pickle.

    Args:
        fileName: Path of the file to write (opened in binary mode and
            overwritten if it exists).
        obj: Any picklable object.
    """
    # The context manager closes the file even if pickling fails.
    with open(fileName, 'wb') as f:
        pickle.dump(obj, f)

def load_pickle(fileName):
    """Load and return the object pickled in the file ``fileName``.

    Args:
        fileName: Path of a file containing pickled data.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load files produced by
    # this project.
    # The context manager closes the file even if unpickling fails.
    with open(fileName, 'rb') as f:
        return pickle.load(f)

def make_dict(data_path="NER_data.txt", out_path="word_dic.pkl", freq_threshold=5):
    """Build the word-to-id dictionary from the labelled data and pickle it.

    Every line other than the ``<sentence split>`` marker contributes its first
    tab-separated field as a word. Words occurring more than
    ``freq_threshold`` times receive consecutive ids after ``[PAD]`` (0) and
    ``[UNK]`` (1), in order of first appearance.

    Args:
        data_path: Labelled data file with ``word<TAB>label`` lines.
        out_path: Destination of the pickled dictionary.
        freq_threshold: A word is kept only if its count is strictly greater
            than this value.
    """
    print("正在生成词典")
    # Read through a context manager so the input file is closed afterwards.
    with codecs.open(data_path, 'r', 'UTF-8') as reader:
        lines = reader.readlines()

    # Word frequency table.
    vocabulary = {}
    for line in tqdm(lines):
        line = line.strip()
        if line != "<sentence split>":
            word = line.split('\t')[0]
            vocabulary[word] = vocabulary.get(word, 0) + 1
    print(len(vocabulary))

    vocabulary_other = {"[PAD]": 0, "[UNK]": 1}
    for word, count in vocabulary.items():
        if count > freq_threshold:
            vocabulary_other[word] = len(vocabulary_other)
    print(len(vocabulary_other))
    for k, v in vocabulary_other.items():
        print(k, v)
    write_pickle(out_path, vocabulary_other)



def make_label_dic(data_path="NER_data.txt", out_path="label_dic.pkl"):
    """Build the label-to-id dictionary from the labelled data and pickle it.

    Labels are taken from the second tab-separated field of every line that
    has one (the ``<sentence split>`` marker is skipped) and are numbered from
    0 in order of first appearance.

    Args:
        data_path: Labelled data file with ``word<TAB>label`` lines.
        out_path: Destination of the pickled dictionary.
    """
    # Read through a context manager so the input file is closed afterwards.
    with codecs.open(data_path, 'r', 'UTF-8') as reader:
        lines = reader.readlines()

    vocabulary = {}
    for line in tqdm(lines):
        line = line.strip()
        fields = line.split('\t')
        if line != "<sentence split>" and len(fields) > 1:
            label = fields[1]
            if label not in vocabulary:
                vocabulary[label] = len(vocabulary)
    for k, v in vocabulary.items():
        print(k, v)
    write_pickle(out_path, vocabulary)


def _write_samples(path, samples):
    """Write each sample's raw lines to ``path``, followed by one blank line."""
    with codecs.open(path, 'w', "UTF-8") as writer:
        for item in samples:
            for word in item:
                writer.write(word)
            writer.write("\n")


def make_dataset(data_path="NER_data.txt", train_path="train.txt",
                 test_path="test.txt", train_ratio=0.9):
    """Split the labelled data into shuffled train and test files.

    Lines with at least two tab-separated fields are grouped into samples, a
    sample ending at each ``<sentence split>`` marker. The samples are
    shuffled; the first ``int(len * train_ratio)`` go to ``train_path`` and the
    rest to ``test_path``. In the output each sample is followed by a blank
    line.

    Args:
        data_path: Labelled data file with ``word<TAB>label`` lines.
        train_path: Output file for the training split.
        test_path: Output file for the test split.
        train_ratio: Fraction of samples placed in the training split.
    """
    with codecs.open(data_path, 'r', 'UTF-8') as reader:
        lines = reader.readlines()

    total = []
    temp = []
    for line in lines:
        if len(line.strip().split("\t")) > 1:
            temp.append(line)
        if line.strip() == "<sentence split>":
            # Store the finished sample BEFORE starting a new one; doing it in
            # the opposite order loses the first sample and appends an empty one.
            total.append(temp)
            temp = []
    # Keep a trailing sample that is not followed by a split marker.
    if temp:
        total.append(temp)

    print(len(total))
    random.shuffle(total)
    split_at = int(len(total) * train_ratio)
    train = total[:split_at]
    test = total[split_at:]
    print(len(train))
    print(len(test))

    _write_samples(train_path, train)
    _write_samples(test_path, test)


# Script entry point: build the word dictionary, the label dictionary and the
# train/test split, in that order.
if __name__ == '__main__':
    make_dict()
    make_label_dic()
    make_dataset()
    # The loaded value is discarded; presumably a check that the label
    # dictionary written above can be read back.
    load_pickle("label_dic.pkl")

以上是漏洞类情报信息抽取的全部内容。利用这些数据也可以进行二分类,用于预测漏洞文本所表述的是系统漏洞还是软件漏洞,后续将对这部分进行梳理,谢谢~