Hands-On NLP: Building an AI Name Generator


Source code

A Brief Introduction to the LSTM Model

If you'd rather skip the theory, jump straight to the code below. There are plenty of detailed write-ups of how LSTMs work online, so I won't go into the internals here.

Long Short-Term Memory (LSTM) is a special kind of RNN designed to fix the vanishing-gradient problem of plain RNNs. A traditional RNN is trained with BPTT (backpropagation through time); when the sequence is long, the error signal propagated back through time decays exponentially, so the weights update too slowly for the network to exhibit any long-term memory. The LSTM model was proposed to solve this by adding a dedicated memory cell that stores long-term state.
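For reference, a standard LSTM cell computes the following at each time step t, where σ is the sigmoid function and ⊙ denotes element-wise multiplication:

f_t = σ(W_f · [h_{t-1}, x_t] + b_f)        (forget gate)
i_t = σ(W_i · [h_{t-1}, x_t] + b_i)        (input gate)
c̃_t = tanh(W_c · [h_{t-1}, x_t] + b_c)     (candidate state)
c_t = f_t ⊙ c_{t-1} + i_t ⊙ c̃_t            (cell state)
o_t = σ(W_o · [h_{t-1}, x_t] + b_o)        (output gate)
h_t = o_t ⊙ tanh(c_t)                      (hidden state)

The additive update of the cell state c_t is what lets gradients flow across many time steps instead of decaying the way they do in a plain RNN.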

[Figure: LSTM model structure]

Code

1. Building the Chinese character tokenizer

class NameTokenizer:
    """Builds a character-level vocabulary from a "name,gender" data file."""

    def __init__(self, path, required_gender="男"):
        self.names = []
        self._tokens = []
        # '0' is the padding token, '\n' marks the end of a name
        self._tokens += ['0', '\n']
        with open(path, mode='r', encoding="utf-8-sig") as f:
            for line in f:
                line = line.strip().split(",")
                gender = line[1]
                # Keep names of the requested gender, plus "未知" (unknown)
                if gender == required_gender or gender == "未知":
                    name = line[0]
                    self.names.append(name)
                    for char in name:
                        self._tokens.append(char)

        # Build the vocabulary: token -> id and id -> token mappings
        self._tokens = sorted(set(self._tokens))
        self.token_id_dict = dict((c, i) for i, c in enumerate(self._tokens))
        self.id_token_dict = dict((i, c) for i, c in enumerate(self._tokens))
        self.vocab = len(self._tokens)  # vocabulary size
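The snippets in this post share a handful of module-level constants from the repo (PATH_DATA, GENDER, MAX_LEN, BATCH_SIZE, STATE_DIM, DROP_OUT, LEARNING_RATE, EPOCHS, PATH_MODEL). Here is a minimal usage sketch of the tokenizer, with illustrative values only; the real ones are in the GitHub source:

# Illustrative values -- the actual constants live in the repo
PATH_DATA = "names.csv"   # hypothetical path: one "name,gender" pair per line
GENDER = "男"             # generate male names
MAX_LEN = 6               # maximum name length after padding
BATCH_SIZE = 256
STATE_DIM = 128           # LSTM hidden-state size
DROP_OUT = 0.2
LEARNING_RATE = 0.001
EPOCHS = 50
PATH_MODEL = "mymodel"

tokenizer = NameTokenizer(path=PATH_DATA, required_gender=GENDER)
print(tokenizer.vocab)                # vocabulary size
print(tokenizer.token_id_dict['\n'])  # id of the end-of-name token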

2. Building the training dataset


import numpy as np

# BATCH_SIZE and MAX_LEN are the module-level constants sketched above
def generate_data(tokenizer: NameTokenizer):
    """Yield (x, y) batches forever: one-hot prefixes and one-hot next characters."""
    total = len(tokenizer.names)
    while True:
        for start in range(0, total, BATCH_SIZE):
            end = min(start + BATCH_SIZE, total)

            sequences = []
            next_chars = []

            for name in tokenizer.names[start:end]:
                # The full name, right-padded with '0' to MAX_LEN,
                # predicts the end-of-name token '\n'
                sequences.append(name + (MAX_LEN - len(name)) * '0')
                next_chars.append('\n')
                # Every proper prefix of the name predicts its next character
                for it in range(len(name) - 1):
                    prefix = name[:-1 - it]
                    sequences.append(prefix + (MAX_LEN - len(prefix)) * '0')
                    next_chars.append(name[-1 - it])

            # One-hot encode the inputs and targets
            x_train = np.zeros((len(sequences), MAX_LEN, tokenizer.vocab))
            y_train = np.zeros((len(next_chars), tokenizer.vocab))
            for idx, seq in enumerate(sequences):
                for t, char in enumerate(seq):
                    x_train[idx, t, tokenizer.token_id_dict[char]] = 1
            for idx, char in enumerate(next_chars):
                y_train[idx, tokenizer.token_id_dict[char]] = 1
            yield x_train, y_train
            del sequences, next_chars, x_train, y_train  # free memory between batches
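To sanity-check the generator, pull a single batch and inspect its shape. For a name like 李云龙 with MAX_LEN = 6, the loop above emits the pairs 李云龙000 → \n, 李云0000 → 龙 and 李00000 → 云, so every prefix learns to predict its next character and \n marks a finished name. A quick check (note that the first dimension exceeds BATCH_SIZE, since each name contributes several prefix pairs):

x_batch, y_batch = next(generate_data(tokenizer))
print(x_batch.shape)  # (num_samples, MAX_LEN, vocab): one-hot encoded prefixes
print(y_batch.shape)  # (num_samples, vocab): one-hot encoded next characters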

3. Training

from tensorflow import keras
from tensorflow.keras import layers

def train():
    tokenizer = NameTokenizer(path=PATH_DATA, required_gender=GENDER)
    # Number of batches per epoch (the generator cycles forever)
    steps = len(tokenizer.names) // BATCH_SIZE
    print("steps:", steps)

    # Build the model
    model = keras.Sequential([
        keras.Input(shape=(MAX_LEN, tokenizer.vocab)),
        # A single LSTM layer; return_sequences=False because only the final
        # hidden state feeds the output layer
        layers.LSTM(STATE_DIM, dropout=DROP_OUT, return_sequences=False),
        layers.Dense(tokenizer.vocab, activation='softmax')
    ])
    model.summary()
    optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    # fit() accepts generators directly; fit_generator() is deprecated
    model.fit(generate_data(tokenizer),
              steps_per_epoch=steps, epochs=EPOCHS)

    model.save("mymodel")

4. Training results

[Screenshot: training log]

5. Prediction

def sample(preds, temperature=1.0):
    # Helper that samples an index from a probability array,
    # reweighted by the given temperature
    preds = np.asarray(preds).astype("float64")
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)
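sample() is the classic temperature-sampling helper from the Keras text-generation example: dividing the log-probabilities by a temperature below 1 sharpens the distribution (more conservative picks), while a temperature above 1 flattens it (more adventurous picks). A toy illustration:

p = np.array([0.7, 0.2, 0.1])
for t in (0.5, 1.0, 2.0):
    q = np.exp(np.log(p) / t)
    q /= q.sum()
    print(t, np.round(q, 3))
# t=0.5 concentrates mass on the most likely token;
# t=2.0 spreads it out more evenly.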


def generateNames(prefix, size):
    tokenizer = NameTokenizer(path=PATH_DATA, required_gender=GENDER)
    model = keras.models.load_model(PATH_MODEL)
    preds = set()

    tmp_generated = prefix

    # Every character of the prefix must exist in the vocabulary
    for char in tmp_generated:
        if char not in tokenizer.token_id_dict:
            print("Character not in the vocabulary:", char)
            return

    # Right-pad the prefix with '0' to MAX_LEN, matching the training format
    sequence = ('{0:0<' + str(MAX_LEN) + '}').format(prefix).lower()

    while len(preds) < size:
        # One-hot encode the current sequence and predict the next character
        x_pre = np.zeros((1, MAX_LEN, tokenizer.vocab))
        for t, char in enumerate(sequence):
            x_pre[0, t, tokenizer.token_id_dict[char]] = 1
        output = model.predict(x_pre, verbose=0)[0]
        index = sample(output)
        char = tokenizer.id_token_dict[index]

        if char == '0' or char == '\n':
            # Padding or end-of-name token: the name is complete, start over
            preds.add(tmp_generated)
            tmp_generated = prefix
            sequence = ('{0:0<' + str(MAX_LEN) + '}').format(prefix).lower()
        else:
            tmp_generated += char
            sequence = ('{0:0<' + str(MAX_LEN) + '}').format(tmp_generated).lower()
        print(sequence)

        # Names that grow past MAX_LEN are discarded; start over from the prefix
        if len(sequence) > MAX_LEN:
            tmp_generated = prefix
            sequence = ('{0:0<' + str(MAX_LEN) + '}').format(prefix).lower()
    return preds
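A minimal usage sketch, assuming training has finished and PATH_MODEL points at the saved model directory (the prefix 王 and the count below are just examples):

names = generateNames("王", 5)
print(names)  # a set of 5 generated names starting with 王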

6. Prediction results

[Screenshot: generated names]

The full source code has been uploaded to GitHub. Open source takes effort, so if this helped you, please give it a like.