【Tech Weekly】July, Week 2


1. Exporting a BERT pb model

# encoding:utf-8
import os
import shutil

import tensorflow as tf

from model.create_model import BertClassifier
from model.model_config import BertConfig


def export_model(checkpoint_dir, epoch, model_version, vocab_path, bert_config_path, label_type_path):
    export_path_base = os.path.join(checkpoint_dir, "serving_output")
    if os.path.exists(export_path_base):
        shutil.rmtree(export_path_base)

    with tf.compat.v1.get_default_graph().as_default():
        inputs = {}
        inputs["input_word_ids"] = tf.keras.Input(shape=(128), dtype=tf.int32)
        inputs["input_mask"] = tf.keras.Input(shape=(128), dtype=tf.int32)
        inputs["input_type_ids"] = tf.keras.Input(shape=(128), dtype=tf.int32)

        bert_config = BertConfig.from_json_file_v2(bert_config_path, vocab_path)
        labels = [c[1:].strip() for c in open(label_type_path).readlines()]

        bert_classifier = BertClassifier(bert_config, 128, len(labels), output="predictions")

        if epoch < 10:
            latest_ckpt = "%s/checkpoint-0%d" % (checkpoint_dir, epoch)
        else:
            latest_ckpt = "%s/checkpoint-%d" % (checkpoint_dir, epoch)
        print(latest_ckpt)

        bert_classifier.load_weights(latest_ckpt)

        bert_classifier._set_inputs(inputs)

        print('export dir:', export_path_base)
        config_dir = os.path.join(checkpoint_dir, 'configs')
        if not os.path.exists(config_dir):
            os.mkdir(config_dir)
        print('config dir :', config_dir)

        shutil.copy(bert_config_path, os.path.join(config_dir, os.path.basename(bert_config_path)))
        shutil.copy(vocab_path, os.path.join(config_dir, os.path.basename(vocab_path)))

        export_path = os.path.join(tf.compat.as_bytes(export_path_base), tf.compat.as_bytes(str(model_version)))
        print('export model path :', export_path)
        bert_classifier.save(export_path, save_format='tf')


if __name__ == '__main__':
    os.environ['CUDA_VISIBLE_DEVICES'] = "0"

    # v4.3
    checkpoint_path = '...'
    vocab_path = '...txt'
    bert_config_path = "...json"
    label_type_path = '...'  # label file read by export_model (one label per line)
    epoch = 4
    model_version = 5
    export_model(checkpoint_path, epoch, model_version, vocab_path, bert_config_path, label_type_path)
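
Once exported, the SavedModel can be sanity-checked straight from Python before handing it to serving; a minimal sketch (the path is a placeholder assembled from the serving_output directory and model_version above):

import tensorflow as tf

# placeholder path: <checkpoint_dir>/serving_output/<model_version>
loaded = tf.saved_model.load('serving_output/5')
print(list(loaded.signatures.keys()))         # typically ['serving_default']
infer = loaded.signatures['serving_default']
print(infer.structured_input_signature)       # expected input names, shapes, dtypes
print(infer.structured_outputs)               # output tensors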

2. Calling tf-gpu from Go

Approach 1: verified to be ineffective, and rather painful

model, err := tf.LoadSavedModel(modelPath, []string{"serve"}, SessionOptions)

If SessionOptions is nil, all of the GPU memory is grabbed by default; to cap GPU memory usage, this parameter has to be set.

Building SessionOptions is fairly cumbersome, though: its Config field has to be filled with protobuf-serialized data.

type SessionOptions struct {
	// Target indicates the TensorFlow runtime to connect to.
	//
	// If 'target' is empty or unspecified, the local TensorFlow runtime
	// implementation will be used.  Otherwise, the TensorFlow engine
	// defined by 'target' will be used to perform all computations.
	//
	// "target" can be either a single entry or a comma separated list
	// of entries. Each entry is a resolvable address of one of the
	// following formats:
	//   local
	//   ip:port
	//   host:port
	//   ... other system-specific formats to identify tasks and jobs ...
	//
	// NOTE: at the moment 'local' maps to an in-process service-based
	// runtime.
	//
	// Upon creation, a single session affines itself to one of the
	// remote processes, with possible load balancing choices when the
	// "target" resolves to a list of possible processes.
	//
	// If the session disconnects from the remote process during its
	// lifetime, session calls may fail immediately.
	Target string

	// Config is a binary-serialized representation of the
	// tensorflow.ConfigProto protocol message
	// (https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto).
	Config []byte
}

The GPU memory knobs live in the GPUOptions message of tensorflow/core/protobuf/config.proto:

message GPUOptions {
  // Fraction of the available GPU memory to allocate for each process.
  // 1 means to allocate all of the GPU memory, 0.5 means the process
  // allocates up to ~50% of the available GPU memory.
  //
  // GPU memory is pre-allocated unless the allow_growth option is enabled.
  //
  // If greater than 1.0, uses CUDA unified memory to potentially oversubscribe
  // the amount of memory available on the GPU device by using host memory as a
  // swap space. Accessing memory not available on the device will be
  // significantly slower as that would require memory transfer between the host
  // and the device. Options to reduce the memory requirement should be
  // considered before enabling this option as this may come with a negative
  // performance impact. Oversubscription using the unified memory requires
  // Pascal class or newer GPUs and it is currently only supported on the Linux
  // operating system. See
  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements
  // for the detailed requirements.
  double per_process_gpu_memory_fraction = 1;

  // If true, the allocator does not pre-allocate the entire specified
  // GPU memory region, instead starting small and growing as needed.
  bool allow_growth = 4;

  // The type of GPU allocation strategy to use.
  //
  // Allowed values:
  // "": The empty string (default) uses a system-chosen default
  //     which may change over time.
  //
  // "BFC": A "Best-fit with coalescing" algorithm, simplified from a
  //        version of dlmalloc.
  string allocator_type = 2;

  // Delay deletion of up to this many bytes to reduce the number of
  // interactions with gpu driver code.  If 0, the system chooses
  // a reasonable default (several MBs).
  int64 deferred_deletion_bytes = 3;

  // A comma-separated list of GPU ids that determines the 'visible'
  // to 'virtual' mapping of GPU devices.  For example, if TensorFlow
  // can see 8 GPU devices in the process, and one wanted to map
  // visible GPU devices 5 and 3 as "/device:GPU:0", and "/device:GPU:1",
  // then one would specify this field as "5,3".  This field is similar in
  // spirit to the CUDA_VISIBLE_DEVICES environment variable, except
  // it applies to the visible GPU devices in the process.
  //
  // NOTE:
  // 1. The GPU driver provides the process with the visible GPUs
  //    in an order which is not guaranteed to have any correlation to
  //    the *physical* GPU id in the machine.  This field is used for
  //    remapping "visible" to "virtual", which means this operates only
  //    after the process starts.  Users are required to use vendor
  //    specific mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the
  //    physical to visible device mapping prior to invoking TensorFlow.
  // 2. In the code, the ids in this list are also called "platform GPU id"s,
  //    and the 'virtual' ids of GPU devices (i.e. the ids in the device
  //    name "/device:GPU:<id>") are also called "TF GPU id"s. Please
  //    refer to third_party/tensorflow/core/common_runtime/gpu/gpu_id.h
  //    for more information.
  string visible_device_list = 5;

  // In the event polling loop sleep this many microseconds between
  // PollEvents calls, when the queue is not empty.  If value is not
  // set or set to 0, gets set to a non-zero default.
  int32 polling_active_delay_usecs = 6;

  // This field is deprecated and ignored.
  int32 polling_inactive_delay_msecs = 7;

  // Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
  // enabling this option forces all CPU tensors to be allocated with Cuda
  // pinned memory. Normally, TensorFlow will infer which tensors should be
  // allocated as the pinned memory. But in case where the inference is
  // incomplete, this option can significantly speed up the cross-device memory
  // copy performance as long as it fits the memory.
  // Note that this option is not something that should be
  // enabled by default for unknown or very large models, since all Cuda pinned
  // memory is unpageable, having too much pinned memory might negatively impact
  // the overall host system performance.
  bool force_gpu_compatible = 8;

  message Experimental {
    // Configuration for breaking down a visible GPU into multiple "virtual"
    // devices.
    message VirtualDevices {
      // Per "virtual" device memory limit, in MB. The number of elements in
      // the list is the number of virtual devices to create on the
      // corresponding visible GPU (see "virtual_devices" below).
      // If empty, it will create single virtual device taking all available
      // memory from the device.
      //
      // For the concept of "visible" and "virtual" GPU, see the comments for
      // "visible_device_list" above for more information.
      repeated float memory_limit_mb = 1;

      // Priority values to use with the virtual devices. Use the cuda function
      // cudaDeviceGetStreamPriorityRange to query for valid range of values for
      // priority.
      //
      // On a P4000 GPU with cuda 10.1, the priority range reported was 0 for
      // least priority and -1 for greatest priority.
      //
      // If this field is not specified, then the virtual devices will be
      // created with the default. If this field has values set, then the size
      // of this must match with the above memory_limit_mb.
      repeated int32 priority = 2;
    }

    // The multi virtual device settings. If empty (not set), it will create
    // single virtual device on each visible GPU, according to the settings
    // in "visible_device_list" above. Otherwise, the number of elements in the
    // list must be the same as the number of visible GPUs (after
    // "visible_device_list" filtering if it is set), and the string represented
    // device names (e.g. /device:GPU:<id>) will refer to the virtual
    // devices and have the <id> field assigned sequentially starting from 0,
    // according to the order they appear in this list and the "memory_limit"
    // list inside each element. For example,
    //   visible_device_list = "1,0"
    //   virtual_devices { memory_limit: 1GB memory_limit: 2GB }
    //   virtual_devices {}
    // will create three virtual devices as:
    //   /device:GPU:0 -> visible GPU 1 with 1GB memory
    //   /device:GPU:1 -> visible GPU 1 with 2GB memory
    //   /device:GPU:2 -> visible GPU 0 with all available memory
    //
    // NOTE:
    // 1. It's invalid to set both this and "per_process_gpu_memory_fraction"
    //    at the same time.
    // 2. Currently this setting is per-process, not per-session. Using
    //    different settings in different sessions within same process will
    //    result in undefined behavior.
    repeated VirtualDevices virtual_devices = 1;

    // If true, uses CUDA unified memory for memory allocations. If
    // per_process_gpu_memory_fraction option is greater than 1.0, then unified
    // memory is used regardless of the value for this field. See comments for
    // per_process_gpu_memory_fraction field for more details and requirements
    // of the unified memory. This option is useful to oversubscribe memory if
    // multiple processes are sharing a single GPU while individually using less
    // than 1.0 per process memory fraction.
    bool use_unified_memory = 2;

    // If > 1, the number of device-to-device copy streams to create
    // for each GPUDevice.  Default value is 0, which is automatically
    // converted to 1.
    int32 num_dev_to_dev_copy_streams = 3;

    // If non-empty, defines a good GPU ring order on a single worker based on
    // device interconnect.  This assumes that all workers have the same GPU
    // topology.  Specify as a comma-separated string, e.g. "3,2,1,0,7,6,5,4".
    // This ring order is used by the RingReducer implementation of
    // CollectiveReduce, and serves as an override to automatic ring order
    // generation in OrderTaskDeviceMap() during CollectiveParam resolution.
    string collective_ring_order = 4;

    // If true then extra work is done by GPUDevice and GPUBFCAllocator to
    // keep track of when GPU memory is freed and when kernels actually
    // complete so that we can know when a nominally free memory chunk
    // is really not subject to pending use.
    bool timestamped_allocator = 5;

    // reserved id: 6

    // Parameters for GPUKernelTracker.  By default no kernel tracking is done.
    // Note that timestamped_allocator is only effective if some tracking is
    // specified.
    //
    // If kernel_tracker_max_interval = n > 0, then a tracking event
    // is inserted after every n kernels without an event.
    int32 kernel_tracker_max_interval = 7;
    // If kernel_tracker_max_bytes = n > 0, then a tracking event is
    // inserted after every series of kernels allocating a sum of
    // memory >= n.  If one kernel allocates b * n bytes, then one
    // event will be inserted after it, but it will count as b against
    // the pending limit.
    int32 kernel_tracker_max_bytes = 8;
    // If kernel_tracker_max_pending > 0 then no more than this many
    // tracking events can be outstanding at a time.  An attempt to
    // launch an additional kernel will stall until an event
    // completes.
    int32 kernel_tracker_max_pending = 9;

    // BFC Allocator can return an allocated chunk of memory upto 2x the
    // requested size. For virtual devices with tight memory constraints, and
    // proportionately large allocation requests, this can lead to a significant
    // reduction in available memory. The threshold below controls when a chunk
    // should be split if the chunk size exceeds requested memory size. It is
    // expressed as a fraction of total available memory for the tf device. For
    // example setting it to 0.05 would imply a chunk needs to be split if its
    // size exceeds the requested memory by 5% of the total virtual device/gpu
    // memory size.
    double internal_fragmentation_fraction = 10;

    // When true, use CUDA cudaMallocAsync API instead of TF gpu allocator.
    bool use_cuda_malloc_async = 11;
  }

  // Everything inside experimental is subject to change and is not subject
  // to API stability guarantees in
  // https://www.tensorflow.org/guide/version_compat.
  Experimental experimental = 9;
}
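
For reference, the Config bytes that SessionOptions expects would normally be produced by serializing a ConfigProto, e.g. in Python, and handing the raw bytes to the Go side. A minimal sketch of generating those bytes (illustration only; as noted above, this route did not work out here):

import tensorflow as tf

config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True                    # grow GPU memory on demand
config.gpu_options.per_process_gpu_memory_fraction = 0.3  # or cap at ~30% of the GPU

config_bytes = config.SerializeToString()
# Print as a Go []byte literal for pasting into SessionOptions.Config.
print("[]byte{" + ", ".join(str(b) for b in config_bytes) + "}")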

Approach 2: borrow the way it is set in Python

In Python this can be set directly:

os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"

Which suggests simply adding the environment variable when starting the container and seeing whether it works:

docker run -e TF_FORCE_GPU_ALLOW_GROWTH=true

Or set it in the Dockerfile with ENV TF_FORCE_GPU_ALLOW_GROWTH=true (an ENV instruction rather than a shell export, so that it persists in the running container).

3. A small trick for downloading huggingface/transformers models

model = TFAutoModelForSequenceClassification.from_pretrained(
    model_path,
    config=config,
    cache_dir=model_args.cache_dir,
    revision=model_args.model_revision,
    use_auth_token=True if model_args.use_auth_token else None,
)

This automatically downloads the model weights and related files into model_args.cache_dir. The drawback: the downloaded files end up with meaningless hashed names, which makes the cache hard to read.

Download the model manually instead:

git lfs install
git clone https://huggingface.co/hfl/chinese-roberta-wwm-ext-large

Then delete the .git folder inside the cloned directory and point model_name_or_path at the local path:

  "model_name_or_path": "./cache/chinese-roberta-wwm-ext-large",

Much cleaner this way.
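
For reference, loading from the local directory works exactly like loading by hub name; a minimal sketch (class names follow the snippet above, and num_labels is just a placeholder):

from transformers import AutoConfig, AutoTokenizer, TFAutoModelForSequenceClassification

local_path = "./cache/chinese-roberta-wwm-ext-large"            # the cloned directory
config = AutoConfig.from_pretrained(local_path, num_labels=2)   # num_labels: placeholder
tokenizer = AutoTokenizer.from_pretrained(local_path)
model = TFAutoModelForSequenceClassification.from_pretrained(local_path, config=config)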


4. Training a text classifier with chinese-roberta-wwm-ext-large

The idea was to train a high-precision classifier on top of this larger model and use it to pick out suspicious dirty data, but after 10 epochs the results were still poor; the cause is still under investigation...


5. Inspecting the inputs and outputs of an exported pb model

Method 1: the somewhat clunky approach that turns up in web searches

Step 1: find the path of saved_model_cli:

from tensorflow.python.tools import saved_model_cli

print(saved_model_cli.__file__)
# output: /xxxx/xxx/.local/lib/python3.6/site-packages/tensorflow/python/tools/saved_model_cli.py

Step 2: run it on the command line:

cd /xxxx/xxx/.local/lib/python3.6/site-packages/tensorflow/python/tools/
python saved_model_cli.py show --dir <directory-containing-the-pb-model> --all

The output lists every SignatureDef in the SavedModel along with the names, dtypes, and shapes of its input and output tensors.

Method 2: do it from code:

Reading the source shows that it is enough to fill in the corresponding arguments of the show function:

import argparse

from tensorflow.python.tools import saved_model_cli


def print_model_input_output(model_dir):
    # saved_model_cli.show() only reads args.all and args.dir when all is set,
    # so a bare Namespace can stand in for the parsed command-line arguments.
    args = argparse.Namespace(all=True, dir=model_dir)
    saved_model_cli.show(args)


if __name__ == '__main__':
    model_dir = '/data_nvme/dxq/text_filter_dl/bert_tf2/output_model/v4.3/serving_output0708/1'
    print_model_input_output(model_dir)

6. Specifying the model input names

    with tf.compat.v1.get_default_graph().as_default():
        inputs = {}
        inputs["input_word_ids"] = tf.keras.Input(shape=(128,), dtype=tf.int32, name='input_word_ids')
        inputs["input_mask"] = tf.keras.Input(shape=(128,), dtype=tf.int32, name='input_mask')
        inputs["input_type_ids"] = tf.keras.Input(shape=(128,), dtype=tf.int32, name='input_type_ids')

        bert_config = BertConfig.from_json_file_v2(bert_config_path, vocab_path)
        labels = [c[1:].strip() for c in open(label_type_path).readlines()]

        bert_classifier = BertClassifier(bert_config, 128, len(labels), output="predictions")

        if epoch < 10:
            latest_ckpt = "%s/checkpoint-0%d" % (checkpoint_dir, epoch)
        else:
            latest_ckpt = "%s/checkpoint-%d" % (checkpoint_dir, epoch)
        print(latest_ckpt)

        bert_classifier.load_weights(latest_ckpt)
        bert_classifier._set_inputs(inputs)
        print('export dir:', export_path_base)
        config_dir = os.path.join(checkpoint_dir, 'configs')
        if not os.path.exists(config_dir):
            os.mkdir(config_dir)
        print('config dir :', config_dir)

        shutil.copy(bert_config_path, os.path.join(config_dir, os.path.basename(bert_config_path)))
        shutil.copy(vocab_path, os.path.join(config_dir, os.path.basename(vocab_path)))
        bert_classifier.summary()
        export_path = os.path.join(tf.compat.as_bytes(export_path_base), tf.compat.as_bytes(str(model_version)))
        print('export model path :', export_path)
        bert_classifier.save(export_path, save_format='tf')

Note in particular: the name argument must be specified for the input names to take effect:

inputs["input_word_ids"] = tf.keras.Input(shape=(128), dtype=tf.int32, name='input_word_ids')

7. Displaying Chinese text in plt (matplotlib)

blog.csdn.net/Disany/arti…
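
The usual fix (which may or may not be what the linked post does) is to point matplotlib at a font that contains CJK glyphs; a minimal sketch, assuming a font such as SimHei is installed:

import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']   # assumption: SimHei (or another CJK font) is available
plt.rcParams['axes.unicode_minus'] = False     # keep the minus sign rendering correct with CJK fonts

plt.plot([1, 2, 3], [2, 4, 8])
plt.title('中文标题')                           # Chinese text now renders instead of empty boxes
plt.show()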

8. Text classification lessons

  • BERT-wwm-ext as the pretrained model classifies better than plain BERT
  • Oversampling lowers recall on the classes that already hold a large share of the samples