1. Exporting the BERT model as a pb (SavedModel) for serving
# encoding:utf-8
import os
import shutil

import tensorflow as tf

from model.create_model import BertClassifier
from model.model_config import BertConfig
def export_model(checkpoint_dir, epoch, model_version, vocab_path, bert_config_path, label_type_path):
    export_path_base = os.path.join(checkpoint_dir, "serving_output")
    if os.path.exists(export_path_base):
        shutil.rmtree(export_path_base)
    with tf.compat.v1.get_default_graph().as_default():
        # serving inputs: token ids, attention mask and segment ids, all of length 128
        inputs = {}
        inputs["input_word_ids"] = tf.keras.Input(shape=(128,), dtype=tf.int32)
        inputs["input_mask"] = tf.keras.Input(shape=(128,), dtype=tf.int32)
        inputs["input_type_ids"] = tf.keras.Input(shape=(128,), dtype=tf.int32)
        bert_config = BertConfig.from_json_file_v2(bert_config_path, vocab_path)
        labels = [c[1:].strip() for c in open(label_type_path).readlines()]
        bert_classifier = BertClassifier(bert_config, 128, len(labels), output="predictions")
        # checkpoints are written as checkpoint-01, checkpoint-02, ..., checkpoint-10, ...
        latest_ckpt = "%s/checkpoint-%02d" % (checkpoint_dir, epoch)
        print(latest_ckpt)
        bert_classifier.load_weights(latest_ckpt)
        bert_classifier._set_inputs(inputs)
        print('export dir:', export_path_base)
        # keep a copy of the config and vocab next to the checkpoints for reference
        config_dir = os.path.join(checkpoint_dir, 'configs')
        if not os.path.exists(config_dir):
            os.mkdir(config_dir)
        print('config dir :', config_dir)
        shutil.copy(bert_config_path, os.path.join(config_dir, os.path.basename(bert_config_path)))
        shutil.copy(vocab_path, os.path.join(config_dir, os.path.basename(vocab_path)))
        export_path = os.path.join(tf.compat.as_bytes(export_path_base), tf.compat.as_bytes(str(model_version)))
        print('export model path :', export_path)
        bert_classifier.save(export_path, save_format='tf')
if __name__ == '__main__':
    os.environ['CUDA_VISIBLE_DEVICES'] = "0"
    # v4.3
    checkpoint_path = '...'
    vocab_path = '...txt'
    bert_config_path = "...json"
    label_type_path = '...'  # file with one label per line (path elided)
    epoch = 4
    model_version = 5
    export_model(checkpoint_path, epoch, model_version, vocab_path, bert_config_path, label_type_path)
2. Calling TF-GPU from Go
Approach 1 (verified not to work for us, and rather painful):
model, err := tf.LoadSavedModel(modelPath, []string{"serve"}, SessionOptions)
If SessionOptions is passed as nil, TensorFlow grabs all of the GPU memory by default; to limit GPU memory usage, this parameter has to be set. Constructing a SessionOptions, however, is troublesome: its Config field has to be filled with protobuf-serialized data (see the sketch after the proto definition below).
type SessionOptions struct {
// Target indicates the TensorFlow runtime to connect to.
//
// If 'target' is empty or unspecified, the local TensorFlow runtime
// implementation will be used. Otherwise, the TensorFlow engine
// defined by 'target' will be used to perform all computations.
//
// "target" can be either a single entry or a comma separated list
// of entries. Each entry is a resolvable address of one of the
// following formats:
// local
// ip:port
// host:port
// ... other system-specific formats to identify tasks and jobs ...
//
// NOTE: at the moment 'local' maps to an in-process service-based
// runtime.
//
// Upon creation, a single session affines itself to one of the
// remote processes, with possible load balancing choices when the
// "target" resolves to a list of possible processes.
//
// If the session disconnects from the remote process during its
// lifetime, session calls may fail immediately.
Target string
// Config is a binary-serialized representation of the
// tensorflow.ConfigProto protocol message
// (https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto).
Config []byte
}
message GPUOptions {
// Fraction of the available GPU memory to allocate for each process.
// 1 means to allocate all of the GPU memory, 0.5 means the process
// allocates up to ~50% of the available GPU memory.
//
// GPU memory is pre-allocated unless the allow_growth option is enabled.
//
// If greater than 1.0, uses CUDA unified memory to potentially oversubscribe
// the amount of memory available on the GPU device by using host memory as a
// swap space. Accessing memory not available on the device will be
// significantly slower as that would require memory transfer between the host
// and the device. Options to reduce the memory requirement should be
// considered before enabling this option as this may come with a negative
// performance impact. Oversubscription using the unified memory requires
// Pascal class or newer GPUs and it is currently only supported on the Linux
// operating system. See
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements
// for the detailed requirements.
double per_process_gpu_memory_fraction = 1;
// If true, the allocator does not pre-allocate the entire specified
// GPU memory region, instead starting small and growing as needed.
bool allow_growth = 4;
// The type of GPU allocation strategy to use.
//
// Allowed values:
// "": The empty string (default) uses a system-chosen default
// which may change over time.
//
// "BFC": A "Best-fit with coalescing" algorithm, simplified from a
// version of dlmalloc.
string allocator_type = 2;
// Delay deletion of up to this many bytes to reduce the number of
// interactions with gpu driver code. If 0, the system chooses
// a reasonable default (several MBs).
int64 deferred_deletion_bytes = 3;
// A comma-separated list of GPU ids that determines the 'visible'
// to 'virtual' mapping of GPU devices. For example, if TensorFlow
// can see 8 GPU devices in the process, and one wanted to map
// visible GPU devices 5 and 3 as "/device:GPU:0", and "/device:GPU:1",
// then one would specify this field as "5,3". This field is similar in
// spirit to the CUDA_VISIBLE_DEVICES environment variable, except
// it applies to the visible GPU devices in the process.
//
// NOTE:
// 1. The GPU driver provides the process with the visible GPUs
// in an order which is not guaranteed to have any correlation to
// the *physical* GPU id in the machine. This field is used for
// remapping "visible" to "virtual", which means this operates only
// after the process starts. Users are required to use vendor
// specific mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the
// physical to visible device mapping prior to invoking TensorFlow.
// 2. In the code, the ids in this list are also called "platform GPU id"s,
// and the 'virtual' ids of GPU devices (i.e. the ids in the device
// name "/device:GPU:<id>") are also called "TF GPU id"s. Please
// refer to third_party/tensorflow/core/common_runtime/gpu/gpu_id.h
// for more information.
string visible_device_list = 5;
// In the event polling loop sleep this many microseconds between
// PollEvents calls, when the queue is not empty. If value is not
// set or set to 0, gets set to a non-zero default.
int32 polling_active_delay_usecs = 6;
// This field is deprecated and ignored.
int32 polling_inactive_delay_msecs = 7;
// Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
// enabling this option forces all CPU tensors to be allocated with Cuda
// pinned memory. Normally, TensorFlow will infer which tensors should be
// allocated as the pinned memory. But in case where the inference is
// incomplete, this option can significantly speed up the cross-device memory
// copy performance as long as it fits the memory.
// Note that this option is not something that should be
// enabled by default for unknown or very large models, since all Cuda pinned
// memory is unpageable, having too much pinned memory might negatively impact
// the overall host system performance.
bool force_gpu_compatible = 8;
message Experimental {
// Configuration for breaking down a visible GPU into multiple "virtual"
// devices.
message VirtualDevices {
// Per "virtual" device memory limit, in MB. The number of elements in
// the list is the number of virtual devices to create on the
// corresponding visible GPU (see "virtual_devices" below).
// If empty, it will create single virtual device taking all available
// memory from the device.
//
// For the concept of "visible" and "virtual" GPU, see the comments for
// "visible_device_list" above for more information.
repeated float memory_limit_mb = 1;
// Priority values to use with the virtual devices. Use the cuda function
// cudaDeviceGetStreamPriorityRange to query for valid range of values for
// priority.
//
// On a P4000 GPU with cuda 10.1, the priority range reported was 0 for
// least priority and -1 for greatest priority.
//
// If this field is not specified, then the virtual devices will be
// created with the default. If this field has values set, then the size
// of this must match with the above memory_limit_mb.
repeated int32 priority = 2;
}
// The multi virtual device settings. If empty (not set), it will create
// single virtual device on each visible GPU, according to the settings
// in "visible_device_list" above. Otherwise, the number of elements in the
// list must be the same as the number of visible GPUs (after
// "visible_device_list" filtering if it is set), and the string represented
// device names (e.g. /device:GPU:<id>) will refer to the virtual
// devices and have the <id> field assigned sequentially starting from 0,
// according to the order they appear in this list and the "memory_limit"
// list inside each element. For example,
// visible_device_list = "1,0"
// virtual_devices { memory_limit: 1GB memory_limit: 2GB }
// virtual_devices {}
// will create three virtual devices as:
// /device:GPU:0 -> visible GPU 1 with 1GB memory
// /device:GPU:1 -> visible GPU 1 with 2GB memory
// /device:GPU:2 -> visible GPU 0 with all available memory
//
// NOTE:
// 1. It's invalid to set both this and "per_process_gpu_memory_fraction"
// at the same time.
// 2. Currently this setting is per-process, not per-session. Using
// different settings in different sessions within same process will
// result in undefined behavior.
repeated VirtualDevices virtual_devices = 1;
// If true, uses CUDA unified memory for memory allocations. If
// per_process_gpu_memory_fraction option is greater than 1.0, then unified
// memory is used regardless of the value for this field. See comments for
// per_process_gpu_memory_fraction field for more details and requirements
// of the unified memory. This option is useful to oversubscribe memory if
// multiple processes are sharing a single GPU while individually using less
// than 1.0 per process memory fraction.
bool use_unified_memory = 2;
// If > 1, the number of device-to-device copy streams to create
// for each GPUDevice. Default value is 0, which is automatically
// converted to 1.
int32 num_dev_to_dev_copy_streams = 3;
// If non-empty, defines a good GPU ring order on a single worker based on
// device interconnect. This assumes that all workers have the same GPU
// topology. Specify as a comma-separated string, e.g. "3,2,1,0,7,6,5,4".
// This ring order is used by the RingReducer implementation of
// CollectiveReduce, and serves as an override to automatic ring order
// generation in OrderTaskDeviceMap() during CollectiveParam resolution.
string collective_ring_order = 4;
// If true then extra work is done by GPUDevice and GPUBFCAllocator to
// keep track of when GPU memory is freed and when kernels actually
// complete so that we can know when a nominally free memory chunk
// is really not subject to pending use.
bool timestamped_allocator = 5;
// reserved id: 6
// Parameters for GPUKernelTracker. By default no kernel tracking is done.
// Note that timestamped_allocator is only effective if some tracking is
// specified.
//
// If kernel_tracker_max_interval = n > 0, then a tracking event
// is inserted after every n kernels without an event.
int32 kernel_tracker_max_interval = 7;
// If kernel_tracker_max_bytes = n > 0, then a tracking event is
// inserted after every series of kernels allocating a sum of
// memory >= n. If one kernel allocates b * n bytes, then one
// event will be inserted after it, but it will count as b against
// the pending limit.
int32 kernel_tracker_max_bytes = 8;
// If kernel_tracker_max_pending > 0 then no more than this many
// tracking events can be outstanding at a time. An attempt to
// launch an additional kernel will stall until an event
// completes.
int32 kernel_tracker_max_pending = 9;
// BFC Allocator can return an allocated chunk of memory upto 2x the
// requested size. For virtual devices with tight memory constraints, and
// proportionately large allocation requests, this can lead to a significant
// reduction in available memory. The threshold below controls when a chunk
// should be split if the chunk size exceeds requested memory size. It is
// expressed as a fraction of total available memory for the tf device. For
// example setting it to 0.05 would imply a chunk needs to be split if its
// size exceeds the requested memory by 5% of the total virtual device/gpu
// memory size.
double internal_fragmentation_fraction = 10;
// When true, use CUDA cudaMallocAsync API instead of TF gpu allocator.
bool use_cuda_malloc_async = 11;
}
// Everything inside experimental is subject to change and is not subject
// to API stability guarantees in
// https://www.tensorflow.org/guide/version_compat.
Experimental experimental = 9;
}
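One way around hand-crafting that protobuf in Go is to serialize a tensorflow.ConfigProto on the Python side and paste the resulting bytes into the Go SessionOptions.Config field. A minimal sketch, assuming the TensorFlow Python package is available (the 0.3 memory fraction is only an example):
# Sketch: produce the serialized ConfigProto bytes expected by SessionOptions.Config.
import tensorflow as tf

config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
# alternatively, cap memory instead of growing on demand:
# config.gpu_options.per_process_gpu_memory_fraction = 0.3

serialized = config.SerializeToString()
# emit a Go []byte literal that can be pasted into the Go source
print("[]byte{" + ", ".join("0x%02x" % b for b in serialized) + "}")
On the Go side those bytes then go into SessionOptions{Config: ...} before calling tf.LoadSavedModel.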
Approach 2: borrow the way this is configured in Python.
In Python it can be set directly:
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"
So the idea is to simply pass the same environment variable when starting the container:
docker run -e TF_FORCE_GPU_ALLOW_GROWTH=true
or set it in the Dockerfile (ENV TF_FORCE_GPU_ALLOW_GROWTH=true).
3. huggingface/transformers: a small trick for downloading models
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_path,
    config=config,
    cache_dir=model_args.cache_dir,
    revision=model_args.model_revision,
    use_auth_token=True if model_args.use_auth_token else None,
)
This automatically downloads the model weights and related files, storing them under model_args.cache_dir. The drawback is that the cached files end up with meaningless (hash-like) names, which hurts readability. Downloading manually instead:
git lfs install
git clone https://huggingface.co/hfl/chinese-roberta-wwm-ext-large
Then delete the .git folder inside the clone and point model_name_or_path at the local directory:
"model_name_or_path": "./cache/chinese-roberta-wwm-ext-large",
Much cleaner this way.
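For reference, a minimal sketch of loading from that local directory (num_labels=2 and the tokenizer call are only illustrative, not from the original setup):
from transformers import AutoConfig, AutoTokenizer, TFAutoModelForSequenceClassification

local_path = "./cache/chinese-roberta-wwm-ext-large"  # the manually cloned repo above
config = AutoConfig.from_pretrained(local_path, num_labels=2)  # num_labels is just an example
tokenizer = AutoTokenizer.from_pretrained(local_path)
model = TFAutoModelForSequenceClassification.from_pretrained(local_path, config=config)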
4. Training a text classifier with chinese-roberta-wwm-ext-large
The idea was to train a high-accuracy classifier on top of the large pretrained model so it could flag suspicious dirty data. After training for 10 epochs the results were poor; still investigating why...
5. Inspecting the inputs and outputs of an exported pb model
Method 1: the rather roundabout approach found online.
Step 1: locate the path of saved_model_cli:
from tensorflow.python.tools import saved_model_cli
print(saved_model_cli.__file__)
# output: /xxxx/xxx/.local/lib/python3.6/site-packages/tensorflow/python/tools/saved_model_cli.py
Step 2: run from the command line:
cd /xxxx/xxx/.local/lib/python3.6/site-packages/tensorflow/python/tools/
python saved_model_cli.py show --dir <directory containing the pb model> --all
Method 2: do it from code.
Reading the saved_model_cli source shows which arguments its show() function expects; fill them in accordingly:
import argparse

from tensorflow.python.tools import saved_model_cli


def print_model_input_output(model_dir):
    # a plain namespace carrying the attributes saved_model_cli.show() reads
    args = argparse.Namespace(dir=model_dir, all=True)
    saved_model_cli.show(args)


if __name__ == '__main__':
    model_dir = '/data_nvme/dxq/text_filter_dl/bert_tf2/output_model/v4.3/serving_output0708/1'
    print_model_input_output(model_dir)
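Another option (a sketch, not from the original notes): load the SavedModel directly and print its serving signature, which lists the same input/output names and shapes:
import tensorflow as tf

model_dir = '/path/to/serving_output/1'  # hypothetical path to the exported model
loaded = tf.saved_model.load(model_dir)
infer = loaded.signatures['serving_default']
print(infer.structured_input_signature)  # input names -> TensorSpec
print(infer.structured_outputs)          # output names -> TensorSpec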
6. Specifying model input names
with tf.compat.v1.get_default_graph().as_default():
    # the only change vs. the export code above: pass name= to each Input
    inputs = {}
    inputs["input_word_ids"] = tf.keras.Input(shape=(128,), dtype=tf.int32, name='input_word_ids')
    inputs["input_mask"] = tf.keras.Input(shape=(128,), dtype=tf.int32, name='input_mask')
    inputs["input_type_ids"] = tf.keras.Input(shape=(128,), dtype=tf.int32, name='input_type_ids')
    bert_config = BertConfig.from_json_file_v2(bert_config_path, vocab_path)
    labels = [c[1:].strip() for c in open(label_type_path).readlines()]
    bert_classifier = BertClassifier(bert_config, 128, len(labels), output="predictions")
    latest_ckpt = "%s/checkpoint-%02d" % (checkpoint_dir, epoch)
    print(latest_ckpt)
    bert_classifier.load_weights(latest_ckpt)
    bert_classifier._set_inputs(inputs)
    print('export dir:', export_path_base)
    config_dir = os.path.join(checkpoint_dir, 'configs')
    if not os.path.exists(config_dir):
        os.mkdir(config_dir)
    print('config dir :', config_dir)
    shutil.copy(bert_config_path, os.path.join(config_dir, os.path.basename(bert_config_path)))
    shutil.copy(vocab_path, os.path.join(config_dir, os.path.basename(vocab_path)))
    bert_classifier.summary()
    export_path = os.path.join(tf.compat.as_bytes(export_path_base), tf.compat.as_bytes(str(model_version)))
    print('export model path :', export_path)
    bert_classifier.save(export_path, save_format='tf')
Note in particular: the name argument must be specified, otherwise the custom input names do not take effect:
inputs["input_word_ids"] = tf.keras.Input(shape=(128,), dtype=tf.int32, name='input_word_ids')
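A quick way to confirm the names made it into the exported signature (same inspection trick as in section 5; the path is hypothetical):
import tensorflow as tf

loaded = tf.saved_model.load('serving_output/5')  # hypothetical export path
print(loaded.signatures['serving_default'].structured_input_signature)
# expect the keys input_word_ids / input_mask / input_type_ids instead of auto-generated names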
7. Displaying Chinese text in matplotlib (plt)
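A minimal sketch of one common way to make matplotlib render Chinese text, assuming a CJK font such as SimHei is installed (swap in whatever CJK font the machine actually has):
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']   # use a CJK-capable font for labels/titles
plt.rcParams['axes.unicode_minus'] = False     # keep the minus sign rendering correctly

plt.plot([1, 2, 3], [4, 5, 6])
plt.title('中文标题')  # renders as Chinese instead of empty boxes
plt.show()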
8. Text classification experience
- Using BERT-wwm-ext as the pretrained model gives better classification results than plain BERT.
- Oversampling lowers the recall of the majority (high-proportion) class.