群体角色动画实时渲染技术研究
本文基于OpenGL 4.6(GLSL 460),采用DSA(Direct State Access,直接状态访问)模式,实现一万个士兵的动画渲染。
项目地址:limingyin18/car (github.com)
方法主要由实例化渲染,模型LOD,间接渲染,基于GPU的视锥剔除,角色动画构成。
基本的opengl使用,模型读取,骨骼动画暂时不在这里写,参考已经有很好的资料。
实例化渲染
使用glDrawElementsInstanced函数,进行实例化渲染。即渲染相同网格,相同材质,但是位置不一样时,只调用一次draw call,不需要重复调用glDrawElements,减少CPU与GPU之间的通信,提升了速度。 一般地:
glDrawElementsInstanced(GL_TRIANGLES, indices_count, GL_UNSIGNED_INT, 0, instance_count);
相当于:
// Non-instanced equivalent: issue one glDrawElements call per instance.
for (int i = 0; i < instance_count ; i++) {
glDrawElements(GL_TRIANGLES, indices_count, GL_UNSIGNED_INT, 0);
}
在顶点着色器中,由内建变量gl_InstanceID来区分当前正在进行哪个实例的渲染。可以用来索引数组,如位姿矩阵:
// Shader-storage block at binding 0 holding one model matrix per instance.
layout(std430, binding = 0) buffer InstanceBuffer
{
mat4 modelMatrices[];
};
// Fetch the model matrix of the instance currently being processed.
mat4 model = modelMatrices[gl_InstanceID];
位姿矩阵缓冲使用ssbo类型,其创建、绑定和更新是:
glCreateBuffers(1, &instance_transforms_buffer_);
glNamedBufferStorage(instance_transforms_buffer_, instance_transforms_.size() * sizeof(glm::mat4),
instance_transforms_.data(), GL_DYNAMIC_STORAGE_BIT);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, instance_transforms_buffer_);
glNamedBufferSubData(instance_transforms_buffer_, 0, instance_transforms_.size() * sizeof(glm::mat4),
instance_transforms_.data());
可以让美术制作纹理,相当于色板,能够偏移改变模型某些区域的颜色,从而实现如士兵在不同的阵营等。 (拓展,无绑定,虚拟纹理virtual texture?)
动画
由于使用实例化渲染,动画要先写到ssbo中,才能被gl_InstanceID索引。
动画烘焙
// bake animation
float ticks_per_second = animations_[animation_clip_type]->GetTicksPerSecond();
float dt = 10.0f / ticks_per_second;
for (int i = 0; i <= frame_counts_[animation_clip_type] / 10; i++)
{
if (i == 0)
{
animator_->UpdateAnimation(0);
}
else
{
animator_->UpdateAnimation(dt);
}
auto transforms = animator_->GetFinalBoneMatrices();
for (int j = 0; j < MAX_BONES; j++)
{
bone_transforms_[animation_clip_type][i * MAX_BONES + j] = transforms[j];
}
}
i是第几个采样帧,j是第几个骨骼(设定最多100个,即MAX_BONES=100)。例如,2号动画片段、3号采样帧、5号骨骼的位姿矩阵是bone_transforms_[2][3 * 100 + 5]。
烘焙是比较费时间的,所以一般这个数据会保存到硬盘中,程序启动时读取。
动画缓冲的构建、绑定
frame_count_offset_.resize(bone_transforms_.size() + 1);
frame_count_offset_[0] = 0;
for (size_t i = 1; i < bone_transforms_.size() + 1; i++)
{
frame_count_offset_[i] = frame_count_offset_[i - 1] + bone_transforms_[i - 1].size();
}
glCreateBuffers(1, &frame_count_offset_buffer_object_);
glNamedBufferStorage(frame_count_offset_buffer_object_, frame_count_offset_.size() * sizeof(uint32_t),
frame_count_offset_.data(), 0);
glCreateBuffers(1, &bone_transforms_buffer_object_);
glNamedBufferStorage(bone_transforms_buffer_object_, frame_count_offset_.back() * sizeof(glm::mat4), 0,
GL_DYNAMIC_STORAGE_BIT);
for (size_t i = 0; i < bone_transforms_.size(); i++)
{
glNamedBufferSubData(bone_transforms_buffer_object_, frame_count_offset_[i] * sizeof(glm::mat4),
bone_transforms_[i].size() * sizeof(glm::mat4), bone_transforms_[i].data());
}
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, bone_transforms_buffer_object_);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, frame_count_offset_buffer_object_);
frame_count_offset是各动画片段矩阵数(帧数×MAX_BONES)的前缀和,从0开始,即每个动画片段的第一个矩阵在缓冲中的起始下标。将所有动画片段都写到一个缓冲,使用frame_count_offset区分。
实例化使用
在着色器中,先定义每个实例的数据:动画的片段,当前帧
// Per-instance animation state: which clip is playing and which baked frame to sample.
struct ObjectData
{
uint clip;
uint frame;
};
// One ObjectData entry per instance, read through shader-storage binding 2.
layout(std430, binding = 2) buffer ObjectDataBuffer
{
ObjectData objectData[];
};
// Bone indices that influence this vertex (vertex attribute 5).
layout(location = 5) in ivec4 boneIds;
// NOTE: frameCount and BonesMatrices1 are declared outside this excerpt.
for (int i = 0; i < MAX_BONE_INFLUENCE; i++)
{
mat4 BoneMatrices = mat4(1.0f);
// Matrix index inside the clip: frame * MAX_BONES + bone id.
uint index = objectData[gl_InstanceID].frame * MAX_BONES + boneIds[i];
// Start of this instance's clip inside the packed bone-matrix buffer.
uint frameClipOffset = frameCount[objectData[gl_InstanceID].clip];
BoneMatrices = BonesMatrices1[frameClipOffset + index];
...
}
boneIds[i]为影响当前顶点的第i根骨骼的编号(设定每个顶点最多受4根骨骼影响)。
实例化数据缓冲的构建、绑定和更新
// CPU-side counterpart of the shader's ObjectData: two 32-bit unsigned fields per instance.
struct ObjectDataAnimation
{
    uint32_t animation_clip_ = 0; // index of the animation clip
    uint32_t frame_ = 0;          // index of the baked frame within that clip
};
std::vector<ObjectDataAnimation> object_data_;

// Create: allocate updatable storage seeded with the current per-instance data.
const auto object_data_bytes = object_data_.size() * sizeof(ObjectDataAnimation);
glCreateBuffers(1, &object_data_buffer_);
glNamedBufferStorage(object_data_buffer_, object_data_bytes, object_data_.data(), GL_DYNAMIC_STORAGE_BIT);

// Bind: the shader declares ObjectDataBuffer at shader-storage binding 2, so the buffer must be
// attached there before drawing.
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, object_data_buffer_);

// Update: overwrite the whole buffer with the current contents of object_data_.
glNamedBufferSubData(object_data_buffer_, 0, object_data_bytes, object_data_.data());
间接渲染
使用glDrawElementsIndirect间接渲染,实现基于GPU的视锥剔除,提升渲染效率。即draw call的参数来源于gpu中的buffer。可以使用计算着色器进行视锥剔除,并将结果写到一个gpu的buffer中,渲染命令读取该buffer,进行渲染。不需要gpu的计算结果写回cpu中,减少了cpu与gpu之间的通信。
一般地,渲染指令参数buffer的内容为:
// One indirect draw command as stored in the draw-indirect buffer: five tightly packed 32-bit
// fields. Fixed-width types are used so the CPU layout is the same on every platform, and all
// fields default to zero so a freshly declared command has well-defined contents.
struct DrawElementsIndirectCommand
{
    uint32_t count = 0;         // number of indices to draw
    uint32_t instanceCount = 0; // number of instances to draw
    uint32_t firstIndex = 0;    // offset of the first index, counted in indices
    int32_t baseVertex = 0;     // constant added to each index
    uint32_t baseInstance = 0;  // base instance used when fetching instanced vertex attributes
};
static_assert(sizeof(DrawElementsIndirectCommand) == 5 * sizeof(uint32_t),
              "DrawElementsIndirectCommand must be five packed 32-bit fields");
创建和绑定间接渲染缓冲
// Value-initialise the command so the initial upload below has well-defined (all-zero) contents.
DrawElementsIndirectCommand indirect_command_{};
uint32_t indirect_buffer_ = -1; // all-bits-set placeholder until glCreateBuffers assigns a name

// Create the buffer with updatable storage holding exactly one command.
glCreateBuffers(1, &indirect_buffer_);
glNamedBufferStorage(indirect_buffer_, sizeof(DrawElementsIndirectCommand), &indirect_command_,
                     GL_DYNAMIC_STORAGE_BIT);

// Bind it as the source of parameters for indirect draw calls.
glBindBuffer(GL_DRAW_INDIRECT_BUFFER, indirect_buffer_);
那么此时
glDrawElementsIndirect(GL_TRIANGLES, GL_UNSIGNED_INT, 0);
相当于:
// count, firstIndex, instanceCount, baseVertex and baseInstance are the fields of the
// DrawElementsIndirectCommand read from the bound indirect buffer.
// firstIndex counts indices, so it is scaled by the index size to form the byte offset
// that the direct draw call expects.
glDrawElementsInstancedBaseVertexBaseInstance(GL_TRIANGLES,
                                              count,
                                              GL_UNSIGNED_INT,
                                              reinterpret_cast<const void *>(firstIndex * sizeof(GLuint)),
                                              instanceCount,
                                              baseVertex,
                                              baseInstance);
如果令firstIndex=0,baseVertex=0,baseInstance=0,那么此时就是实例化渲染命令,count是索引数,instanceCount是实例数。 间接渲染缓冲indirect_buffer的内容可以在计算着色器中读写。
视锥剔除
在计算着色器中,定义间接渲染缓冲:
// Same memory layout as the OpenGL defined struct DrawElementsIndirectCommand (five 32-bit fields).
struct DrawCommand
{
uint count; // Number of indices to draw.
uint instanceCount; // Number of instances to draw (a.k.a primcount)
uint firstIndex; // Offset of the first index, counted in indices (not bytes), into the buffer bound to
// GL_ELEMENT_ARRAY_BUFFER.
uint baseVertex; // Specifies a constant that should be added to each element of indices when choosing elements
// from the enabled vertex arrays.
// NOTE(review): the C-side struct declares this field as int; uint has the same size but a different signedness.
uint baseInstance; // Specifies the base instance for use in fetching instanced vertex attributes.
};
// The single draw command, accessed through shader-storage binding 2.
layout(std430, binding = 2) buffer DrawCommandBuffer
{
DrawCommand drawCommand;
};
每一个实例进行可见性判定,若可见,则渲染的实例数加1,该实例编号写到一个缓冲中
// Compacted list of visible instance indices (shader-storage binding 1).
layout(std430, binding = 1) buffer InstanceIDBuffer
{
uint ids[];
};
// NOTE(review): presumably drawCommand.instanceCount is reset to 0 before each dispatch and idx is
// always a valid instance index — neither is shown in this excerpt; confirm against the caller.
const uint idx = gl_GlobalInvocationID.x; // Compute space is 1D where x in [0, N)
const bool visible = IsVisible(idx);
if (visible)
{
// Atomically reserve the next output slot; atomicAdd returns the count before the increment.
uint instanceIndex = atomicAdd(drawCommand.instanceCount, 1);
ids[instanceIndex] = idx;
}
那么后面渲染时,实例编号就变为
uint instance_id = ids[gl_InstanceID];
可见性判定
每一个实例模型用一个球体包围,判定球体是否在相机视锥范围内(视锥剔除)。球体的中心、半径分别为模型AABB包围盒的中心、半对角线长。结合每个模型的位姿矩阵与相机参数:
struct CameraCullingData
{
glm::mat4 view;
alignas(16) float fov;
float aspect;
float znear, zfar;
glm::vec4 spherebounds;
uint32_t drawCount;
};
CameraCullingData camera_culling_data_;
camera_culling_data_.view = camera_->GetView();
camera_culling_data_.fov = glm::radians(camera_->GetFov());
camera_culling_data_.aspect = camera_->GetAspect();
camera_culling_data_.znear = camera_->GetNear();
camera_culling_data_.zfar = camera_->GetFar();
glm::vec3 aabb_min = mesh_->GetAABBMin();
glm::vec3 aabb_max = mesh_->GetAABBMax();
glm::vec3 center = (aabb_min + aabb_max) * 0.5f;
glm::vec3 extent = aabb_max - center;
float radius = glm::length(extent);
camera_culling_data_.spherebounds = glm::vec4(center, radius);
camera_culling_data_.drawCount = instance_transforms_.size();
视锥剔除缓冲构建、绑定和更新
glCreateBuffers(1, &camera_culling_data_buffer_);
glNamedBufferStorage(camera_culling_data_buffer_, sizeof(CameraCullingData),
&camera_culling_data_,GL_DYNAMIC_STORAGE_BIT);
glNamedBufferSubData(camera_culling_data_buffer_, 0, sizeof(CameraCullingData),
&camera_culling_data_);
glBindBufferBase(GL_UNIFORM_BUFFER, 3, camera_culling_data_buffer_);
可见性剔除
计算着色器中
// Shader-side declaration of the culling parameters (same field order as the C++ CameraCullingData).
struct CameraCullingData
{
mat4 view;
float fov;
float aspect;
float znear, zfar;
// xyz: bounding-sphere centre, w: radius.
vec4 spherebounds;
uint drawCount;
};
// Uniform block at binding 3 that carries the culling parameters.
layout(std140, binding = 3) uniform CameraCullingDataBuffer
{
CameraCullingData cullData;
};
// Per-instance model matrices (shader-storage binding 0).
layout(std430, binding = 0) buffer InstanceBuffer
{
mat4 modelMatrices[];
};
// Sphere-versus-frustum visibility test, performed in view space.
//   objectIndex: index of the instance in modelMatrices.
//   returns:     false when the instance's bounding sphere lies completely outside the
//                near/far range or outside one of the four side planes; true otherwise.
bool IsVisible(uint objectIndex)
{
    vec4 sphereBounds = cullData.spherebounds;
    mat4 modelMatrix = modelMatrices[objectIndex];

    // Bounding-sphere centre in view space; z is negated so that depth in front of the
    // camera is positive and can be compared with znear / zfar directly.
    vec4 centerMV = cullData.view * modelMatrix * vec4(sphereBounds.xyz, 1.f);
    vec3 center = centerMV.xyz / centerMV.w;
    center.z *= -1;

    // The scale along each axis is the length of the corresponding basis column. The diagonal
    // entries equal the scale only for unrotated models and would shrink the sphere for
    // rotated ones, so the column lengths are used instead.
    float maxScale = max(length(modelMatrix[0].xyz),
                         max(length(modelMatrix[1].xyz), length(modelMatrix[2].xyz)));
    float radius = maxScale * sphereBounds.w;

    // z: near / far planes
    if (center.z > cullData.zfar + radius || center.z < cullData.znear - radius)
        return false;

    float tanHalfFovY = tan(cullData.fov / 2.f);
    float tanHalfFovX = tanHalfFovY * cullData.aspect;

    // y: top / bottom planes. The frustum half-height at this depth is depth * tan(fovY / 2);
    // the sphere adds a margin of radius / cos(fovY / 2) = radius * sqrt(1 + tan^2).
    float halfHeight = center.z * tanHalfFovY;
    float marginY = radius * sqrt(1.f + tanHalfFovY * tanHalfFovY);
    if (abs(center.y) > halfHeight + marginY)
        return false;

    // x: left / right planes. The margin is derived from the horizontal half-angle itself;
    // scaling the vertical margin by the aspect ratio is too small when aspect < 1.
    float halfWidth = center.z * tanHalfFovX;
    float marginX = radius * sqrt(1.f + tanHalfFovX * tanHalfFovX);
    if (abs(center.x) > halfWidth + marginX)
        return false;

    return true;
}
模型LOD
(mesh shader?)
这里使用meshoptimizer库,生成各个LOD的索引,即顶点缓冲不变,索引缓冲为LOD0的索引数组+LOD1的索引数组+LOD2的索引数组。
间接渲染时,先在计算着色器中进行基于距离的判断,选择LOD,再写到该LOD对应的间接渲染命令参数(按offset索引)。
使用glMultiDrawElementsIndirect函数,来实现根据深度选择lod绘制。
相当于:
// Pseudocode: one instanced draw per command read from the indirect buffer.
for (n = 0; n < drawcount; n++) {
const DrawElementsIndirectCommand *cmd;
// n-th command; the pointer arithmetic treats the commands as tightly packed.
cmd = (const DrawElementsIndirectCommand *)indirect + n;
glDrawElementsInstancedBaseVertexBaseInstance(mode,
cmd->count,
type,
// firstIndex counts indices, so it is scaled by the index size to get a byte offset.
cmd->firstIndex * size-of-type,
cmd->instanceCount,
cmd->baseVertex,
cmd->baseInstance);
}
生成lod
// Re-index and optimise the mesh, then append simplified LODs to the index buffer.
// Afterwards indices_ holds LOD0 followed by each simplified LOD, all referencing the same
// vertex buffer, and lods_ holds the index count of every level in that order.
const size_t index_count = indices_.size();
size_t vertex_count = vertices_.size();

// The remap table has one entry per input vertex, so it is sized by vertex_count.
std::vector<unsigned int> remap(vertex_count);
const size_t remap_vertex_count = meshopt_generateVertexRemap(
    remap.data(), indices_.data(), index_count, vertices_.data(), vertex_count, sizeof(VertexSkeletal));

std::vector<unsigned int> indices_new(index_count);
meshopt_remapIndexBuffer(indices_new.data(), indices_.data(), index_count, remap.data());
std::vector<VertexSkeletal> vertices_new(remap_vertex_count);
meshopt_remapVertexBuffer(vertices_new.data(), vertices_.data(), vertex_count, sizeof(VertexSkeletal),
                          remap.data());

// Adopt the remapped buffers by swapping instead of copying them.
vertex_count = remap_vertex_count;
vertices_.swap(vertices_new);
indices_.swap(indices_new);

// In-place optimisation passes on LOD0.
meshopt_optimizeVertexCache(indices_.data(), indices_.data(), index_count, vertex_count);
meshopt_optimizeOverdraw(indices_.data(), indices_.data(), index_count, &vertices_[0].position_.x, vertex_count,
                         sizeof(VertexSkeletal), 1.05f);
meshopt_optimizeVertexFetch(vertices_.data(), indices_.data(), index_count, vertices_.data(), vertex_count,
                            sizeof(VertexSkeletal));
lods_.push_back(indices_.size());

// One simplified LOD per target error. Every LOD is simplified from LOD0, i.e. the first
// index_count entries of indices_.
const float errors[3] = {0.001f, 0.0125f, 0.015f};
for (const float target_error : errors)
{
    // A zero target index count leaves target_error as the only limit that is set.
    const float threshold = 0;
    const size_t target_index_count = static_cast<size_t>(index_count * threshold);
    const unsigned int options = 0; // meshopt_SimplifyX flags, 0 is a safe default
    float lod_error = 0.f;

    std::vector<unsigned int> lod(index_count);
    const size_t lod_index_count =
        meshopt_simplify(lod.data(), indices_.data(), index_count, &vertices_[0].position_.x, vertices_.size(),
                         sizeof(VertexSkeletal), target_index_count, target_error, options, &lod_error);
    lod.resize(lod_index_count);

    lods_.push_back(lod.size());
    indices_.insert(indices_.end(), lod.begin(), lod.end());
}
lods_[i]为第i级LOD模型的索引数。
选择lod
在视锥剔除后,若可见,可以根据深度选择lod。(我这里的例子跳过了原始模型LOD0:深度小于25米时用第一级,25~50米时用第二级,不小于50米时用第三级)
// Select the LOD level of an instance from the view-space depth of its bounding-sphere centre.
//   index:   index of the instance in modelMatrices.
//   returns: 1 when depth < 25, 2 when depth < 50, 3 otherwise. Level 0 is never returned.
int Lod(uint index)
{
    vec4 sphereBounds = cullData.spherebounds;
    mat4 modelMatrix = modelMatrices[index];

    vec4 centerMV = cullData.view * modelMatrix * vec4(sphereBounds.xyz, 1.f);
    // Negate view-space z so that depth in front of the camera is positive.
    float depth = -(centerMV.z / centerMV.w);

    if (depth < 25.f)
        return 1;
    if (depth < 50.f)
        return 2;
    return 3;
}
写到对应的间接渲染缓冲
if (visible)
{
int lod = Lod(idx);
// Reserve a slot in this LOD's draw command; atomicAdd returns the previous instance count.
uint instanceIndex = atomicAdd(drawCommands[lod].instanceCount, 1);
// Each LOD owns a region of cullData.drawCount entries in ids, starting at lod * drawCount.
ids[lod * cullData.drawCount + instanceIndex] = idx;
}
那么后面渲染时,实例编号就变为
uint instance_id = ids[gl_DrawID * cullData.drawCount + gl_InstanceID];
gl_DrawID为间接渲染draw call 编号,cullData.drawCount即为总实例数。
举例,若有10000个实例,lod有4级。那么
glMultiDrawElementsIndirect(GL_TRIANGLES, GL_UNSIGNED_INT, 0, 4, 0);
drawCommands[4],间接渲染缓冲长度为4*sizeof(DrawCommand)。 ids[4*10000],实例编号缓冲长度为4*10000*sizeof(uint)。比如,lod2级有100个实例,那么,第20000~20099个位置(起始位置为2*10000),存放这100个实例的编号,drawCommands[2].instanceCount=100。
性能分析
以1920x1080的分辨率,开启16倍MSAA,运行程序。
使用RenderDoc截帧分析,可以看到有一万个模型(由身体,剑,盾牌,头盔组成),每个模型大约有2万个顶点
使用Nsight Graphics截帧分析,可以看到每帧约耗时10ms,其中gpu剔除仅0.02ms
使用任务管理器,查看此时GPU RTX 2060的使用情况
由于使用间接渲染,CPU端的耗时主要来自更新一万个模型的位姿矩阵(16x4 byte)、动画片段序号(4 byte)和帧号(4 byte),即每个模型72 byte,共10000x72 byte = 720000 byte ≈ 700KB