群体角色动画实时渲染技术研究

279 阅读8分钟

群体角色动画实时渲染技术研究

本文基于OpenGL 4.6(GLSL 460),使用DSA(Direct State Access)模式,实现一万个士兵的动画渲染。
项目地址:limingyin18/car (github.com) Recording2024-09-21043111-ezgif.com-optimize.gif
方法主要由实例化渲染,模型LOD,间接渲染,基于GPU的视锥剔除,角色动画构成。 基本的opengl使用,模型读取,骨骼动画暂时不在这里写,参考已经有很好的资料。

实例化渲染

使用glDrawElementsInstanced函数,进行实例化渲染。即渲染相同网格,相同材质,但是位置不一样时,只调用一次draw call,不需要重复调用glDrawElements,减少CPU与GPU之间的通信,提升了速度。 一般地:

// One draw call renders instance_count instances of the same indexed mesh.
glDrawElementsInstanced(GL_TRIANGLES, indices_count, GL_UNSIGNED_INT, 0, instance_count);

相当于:

// Conceptual equivalent of the instanced call: one glDrawElements per instance.
// NOTE(review): only the instanced call advances gl_InstanceID; with plain
// glDrawElements the shader would see instance 0 on every iteration.
for (int i = 0; i < instance_count ; i++) {
    glDrawElements(GL_TRIANGLES, indices_count, GL_UNSIGNED_INT, 0);
}

在顶点着色器中,由内建变量gl_InstanceID来区分当前正在进行哪个实例的渲染。可以用来索引数组,如位姿矩阵:

// Shader storage block at binding point 0 holding one model (pose) matrix per instance.
layout(std430, binding = 0) buffer InstanceBuffer
{
    // Runtime-sized array; element i belongs to instance i.
    mat4 modelMatrices[];
};

// Fetch the matrix of the instance currently being processed.
mat4 model = modelMatrices[gl_InstanceID];

位姿矩阵缓冲使用ssbo类型,其创建、绑定和更新是:

// Create: allocate immutable storage sized for all instance matrices and upload
// the initial contents. GL_DYNAMIC_STORAGE_BIT is what allows the later
// glNamedBufferSubData update.
glCreateBuffers(1, &instance_transforms_buffer_);
glNamedBufferStorage(instance_transforms_buffer_, instance_transforms_.size() * sizeof(glm::mat4),
                     instance_transforms_.data(), GL_DYNAMIC_STORAGE_BIT);

// Bind: attach to shader-storage binding point 0 (the `binding = 0` of InstanceBuffer).
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, instance_transforms_buffer_);

// Update: overwrite the whole buffer with the current CPU-side matrices.
glNamedBufferSubData(instance_transforms_buffer_, 0, instance_transforms_.size() * sizeof(glm::mat4),
                     instance_transforms_.data());

可以让美术制作纹理,相当于色板,能够偏移改变模型某些区域的颜色,从而实现如士兵在不同的阵营等。 (拓展,无绑定,虚拟纹理virtual texture?)

动画

由于使用实例化渲染,动画要先写到ssbo中,才能被gl_InstanceID索引。

动画烘焙

    // bake animation
    float ticks_per_second = animations_[animation_clip_type]->GetTicksPerSecond();
    float dt = 10.0f / ticks_per_second;
    for (int i = 0; i <= frame_counts_[animation_clip_type] / 10; i++)
    {
        if (i == 0)
        {
            animator_->UpdateAnimation(0);
        }
        else
        {
            animator_->UpdateAnimation(dt);
        }
        auto transforms = animator_->GetFinalBoneMatrices();

        for (int j = 0; j < MAX_BONES; j++)
        {
            bone_transforms_[animation_clip_type][i * MAX_BONES + j] = transforms[j];
        }
    }

i是烘焙后的第几帧(每10个tick采样一次),j是第几个骨骼(设定最多100个),即第2个动画片段,第3帧,第5号骨骼的位姿矩阵是bone_transforms_[2][3 * 100 + 5]。
烘焙是比较费时间的,所以一般这个数据会保存到硬盘中,程序启动时读取。

动画缓冲的构建、绑定

    frame_count_offset_.resize(bone_transforms_.size() + 1);
    frame_count_offset_[0] = 0;
    for (size_t i = 1; i < bone_transforms_.size() + 1; i++)
    {
        frame_count_offset_[i] = frame_count_offset_[i - 1] + bone_transforms_[i - 1].size();
    }
    glCreateBuffers(1, &frame_count_offset_buffer_object_);
    glNamedBufferStorage(frame_count_offset_buffer_object_, frame_count_offset_.size() * sizeof(uint32_t),
                         frame_count_offset_.data(), 0);

    glCreateBuffers(1, &bone_transforms_buffer_object_);
    glNamedBufferStorage(bone_transforms_buffer_object_, frame_count_offset_.back() * sizeof(glm::mat4), 0,
                         GL_DYNAMIC_STORAGE_BIT);
    for (size_t i = 0; i < bone_transforms_.size(); i++)
    {
        glNamedBufferSubData(bone_transforms_buffer_object_, frame_count_offset_[i] * sizeof(glm::mat4),
                             bone_transforms_[i].size() * sizeof(glm::mat4), bone_transforms_[i].data());
    }
    
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, bone_transforms_buffer_object_);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, frame_count_offset_buffer_object_);

frame_count_offset是各动画片段的矩阵数量(烘焙帧数×MAX_BONES)的前缀和,从0开始。所有动画片段都写到同一个缓冲中,用frame_count_offset确定每个片段在缓冲中的起始位置。

实例化使用

在着色器中,先定义每个实例的数据:动画的片段,当前帧

// Per-instance animation state: which clip is playing and which baked frame to sample.
struct ObjectData
{
    uint clip;
    uint frame;
};
// One ObjectData per instance, in a shader storage block at binding point 2.
layout(std430, binding = 2) buffer ObjectDataBuffer
{
    ObjectData objectData[];
};
// Vertex attribute 5: the bone indices that influence this vertex.
layout(location = 5) in ivec4 boneIds;

    // For every bone influencing the vertex, fetch its baked matrix.
    // NOTE(review): frameCount and BonesMatrices1 are not declared in this
    // excerpt; presumably they are the buffers bound at points 4 and 3 — confirm.
    for (int i = 0; i < MAX_BONE_INFLUENCE; i++)
    {
        mat4 BoneMatrices = mat4(1.0f);
        // Position of the bone inside the clip: frame-major, MAX_BONES matrices per frame.
        uint index = objectData[gl_InstanceID].frame * MAX_BONES + boneIds[i];
        // Start of this instance's clip inside the shared matrix buffer.
        uint frameClipOffset = frameCount[objectData[gl_InstanceID].clip];
        BoneMatrices = BonesMatrices1[frameClipOffset + index];
        ...
    }

boneIds[i]为影响当前顶点的第i根骨骼的编号(设定每个顶点最多受4根骨骼影响)。

实例化数据缓冲的构建、绑定和更新

// CPU-side per-instance animation state: two 32-bit unsigned fields, in the same
// order as the shader's ObjectData { clip, frame }.
struct ObjectDataAnimation
{
    uint32_t animation_clip_ = 0; // index of the animation clip being played
    uint32_t frame_ = 0;          // baked frame to sample within that clip
};
// One entry per instance.
std::vector<ObjectDataAnimation> object_data_;

// Create: allocate storage for every instance and upload the initial state.
// GL_DYNAMIC_STORAGE_BIT allows the glNamedBufferSubData update below.
glCreateBuffers(1, &object_data_buffer_);
glNamedBufferStorage(object_data_buffer_, object_data_.size() * sizeof(ObjectDataAnimation), 
                     object_data_.data(), GL_DYNAMIC_STORAGE_BIT);

// Update: overwrite the whole buffer with the current CPU-side state.
glNamedBufferSubData(object_data_buffer_, 0, object_data_.size() * sizeof(ObjectDataAnimation),
                     object_data_.data());

间接渲染

使用glDrawElementsIndirect间接渲染,实现基于GPU的视锥剔除,提升渲染效率。即draw call的参数来源于gpu中的buffer。可以使用计算着色器进行视锥剔除,并将结果写到一个gpu的buffer中,渲染命令读取该buffer,进行渲染。不需要gpu的计算结果写回cpu中,减少了cpu与gpu之间的通信。
一般地,渲染指令参数buffer的内容为:

// Layout of one indirect draw command as read by glDrawElementsIndirect /
// glMultiDrawElementsIndirect from the GL_DRAW_INDIRECT_BUFFER.
typedef  struct {
    uint  count;          // number of indices to draw
    uint  instanceCount;  // number of instances to draw
    uint  firstIndex;     // first index to read, in units of indices (not bytes)
    int  baseVertex;      // constant added to every index before fetching vertices
    uint  baseInstance;   // base instance used when fetching instanced vertex attributes
} DrawElementsIndirectCommand;

创建和绑定间接渲染缓冲

// CPU-side copy of the draw command. Value-initialised so that every field
// (notably instanceCount) is a defined 0 rather than an indeterminate value
// when it is uploaded below.
DrawElementsIndirectCommand indirect_command_{};
// Placeholder handle; glCreateBuffers overwrites it with the real buffer name.
uint32_t indirect_buffer_ = -1;
glCreateBuffers(1, &indirect_buffer_);
// Allocate storage for exactly one command and upload the initial values.
// GL_DYNAMIC_STORAGE_BIT keeps the buffer updatable from the CPU via glNamedBufferSubData.
glNamedBufferStorage(indirect_buffer_, sizeof(DrawElementsIndirectCommand), &indirect_command_,
                     GL_DYNAMIC_STORAGE_BIT);
// Indirect draw calls read their parameters from the buffer bound to this target.
glBindBuffer(GL_DRAW_INDIRECT_BUFFER, indirect_buffer_);

那么此时

// The last argument is the byte offset of the command inside the buffer bound
// to GL_DRAW_INDIRECT_BUFFER; 0 means the command at the start of the buffer.
glDrawElementsIndirect(GL_TRIANGLES, GL_UNSIGNED_INT, 0);

相当于:

// count, firstIndex, instanceCount, baseVertex and baseInstance are the fields
// of the DrawElementsIndirectCommand stored in the indirect buffer.
// firstIndex counts indices, whereas this entry point expects a byte offset
// into the element array buffer, hence the multiplication by the index size.
glDrawElementsInstancedBaseVertexBaseInstance(GL_TRIANGLES,
                                              count,
                                              GL_UNSIGNED_INT,
                                              reinterpret_cast<const void *>(firstIndex * sizeof(GLuint)),
                                              instanceCount,
                                              baseVertex,
                                              baseInstance);

如果令firstIndex=0,baseVertex=0,baseInstance=0,那么此时就是实例化渲染命令,count是索引数,instanceCount是实例数。 间接渲染缓冲indirect_buffer的内容可以在计算着色器中读写。

视锥剔除

在计算着色器中,定义间接渲染缓冲:

// Same as the OpenGL defined struct: DrawElementsIndirectCommand
// GLSL mirror of the OpenGL DrawElementsIndirectCommand struct (five 32-bit fields).
struct DrawCommand
{
    uint count;         // Number of indices to draw
    uint instanceCount; // Number of instances to draw (a.k.a. primcount)
    uint firstIndex;    // Index of the first element to read from the buffer bound to
                        // GL_ELEMENT_ARRAY_BUFFER, in units of indices (not a byte offset)
    uint baseVertex;    // Constant added to each index when choosing elements from the
                        // enabled vertex arrays (declared as a signed int in the C struct)
    uint baseInstance;  // Base instance used when fetching instanced vertex attributes
};

// The indirect draw buffer, exposed to the compute shader as a storage block at
// binding point 2 so the culling pass can write the command that the draw call reads.
layout(std430, binding = 2) buffer DrawCommandBuffer
{
    DrawCommand drawCommand;
};

每一个实例进行可见性判定,若可见,则渲染的实例数加1,该实例编号写到一个缓冲中

// Compacted list of the instances that passed culling: ids[k] is the original
// index of the k-th visible instance.
layout(std430, binding = 1) buffer InstanceIDBuffer
{
    uint ids[];
};

const uint idx = gl_GlobalInvocationID.x; // Compute space is 1D where x in [0, N)
// Invocations beyond the instance count exist whenever the dispatch size is
// rounded up to a whole number of work groups; the guard keeps them from
// reading modelMatrices out of range (&& short-circuits, so IsVisible is skipped).
const bool visible = idx < cullData.drawCount && IsVisible(idx);

if (visible)
{
    // atomicAdd returns the value before the increment, giving every visible
    // instance its own slot while instanceCount ends up as the visible total.
    // NOTE(review): instanceCount has to be reset to 0 before each culling
    // dispatch; that reset is not part of this excerpt.
    uint instanceIndex = atomicAdd(drawCommand.instanceCount, 1);
    ids[instanceIndex] = idx;
}

那么后面渲染时,实例编号就变为

// gl_InstanceID now counts only visible instances; map it back to the original instance index.
uint instance_id = ids[gl_InstanceID];

可见性判定

每一个实例模型用一个球体包围,判定球体是否在相机的视锥范围内。(图:视锥剔除)球体的中心和半径分别取模型AABB包围盒的中心和半对角线长。结合每个模型的位姿矩阵和相机参数:

// CPU-side mirror of the CameraCullingData struct that the culling compute
// shader reads from a std140 uniform block; field order matches the GLSL declaration.
struct CameraCullingData
{
    glm::mat4 view;            // camera view matrix
    alignas(16) float fov;     // field of view, stored in radians; explicitly 16-byte aligned
    float aspect;              // camera aspect ratio
    float znear, zfar;         // near / far plane distances

    glm::vec4 spherebounds;    // mesh bounding sphere: xyz = centre, w = radius
    uint32_t drawCount;        // total number of instances
};
CameraCullingData camera_culling_data_;

camera_culling_data_.view = camera_->GetView();
camera_culling_data_.fov = glm::radians(camera_->GetFov());
camera_culling_data_.aspect = camera_->GetAspect();
camera_culling_data_.znear = camera_->GetNear();
camera_culling_data_.zfar = camera_->GetFar();

glm::vec3 aabb_min = mesh_->GetAABBMin();
glm::vec3 aabb_max = mesh_->GetAABBMax();
glm::vec3 center = (aabb_min + aabb_max) * 0.5f;
glm::vec3 extent = aabb_max - center;
float radius = glm::length(extent);
camera_culling_data_.spherebounds = glm::vec4(center, radius);
camera_culling_data_.drawCount = instance_transforms_.size();
视锥剔除缓冲构建、绑定和更新
// Create: allocate storage for one CameraCullingData and upload the initial values.
// GL_DYNAMIC_STORAGE_BIT allows the glNamedBufferSubData update below.
glCreateBuffers(1, &camera_culling_data_buffer_);
glNamedBufferStorage(camera_culling_data_buffer_, sizeof(CameraCullingData), 
                     &camera_culling_data_,GL_DYNAMIC_STORAGE_BIT);

// Update: re-upload the struct after the camera data has changed.
glNamedBufferSubData(camera_culling_data_buffer_, 0, sizeof(CameraCullingData), 
                     &camera_culling_data_);

// Bind: attach as a uniform buffer at binding point 3, the `binding = 3` of the
// compute shader's CameraCullingDataBuffer block.
glBindBufferBase(GL_UNIFORM_BUFFER, 3, camera_culling_data_buffer_);
可见性剔除

计算着色器中

// Camera and bounding-sphere parameters used by the culling pass; same field
// order as the CameraCullingData struct on the C++ side.
struct CameraCullingData
{
    mat4 view;          // camera view matrix
    float fov;          // field of view in radians
    float aspect;       // camera aspect ratio
    float znear, zfar;  // near / far plane distances

    vec4 spherebounds;  // mesh bounding sphere: xyz = centre, w = radius
    uint drawCount;     // total number of instances
};

// Uniform block at binding point 3, laid out with std140 rules.
layout(std140, binding = 3) uniform CameraCullingDataBuffer
{
    CameraCullingData cullData;
};

// Per-instance model matrices; the same storage block the vertex shader uses at binding point 0.
layout(std430, binding = 0) buffer InstanceBuffer
{
    mat4 modelMatrices[];
};

// Bounding-sphere vs. view-frustum test for one instance, done in view space.
//   objectIndex: index of the instance in modelMatrices.
//   returns:     false when the sphere lies completely outside one of the six
//                frustum planes, true otherwise.
bool IsVisible(uint objectIndex)
{
    vec4 sphereBounds = cullData.spherebounds;
    mat4 modelMatrix = modelMatrices[objectIndex];

    // Sphere centre in view space. z is negated so that `center.z` is the
    // positive depth in front of the camera.
    vec4 centerMV = cullData.view * modelMatrix * vec4(sphereBounds.xyz, 1.f);
    vec3 center = centerMV.xyz / centerMV.w;
    center.z *= -1;

    // Scale the radius by the largest axis scale of the model matrix. The scale
    // of an axis is the length of its column; the diagonal entries alone shrink
    // (or turn negative) as soon as the matrix contains a rotation, which would
    // make the sphere too small and cull visible instances.
    float scale = max(length(modelMatrix[0].xyz),
                      max(length(modelMatrix[1].xyz), length(modelMatrix[2].xyz)));
    float radius = scale * sphereBounds.w;

    // z: near and far planes
    if (center.z > cullData.zfar + radius || center.z < cullData.znear - radius)
        return false;

    // y: top and bottom planes. At depth z the frustum half-height is
    // z * tan(fov / 2); a sphere touching the slanted plane from outside has
    // its centre radius / cos(fov / 2) further out along y.
    float tanHalfY = tan(cullData.fov / 2.f);
    float halfHeight = center.z * tanHalfY;
    float marginY = radius / cos(cullData.fov / 2.f);
    if (abs(center.y) > halfHeight + marginY)
        return false;

    // x: left and right planes. The horizontal half-angle satisfies
    // tan(halfX) = aspect * tan(fov / 2), so the margin is
    // radius / cos(halfX) = radius * sqrt(1 + tan(halfX)^2). Scaling the
    // vertical margin by the aspect ratio would under-estimate it for aspect < 1.
    float tanHalfX = tanHalfY * cullData.aspect;
    float halfWidth = center.z * tanHalfX;
    float marginX = radius * sqrt(1.f + tanHalfX * tanHalfX);
    if (abs(center.x) > halfWidth + marginX)
        return false;

    return true;
}

模型LOD

(mesh shader?)
这里使用meshoptimizer库,生成各个LOD的索引,即顶点缓冲不变,索引缓冲为LOD0的索引数组+LOD1的索引数组+LOD2的索引数组。 间接渲染时,先在计算着色器中进行基于距离的判断,选择LOD,写到对应的间接渲染命令的参数offset。

使用glMultiDrawElementsIndirect函数,来实现根据深度选择lod绘制。
相当于:

    // Pseudo-code: each of the drawcount commands stored back to back in the
    // indirect buffer is issued as its own instanced draw.
    for (n = 0; n < drawcount; n++) {
        const DrawElementsIndirectCommand *cmd;
        // n-th command in the buffer
        cmd = (const DrawElementsIndirectCommand  *)indirect + n;
        glDrawElementsInstancedBaseVertexBaseInstance(mode,
                                                      cmd->count,
                                                      type,
                                                      // firstIndex counts indices; convert to a byte offset
                                                      cmd->firstIndex * size-of-type,
                                                      cmd->instanceCount,
                                                      cmd->baseVertex,
                                                      cmd->baseInstance);
    }

生成lod

    // Build the LOD chain: de-duplicate and reorder the base mesh (LOD0), then
    // append three simplified index lists behind it in indices_. lods_ records
    // the index count of every level, LOD0 first.
    const size_t index_count = indices_.size();
    size_t vertex_count = vertices_.size();

    // The remap table holds one entry per input *vertex*, so it is sized by the
    // vertex count (sizing it by the index count overflows whenever the mesh
    // has more vertices than indices).
    std::vector<unsigned int> remap(vertex_count);
    const size_t remap_vertex_count = meshopt_generateVertexRemap(remap.data(), indices_.data(), index_count,
                                                                  vertices_.data(), vertex_count,
                                                                  sizeof(VertexSkeletal));
    std::vector<unsigned int> indices_new(index_count);
    meshopt_remapIndexBuffer(indices_new.data(), indices_.data(), index_count, remap.data());
    std::vector<VertexSkeletal> vertices_new(remap_vertex_count);
    meshopt_remapVertexBuffer(vertices_new.data(), vertices_.data(), vertex_count, sizeof(VertexSkeletal),
                              remap.data());

    // Adopt the remapped buffers without copying them.
    vertex_count = remap_vertex_count;
    vertices_.swap(vertices_new);
    indices_.swap(indices_new);

    // Reorder LOD0 in place for the post-transform cache, overdraw and vertex fetch.
    meshopt_optimizeVertexCache(indices_.data(), indices_.data(), index_count, vertex_count);
    meshopt_optimizeOverdraw(indices_.data(), indices_.data(), index_count, &vertices_[0].position_.x, vertex_count,
                             sizeof(VertexSkeletal), 1.05f);
    meshopt_optimizeVertexFetch(vertices_.data(), indices_.data(), index_count, vertices_.data(), vertex_count,
                                sizeof(VertexSkeletal));

    lods_.push_back(indices_.size());

    // One simplified level per error bound. The target index count is 0, so
    // simplification is limited by target_error alone.
    const float errors[] = {0.001f, 0.0125f, 0.015f};
    const float threshold = 0;
    const size_t target_index_count = size_t(index_count * threshold);
    const unsigned int options = 0; // meshopt_SimplifyX flags, 0 is a safe default
    for (const float target_error : errors)
    {
        float lod_error = 0.f;
        // Worst case the simplifier emits as many indices as it was given.
        std::vector<unsigned int> lod(index_count);

        // Always simplify from LOD0: the first index_count entries of indices_
        // stay the base mesh even as later levels are appended behind them.
        lod.resize(meshopt_simplify(lod.data(), indices_.data(), index_count, &vertices_[0].position_.x,
                                    vertices_.size(), sizeof(VertexSkeletal), target_index_count, target_error,
                                    options, &lod_error));

        lods_.push_back(lod.size());
        indices_.insert(indices_.end(), lod.begin(), lod.end());
    }

lods_[i]是第i级LOD的索引数量(lods_[0]为原始模型)。

选择lod

在视锥剔除后,若实例可见,可以根据深度选择LOD。(这里的例子跳过了原始模型LOD0:深度小于25米时使用第一级;25~50米时使用第二级;大于50米时使用第三级)

// Chooses the LOD level of an instance from its view-space depth.
//   index:   index of the instance in modelMatrices.
//   returns: 1 below 25 units, 2 below 50 units, 3 otherwise (level 0, the
//            original mesh, is never selected here).
int Lod(uint index)
{
    const float kLod1MaxDepth = 25.f;
    const float kLod2MaxDepth = 50.f;

    // Depth of the bounding-sphere centre in front of the camera; the view-space
    // z is negated so the value is positive for objects ahead of the camera.
    vec4 centerMV = cullData.view * modelMatrices[index] * vec4(cullData.spherebounds.xyz, 1.f);
    float depth = -centerMV.z / centerMV.w;

    if (depth < kLod1MaxDepth)
    {
        return 1;
    }
    if (depth < kLod2MaxDepth)
    {
        return 2;
    }
    return 3;
}

写到对应的间接渲染缓冲

    if (visible)
    {
        // Each LOD level owns one draw command and one drawCount-sized region of
        // ids; the atomic counter hands out the next free slot in that region.
        const int level = Lod(idx);
        const uint slot = atomicAdd(drawCommands[level].instanceCount, 1);
        ids[level * cullData.drawCount + slot] = idx;
    }

那么后面渲染时,实例编号就变为

// gl_DrawID selects the region of ids that belongs to the current draw command
// (one per LOD level); gl_InstanceID indexes the visible instances inside it.
uint instance_id = ids[gl_DrawID * cullData.drawCount + gl_InstanceID];

gl_DrawID为间接渲染draw call 编号,cullData.drawCount即为总实例数。
举例,若有10000个实例,lod有4级。那么

// Arguments after the index type: byte offset of the first command in the
// indirect buffer (0), number of commands (4, one per LOD level), and the
// stride between commands (0 = tightly packed).
glMultiDrawElementsIndirect(GL_TRIANGLES, GL_UNSIGNED_INT, 0, 4, 0);

drawCommands[4],间接渲染缓冲长度为4*sizeof(DrawCommand)。 ids[4*10000],实例编号缓冲长度为4*10000*sizeof(uint)。比如,lod2级有100个实例,那么ids的第20000~20099个位置(从2*10000开始)存放这100个实例的编号,drawCommands[2].instanceCount=100。

性能分析

以1920x1080的分辨率,开启16倍MSAA,运行程序。

Screenshot 2024-09-21 050704.png
使用RenderDoc截帧分析,可以看到有一万个模型(由身体,剑,盾牌,头盔组成),每个模型大约有2万个顶点

Screenshot 2024-09-21 044608.png
Screenshot 2024-09-21 051609.png 使用Nsight Graphics截帧分析,查看约耗时10ms,gpu剔除仅0.02ms Screenshot 2024-09-21 050018.png
使用任务管理器,查看此时GPU RTX 2060的使用情况

由于使用间接渲染,CPU端的耗时主要由更新一万个模型的位姿矩阵(16x4 byte),动画片段序号(4 byte)和帧号(4 byte)组成,10000x72 byte = 720000 byte ≈ 700KB

参考

  1. LearnOpenGL - Instancing
  2. LearnOpenGL - Model
  3. LearnOpenGL - Skeletal Animation
  4. Draw Indirect - Vulkan Guide (vkguide.dev)
  5. Compute based Culling - Vulkan Guide (vkguide.dev)