命令流

图形设备的命令不能直接写入到硬件中，而是写入到一个环形的buffer 中，再由命令解析器发起DMA 读取这些命令。这个环形buffer 在内核驱动中叫做ring buffer。ring buffer 中还可以写入一条命令，让命令解析器去指定的地址读取另一个存放命令的buffer，这个另外的buffer 叫做batch buffer。

ring buffer

ring buffer 的位置由ring buffer 寄存器定义：寄存器定义了ring buffer 的范围（start 和length），还包含两个offset（head 和tail）。命令流解析器通过head 和tail 来判断当前是否有需要执行的命令。一个rq 要执行的命令位于head 和tail 之间的内存区域中。当head 和tail 相等时，表示当前request 的命令已经执行完成。

每个引擎都有自己的ring buffer，以及与之对应的ring buffer 寄存器。根据intel 的文档介绍，ring buffer 不支持system cache，必须是no cache。

batch buffer

batch buffer 从MI_BATCH_BUFFER_START 命令开始，到MI_BATCH_BUFFER_END 命令结束。

用户态batch buffer

用户态程序通过OpenGL API调用完成状态机设置，mesa中iris 用户态驱动将命令写入到batchbuffer中。用户态一个batchbuffer 可以由多个bo 组成。

每个用户态的context 会创建3个batch buffer，分别对应render 引擎、计算引擎、blitter 引擎。三个batch buffer 之间可能存在依赖关系，比如一个bo 在render 引擎的batch buffer 中，同时也在blitter 引擎的batch buffer 中，这个时候需要判断依赖关系，一旦有依赖就需要将被依赖的batch buffer 先提交给gpu。每个batch buffer 都会绑定当前的gem context，这样batch 下发到内核驱动的时候，内核驱动就知道这个batch buffer 是哪个context 的。

mesa 中intel batch 的操作都在iris_batch.c 文件中。无论是命令流还是数据，都是写入到batch 的bo 中。将一个bo 加入到batch 中是由add_bo_to_batch 函数实现的，这个bo 可以存放命令，也可以存放数据。

/**
 * Append a BO to the list of buffers referenced by this batch.
 *
 * The BO may hold commands or data.  The batch takes its own reference on
 * the BO, records the slot it was placed in, and updates the per-batch
 * bookkeeping (written bitset, total size, largest GEM handle).
 *
 * \param batch     batch whose BO list is extended
 * \param bo        buffer to add
 * \param writable  true if the batch writes to this BO
 */
static void
add_bo_to_batch(struct iris_batch *batch, struct iris_bo *bo, bool writable)
{
   /* A free slot must already be available in exec_bos. */
   assert(batch->exec_count < batch->exec_array_size);

   /* The batch keeps a reference for as long as the BO is listed. */
   iris_bo_reference(bo);

   /* Place the BO in the next free slot and remember that slot on the BO. */
   bo->index = batch->exec_count;
   batch->exec_bos[batch->exec_count] = bo;

   /* Track which slots are written by the batch. */
   if (writable) {
      BITSET_SET(batch->bos_written, batch->exec_count);
   }

   /* Running total of the sizes of all BOs in the batch. */
   batch->aperture_space += bo->size;

   /* Largest GEM handle among the backing BOs seen so far. */
   batch->max_gem_handle =
      MAX2(iris_get_backing_bo(bo)->gem_handle, batch->max_gem_handle);

   /* One more BO is now part of the batch. */
   batch->exec_count += 1;
}

gpu command 填写

如果是gpu command的下发，会先调用iris_chain_to_new_batch 函数，分配一个bo 给batch buffer。

/**
 * Finish the current batch BO by chaining it to a freshly allocated one.
 *
 * Space for the chaining command is reserved in the current BO first; the
 * command itself is only filled in after the new BO exists, because it has
 * to carry the new BO's address.
 */
void
iris_chain_to_new_batch(struct iris_batch *batch)
{
   /* Reserve the chaining command at the current write cursor: a header
    * slot (cmd) followed by an address slot (addr), then advance the cursor.
    * NOTE(review): this assumes map_next is a byte-granular pointer
    * (void or char pointer), making the layout 4 + 8 = 12 bytes -- confirm
    * against the definition of struct iris_batch.
    */
   uint32_t *cmd = batch->map_next;
   uint64_t *addr = batch->map_next + 4;
   batch->map_next += 12;

   record_batch_sizes(batch);

   /* No longer held by batch->bo, still held by validation list */
   iris_bo_unreference(batch->bo);
   /* Allocate a new BO and add it to the batch; batch->bo now refers to the
    * new BO, while cmd/addr still point into the previous one.
    */
   create_batch(batch);

   /* Emit MI_BATCH_BUFFER_START to chain to another batch.
    *
    * The header is followed by the address of the new BO (its PPGTT address,
    * per the article's author), written into the previous BO so that the GPU
    * command parser jumps to the next BO and keeps parsing there.
    */
   *cmd = (0x31 << 23) | (1 << 8) | (3 - 2);
   *addr = batch->bo->address; 
}

/**
 * Allocate a fresh command-buffer BO for the batch.
 *
 * The new BO is mapped for CPU access, the write cursor is reset to the
 * start of the mapping, and the BO is registered in the batch's BO list.
 *
 * \param batch  batch that receives the new command buffer
 */
void
create_batch(struct iris_batch *batch)
{
   struct iris_screen *screen = batch->screen;
   struct iris_bufmgr *bufmgr = screen->bufmgr;

   /* TODO: We probably could suballocate batches... */
   batch->bo = iris_bo_alloc(bufmgr, "command buffer",
                             BATCH_SZ + BATCH_RESERVED, 8,
                             IRIS_MEMZONE_OTHER, BO_ALLOC_NO_SUBALLOC);
   /* Flag the backing BO with EXEC_OBJECT_CAPTURE (i915 uAPI: request that
    * the object be included in GPU error-state dumps).
    */
   iris_get_backing_bo(batch->bo)->real.kflags |= EXEC_OBJECT_CAPTURE;

   /* Map the BO so the CPU can read and write it. */
   batch->map = iris_bo_map(NULL, batch->bo, MAP_READ | MAP_WRITE);

   /* map is the start of the mapping; map_next is the write cursor and
    * begins at that start.
    */
   batch->map_next = batch->map;

   /* Make sure the batch has room to record one more BO, then add the
    * command buffer itself to the batch's BO list (not marked as written).
    */
   ensure_exec_obj_space(batch, 1);
   add_bo_to_batch(batch, batch->bo, false);
}

为batch buffer 生成一个bo 之后，就可以调用iris_get_command_space 获取特定大小的空间，然后直接写入命令。

/**
 * Reserve space for a command in the batch and return a pointer to it.
 *
 * \param batch  batch to reserve space in
 * \param bytes  number of bytes to reserve
 * \return pointer to the start of the reserved region; the caller writes
 *         the command there directly
 */
void *
iris_get_command_space(struct iris_batch *batch, unsigned bytes)
{
   /* Presumably makes sure `bytes` are available at the cursor (its
    * definition is not shown here -- confirm).
    */
   iris_require_command_space(batch, bytes);

   /* Hand out the current cursor position and move the cursor past it. */
   void *map = batch->map_next;
   batch->map_next += bytes;
   return map;
}

通过 blorp_emit_dwords 预先分配数据拷贝需要的空间

/**
 * Reserve room for `n` dwords in the iris batch behind a blorp batch.
 *
 * \param blorp_batch  blorp batch whose driver_batch is the iris batch
 * \param n            number of 32-bit dwords to reserve
 * \return pointer to the reserved command space
 */
void *
blorp_emit_dwords(struct blorp_batch *blorp_batch, unsigned n)
{
   struct iris_batch *batch = blorp_batch->driver_batch;

   /* Command space is handed out in bytes; convert from dwords. */
   return iris_get_command_space(batch, n * sizeof(uint32_t));
}

/*
 * blorp_emit(batch, cmd, name) { ...fill in fields of `name`... }
 *
 * Single-iteration `for` loop used as a scoped statement:
 *  - init:      declares `struct cmd name`, initialised with the command's
 *               header, and `_dst`, pointing at space reserved in the batch
 *               via blorp_emit_dwords() for _blorp_cmd_length(cmd) dwords;
 *  - condition: runs the body while `_dst` is non-NULL (hinted as likely);
 *  - step:      after the body, packs `name` into `_dst` with the command's
 *               pack function and sets `_dst` to NULL so the loop ends.
 */
#define blorp_emit(batch, cmd, name)                              \
   for (struct cmd name = { _blorp_cmd_header(cmd) },             \
        *_dst = blorp_emit_dwords(batch, _blorp_cmd_length(cmd)); \
        __builtin_expect(_dst != NULL, 1);                        \
        _blorp_cmd_pack(cmd)(batch, (void *)_dst, &name),         \
        _dst = NULL)

mesa中通过blorp_emit 将3D 命令写入到batch buffer中。

gpu bind table/surface 填写

batch 中除了存放gpu command 还包含了一些bind table，surface 等信息，这些已经有现成的bo，通过iris_use_pinned_bo 将现成的bo 加入到batch list中。

提交batch buffer

在mesa intel 驱动中submit_batch 函数负责执行ioctl 将batch 下发给驱动。

/*
 * Build the execbuffer2 request for a batch and submit it to the kernel
 * with DRM_IOCTL_I915_GEM_EXECBUFFER2.
 *
 * Returns 0 on success, or -errno if the ioctl fails.  Part of the function
 * body is elided in this excerpt.
 */
submit_batch(struct iris_batch *batch)
{
   struct iris_bufmgr *bufmgr = batch->screen->bufmgr;
   simple_mtx_t *bo_deps_lock = iris_bufmgr_get_bo_deps_lock(bufmgr);

   iris_bo_unmap(batch->bo);

   /* One exec object per BO at most.
    * NOTE(review): the results of malloc/calloc below are not checked in
    * this excerpt.
    */
   struct drm_i915_gem_exec_object2 *validation_list =
      malloc(batch->exec_count * sizeof(*validation_list));

   /* Maps a GEM handle to its slot in validation_list, so that a backing BO
    * referenced several times is only listed once.
    */
   unsigned *index_for_handle =
      calloc(batch->max_gem_handle + 1, sizeof(unsigned));

   unsigned validation_count = 0;
   /* Walk every BO that was added to the batch. */
   for (int i = 0; i < batch->exec_count; i++) {
      struct iris_bo *bo = iris_get_backing_bo(batch->exec_bos[i]);
      assert(bo->gem_handle != 0);

      bool written = BITSET_TEST(batch->bos_written, i);
      unsigned prev_index = index_for_handle[bo->gem_handle];
      /* NOTE(review): a stored index of 0 cannot be told apart from "not
       * seen yet" (the table is zero-filled), so a repeat of the BO in
       * slot 0 would be added again -- presumably slot 0 is the batch BO
       * and never repeats; confirm.
       */
      if (prev_index > 0) {
         /* Already listed: only upgrade it to written if needed.  Anything
          * that is written (e.g. render targets) must carry
          * EXEC_OBJECT_WRITE.
          */
         if (written)
            validation_list[prev_index].flags |= EXEC_OBJECT_WRITE;
      } else {
         index_for_handle[bo->gem_handle] = validation_count;
         /* Record the BO's GEM handle, its GPU (PPGTT) virtual address and
          * its flags in a drm_i915_gem_exec_object2 entry.
          */
         validation_list[validation_count] =
            (struct drm_i915_gem_exec_object2) {
               .handle = bo->gem_handle,
               .offset = bo->address, /* the kernel uses this address for the PPGTT mapping too */
               /* Non-external BOs are additionally marked EXEC_OBJECT_ASYNC. */
               .flags  = bo->real.kflags | (written ? EXEC_OBJECT_WRITE : 0) |
                         (iris_bo_is_external(bo) ? 0 : EXEC_OBJECT_ASYNC),
            };
         ++validation_count;
      }
   }

   free(index_for_handle);

 
   struct drm_i915_gem_execbuffer2 execbuf = {
      .buffers_ptr = (uintptr_t) validation_list,
      .buffer_count = validation_count,
      .batch_start_offset = 0,
      /* This must be QWord aligned. */
      .batch_len = ALIGN(batch->primary_batch_size, 8),
      /* I915_EXEC_NO_RELOC: per the article's author, keeps the kernel from
       * re-placing the VMAs, so the offsets userspace sees match the
       * kernel's.
       * I915_EXEC_BATCH_FIRST: the batch BO is the first entry of the
       * buffer list, and execution starts from it.
       */
      .flags = batch->exec_flags |
               I915_EXEC_NO_RELOC |
               I915_EXEC_BATCH_FIRST |
               I915_EXEC_HANDLE_LUT,
      .rsvd1 = batch->ctx_id, /* rsvd1 carries the GEM context id this batch belongs to */
   };

   /* With fences attached, the cliprects fields carry the fence array
    * (count and pointer) and I915_EXEC_FENCE_ARRAY is set.
    */
   if (num_fences(batch)) {
      execbuf.flags |= I915_EXEC_FENCE_ARRAY;
      execbuf.num_cliprects = num_fences(batch);
      execbuf.cliprects_ptr =
         (uintptr_t)util_dynarray_begin(&batch->exec_fences);
   }

   /* The ioctl is skipped entirely when devinfo.no_hw is set. */
   int ret = 0;
   if (!batch->screen->devinfo.no_hw &&
       intel_ioctl(batch->screen->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf))
      ret = -errno;

 ....
   return ret;
}

内核驱动处理batch buffer

i915_gem_do_execbuffer函数处理用户态下发的batchbuffer，用户态的batch buffer，可能使用了很多bo。每个bo 在分配的时候都有一个gem handle。内核通过handle 找到对应的obj。

graph TD
i915_gem_do_execbuffer -->eb_select_context --> eb_lookup_vmas --> eb_pin_vma--> i915_request_create --> eb_submit --> i915_request_add

eb_select_context根据下发batch buffer中gem context id 选择intel context。 eb_lookup_vmas 判断batch buffer 带的bo 有没有已经分配vma，如果内核还没有给这个bo 分配vma，则给分配出来。 i915_request_create 创建一个request请求，创建rq的时候从引擎的ring buffer空间中预先分配了一个空间。intel_ring_begin来获取这个地址并且填写ring buffer。eb_submit来指定ring buffer要读取的batch buffer 地址。

/*
 * Emit the batch-buffer start for an execbuf request.
 *
 * Returns 0 on success or the error from eb_move_to_gpu() /
 * emit_bb_start().  Part of the function body is elided in this excerpt.
 */
static int eb_submit(struct i915_execbuffer *eb)
{
	int err;

	/*
	 * Per the article's author: ties the VMAs to the request, so a VMA's
	 * reference count only drops to zero after the request has completed.
	 */
	err = eb_move_to_gpu(eb);
	if (err)
		return err;

	.....

	/*
	 * Per the article's author: eb->batch is the VMA of the first BO and
	 * node.start is that BO's PPGTT virtual address; the batch start
	 * offset is added on top of it.
	 */
	err = eb->engine->emit_bb_start(eb->request,
					eb->batch->node.start +
					eb->batch_start_offset,
					eb->batch_len,
					eb->batch_flags);
	if (err)
		return err;

	return 0;
}
gen9_emit_bb_start 填写一个rq 的ring buffer。
/*
 * Write the six-dword "start batch buffer" sequence for a request into its
 * ring: enable arbitration, MI_BATCH_BUFFER_START with the 64-bit batch
 * address, disable arbitration, and a trailing MI_NOOP.
 *
 * @rq:     request whose ring space is filled in
 * @offset: 64-bit address of the batch buffer to start
 * @len:    batch length (not used by this function)
 * @flags:  dispatch flags; I915_DISPATCH_SECURE leaves bit 8 of the
 *          MI_BATCH_BUFFER_START header clear
 *
 * Returns 0 on success, or the error encoded in the pointer returned by
 * intel_ring_begin().
 */
static int gen9_emit_bb_start(struct i915_request *rq,
			      u64 offset, u32 len,
			      const unsigned int flags)
{
	u32 bb_start = MI_BATCH_BUFFER_START_GEN8;
	u32 *ring;

	ring = intel_ring_begin(rq, 6);
	if (IS_ERR(ring))
		return PTR_ERR(ring);

	/*
	 * Bit 8 is set for every non-secure dispatch (presumably selecting
	 * the PPGTT address space -- confirm against the PRM).
	 */
	if (!(flags & I915_DISPATCH_SECURE))
		bb_start |= BIT(8);

	ring[0] = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	ring[1] = bb_start;
	ring[2] = lower_32_bits(offset);
	ring[3] = upper_32_bits(offset);

	ring[4] = MI_ARB_ON_OFF | MI_ARB_DISABLE;
	ring[5] = MI_NOOP;

	intel_ring_advance(rq, ring + 6);

	return 0;
}

intel gpu batchbuffer/ringbuffer

命令流