Ironic task_manager Source Code Analysis

Introduction to TaskManager

The TaskManager class is the core class Ironic uses to manage operations on a node. It is a context manager that handles acquiring and releasing the node lock, loading the driver, and running the actual work against the node. Below is a detailed walkthrough of the TaskManager class and its main methods.
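
Before diving into the source, here is a minimal sketch of how conductor code typically obtains a task. It assumes the module-level task_manager.acquire() helper (a thin wrapper around the TaskManager constructor shown below); the node UUID and the power-interface call are only for illustration.

from ironic.conductor import task_manager

# context is an existing request context; node_uuid identifies the node.
# Acquire an exclusive lock on the node; the lock and all cached resources
# are released automatically when the with-block exits.
with task_manager.acquire(context, node_uuid,
                          purpose='example power action') as task:
    task.driver.power.validate(task)   # work against task.node / task.driver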

class TaskManager(object):
    """Context manager for tasks.

    This class wraps the locking, driver loading, and acquisition
    of related resources (eg, Node and Ports) when beginning a unit of work.

    """

    def __init__(self, context, node_id, shared=False,
                 purpose='unspecified action', retry=True, patient=False,
                 load_driver=True):

        self._spawn_method = None
        self._on_error_method = None

        self.context = context
        self._node = None
        self._ports = None
        self._portgroups = None
        self._volume_connectors = None
        self._volume_targets = None
        self.node_id = node_id
        self.shared = shared
        self._retry = retry
        self._patient = patient

        self.fsm = states.machine.copy()
        self._purpose = purpose
        self._debug_timer = timeutils.StopWatch()

        # states and event for notification
        self._prev_provision_state = None
        self._prev_target_provision_state = None
        self._event = None
        self._saved_node = None

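        # Fetch the node, then either reserve it (exclusive task) or simply
        # attach it (shared task), and optionally build the driver. Any
        # failure here releases whatever was already acquired.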
        try:
            node = objects.Node.get(context, node_id)
            LOG.debug("Attempting to get %(type)s lock on node %(node)s (for "
                      "%(purpose)s)",
                      {'type': 'shared' if shared else 'exclusive',
                       'node': node.uuid, 'purpose': purpose})
            if not self.shared:
                self._lock()
            else:
                self._debug_timer.restart()
                self.node = node

            if load_driver:
                self.driver = driver_factory.build_driver_for_task(self)
            else:
                self.driver = None

        except Exception:
            with excutils.save_and_reraise_exception():
                self.release_resources()

    @property
    def node(self):
        return self._node

    @node.setter
    def node(self, node):
        self._node = node
        if node is not None:
            self.fsm.initialize(start_state=self.node.provision_state,
                                target_state=self.node.target_provision_state)

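    # Ports, portgroups, volume connectors and volume targets are loaded
    # lazily from the database on first access and cached on the task; a
    # failed load releases the task's resources before re-raising.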
    @property
    def ports(self):
        try:
            if self._ports is None:
                self._ports = objects.Port.list_by_node_id(self.context,
                                                           self.node.id)
        except Exception:
            with excutils.save_and_reraise_exception():
                self.release_resources()
        return self._ports

    @ports.setter
    def ports(self, ports):
        self._ports = ports

    @property
    def portgroups(self):
        try:
            if self._portgroups is None:
                self._portgroups = objects.Portgroup.list_by_node_id(
                    self.context, self.node.id)
        except Exception:
            with excutils.save_and_reraise_exception():
                self.release_resources()
        return self._portgroups

    @portgroups.setter
    def portgroups(self, portgroups):
        self._portgroups = portgroups

    @property
    def volume_connectors(self):
        try:
            if self._volume_connectors is None:
                self._volume_connectors = \
                    objects.VolumeConnector.list_by_node_id(
                        self.context, self.node.id)
        except Exception:
            with excutils.save_and_reraise_exception():
                self.release_resources()
        return self._volume_connectors

    @volume_connectors.setter
    def volume_connectors(self, volume_connectors):
        self._volume_connectors = volume_connectors

    @property
    def volume_targets(self):
        try:
            if self._volume_targets is None:
                self._volume_targets = objects.VolumeTarget.list_by_node_id(
                    self.context, self.node.id)
        except Exception:
            with excutils.save_and_reraise_exception():
                self.release_resources()
        return self._volume_targets

    @volume_targets.setter
    def volume_targets(self, volume_targets):
        self._volume_targets = volume_targets

    def load_driver(self):
        if self.driver is None:
            self.driver = driver_factory.build_driver_for_task(self)

    def _lock(self):
        self._debug_timer.restart()

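        # Decide how long to keep retrying the reservation: a "patient" task
        # retries forever, a normal task retries the configured number of
        # attempts, and retry=False means a single attempt.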
        if self._patient:
            stop_after = tenacity.stop_never
        elif self._retry:
            stop_after = tenacity.stop_after_attempt(
                CONF.conductor.node_locked_retry_attempts)
        else:
            stop_after = tenacity.stop_after_attempt(1)

        max_lock_time = \
            CONF.conductor.node_locked_retry_interval * \
            CONF.conductor.node_locked_retry_attempts

        @tenacity.retry(
            retry=tenacity.retry_if_exception_type(exception.NodeLocked),
            stop=stop_after,
            wait=tenacity.wait_fixed(
                CONF.conductor.node_locked_retry_interval),
            reraise=True)
        def reserve_node():
            if self._debug_timer.elapsed() > max_lock_time:
                LOG.warning('We have exceeded the normal maximum time window '
                            'to complete a node lock attempting to reserve '
                            'node %(node)s for purpose %(purpose)s. At '
                            '%(time).2f seconds.',
                            {'node': self.node_id, 'purpose': self._purpose,
                             'time': self._debug_timer.elapsed()})
            self.node = objects.Node.reserve(self.context, CONF.host,
                                             self.node_id)
            LOG.debug("Node %(node)s successfully reserved for %(purpose)s "
                      "(took %(time).2f seconds)",
                      {'node': self.node.uuid, 'purpose': self._purpose,
                       'time': self._debug_timer.elapsed()})
            self._debug_timer.restart()

        reserve_node()

    def upgrade_lock(self, purpose=None, retry=None):
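        # Turn a shared lock into an exclusive one by reserving the node; if
        # the task is already exclusive, only purpose/retry are updated.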

        if purpose is not None:
            self._purpose = purpose
        if retry is not None:
            self._retry = retry

        if self.shared:
            LOG.debug('Upgrading shared lock on node %(uuid)s for %(purpose)s '
                      'to an exclusive one (shared lock was held %(time).2f '
                      'seconds)',
                      {'uuid': self.node.uuid, 'purpose': self._purpose,
                       'time': self._debug_timer.elapsed()})
            self._lock()
            self.shared = False

    def spawn_after(self, _spawn_method, *args, **kwargs):
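        # Record a method (and its arguments) to be spawned by __exit__ once
        # the with-block completes without raising an exception.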

        self._spawn_method = _spawn_method
        self._spawn_args = args
        self._spawn_kwargs = kwargs

    def set_spawn_error_hook(self, _on_error_method, *args, **kwargs):

        self._on_error_method = _on_error_method
        self._on_error_args = args
        self._on_error_kwargs = kwargs

    def downgrade_lock(self):
        """Downgrade the lock to a shared one."""
        if self.node is None:
            raise RuntimeError("Cannot downgrade an already released lock")

        if not self.shared:
            objects.Node.release(self.context, CONF.host, self.node.id)
            self.shared = True
            self.node.refresh()
            LOG.debug("Successfully downgraded lock for %(purpose)s "
                      "on node %(node)s",
                      {'purpose': self._purpose, 'node': self.node.uuid})

    def release_resources(self):

        if not self.shared:
            try:
                if self.node:
                    objects.Node.release(self.context, CONF.host, self.node.id)
            except exception.NodeNotFound:
                # squelch the exception if the node was deleted
                # within the task's context.
                pass
        if self.node:
            LOG.debug("Successfully released %(type)s lock for %(purpose)s "
                      "on node %(node)s (lock was held %(time).2f sec)",
                      {'type': 'shared' if self.shared else 'exclusive',
                       'purpose': self._purpose, 'node': self.node.uuid,
                       'time': self._debug_timer.elapsed()})
        self.node = None
        self.driver = None
        self.ports = None
        self.portgroups = None
        self.volume_connectors = None
        self.volume_targets = None
        self.fsm = None

    def _write_exception(self, future):
        """Set node last_error if exception raised in thread."""
        node = self.node
        # do not rewrite existing error
        if node and node.last_error is None:
            method = self._spawn_args[0].__name__
            try:
                exc = future.exception()
            except futurist.CancelledError:
                LOG.exception("Execution of %(method)s for node %(node)s "
                              "was canceled.", {'method': method,
                                                'node': node.uuid})
            else:
                if exc is not None:
                    msg = _("Async execution of %(method)s failed with error: "
                            "%(error)s") % {'method': method,
                                            'error': str(exc)}
                    node.last_error = msg
                    try:
                        node.save()
                    except exception.NodeNotFound:
                        pass

    def _notify_provision_state_change(self):
        """Emit notification about change of the node provision state."""
        if self._event is None:
            return

        if self.node is None:
            # Rare case if resource released before notification
            task = copy.copy(self)
            task.fsm = states.machine.copy()
            task.node = self._saved_node
        else:
            task = self

        node = task.node

        state = node.provision_state
        prev_state = self._prev_provision_state
        new_unstable = state in states.UNSTABLE_STATES
        prev_unstable = prev_state in states.UNSTABLE_STATES
        level = fields.NotificationLevel.INFO

        if self._event in ('fail', 'error'):
            status = fields.NotificationStatus.ERROR
            level = fields.NotificationLevel.ERROR
        elif (prev_unstable, new_unstable) == (False, True):
            status = fields.NotificationStatus.START
        elif (prev_unstable, new_unstable) == (True, False):
            status = fields.NotificationStatus.END
        else:
            status = fields.NotificationStatus.SUCCESS

        notify.emit_provision_set_notification(
            task, level, status, self._prev_provision_state,
            self._prev_target_provision_state, self._event)

        # reset saved event, avoiding duplicate notification
        self._event = None

    def _thread_release_resources(self, fut):
        """Thread callback to release resources."""
        try:
            self._write_exception(fut)
        finally:
            self.release_resources()

    def process_event(self, event, callback=None, call_args=None,
                      call_kwargs=None, err_handler=None, target_state=None,
                      last_error=None):
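        # Validate and apply the FSM event: update the node's provision_state
        # and target_provision_state, optionally queue an async callback via
        # spawn_after(), then save the node. The provision-state notification
        # is emitted immediately if there is no callback, otherwise after the
        # worker is spawned in __exit__.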
        # save previous states and event
        self._prev_provision_state = self.node.provision_state
        self._prev_target_provision_state = self.node.target_provision_state
        self._event = event

        # Advance the state machine for this event; this validates the
        # transition (raising InvalidState if it is not allowed) without
        # touching the node itself.
        self.fsm.process_event(event, target_state=target_state)

        if err_handler and callback:
            self.set_spawn_error_hook(err_handler, self.node,
                                      self.node.provision_state,
                                      self.node.target_provision_state)

        self.node.provision_state = self.fsm.current_state

        if not callback and self.fsm.is_stable(self.node.provision_state):
            self.node.target_provision_state = states.NOSTATE
        else:
            self.node.target_provision_state = self.fsm.target_state

        # set up the async worker
        if callback:
            # update the error if we're going to start work in a callback
            self.node.last_error = last_error
            if call_args is None:
                call_args = ()
            if call_kwargs is None:
                call_kwargs = {}
            self.spawn_after(callback, *call_args, **call_kwargs)
        elif last_error is not None:
            self.node.last_error = last_error

        # publish the state transition by saving the Node
        self.node.save()

        log_message = ('Node %(node)s moved to provision state "%(state)s" '
                       'from state "%(previous)s"; target provision state is '
                       '"%(target)s"' %
                       {'node': self.node.uuid,
                        'state': self.node.provision_state,
                        'target': self.node.target_provision_state,
                        'previous': self._prev_provision_state})

        if (self.node.provision_state.endswith('failed')
                or self.node.provision_state == 'error'):
            LOG.error(log_message)
        else:
            LOG.info(log_message)

        if callback is None:
            self._notify_provision_state_change()
        else:
            self._saved_node = self.node

    def resume_cleaning(self):
        """A helper to resume cleaning with the right target state."""
        if self.node.target_provision_state == states.MANAGEABLE:
            target_state = states.MANAGEABLE
        else:
            target_state = None
        self.process_event('resume', target_state=target_state)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
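        # On a clean exit with a pending spawn_after() request, spawn the
        # worker and let its done-callback release the lock; in every other
        # case (an exception in the with-block, no spawn request, or a
        # failure while spawning) release resources immediately.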
        if exc_type is None and self._spawn_method is not None:
            fut = None
            try:
                fut = self._spawn_method(*self._spawn_args,
                                         **self._spawn_kwargs)

                fut.add_done_callback(self._thread_release_resources)
                self._notify_provision_state_change()
                return
            except Exception as e:
                with excutils.save_and_reraise_exception():
                    try:
                        # Execute the on_error hook if set
                        if self._on_error_method:
                            self._on_error_method(e, *self._on_error_args,
                                                  **self._on_error_kwargs)
                    except Exception:
                        LOG.warning("Task's on_error hook failed to "
                                    "call %(method)s on node %(node)s",
                                    {'method': self._on_error_method.__name__,
                                     'node': self.node.uuid})

                    if fut is not None:
                        # This means the add_done_callback() failed for some
                        # reason. Nuke the thread.
                        fut.cancel()
                    self.release_resources()
        self.release_resources()
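
The trickiest part of the flow is how process_event(), spawn_after() and __exit__ cooperate: the callback passed to process_event() is not run inside the with-block; it is spawned when the block exits cleanly, and the lock is only released from the spawned future's done-callback. A rough sketch of that pattern (the spawn method, work function and error handler names are illustrative, not taken from this file):

with task_manager.acquire(context, node_uuid, purpose='node deployment') as task:
    task.process_event(
        'deploy',
        callback=spawn_worker,            # illustrative: a worker-pool spawn method returning a future
        call_args=(do_node_deploy, task), # illustrative async work function
        err_handler=provisioning_error_handler)  # illustrative error hook
# Here the exclusive lock is still held by the spawned worker; it is released
# by _thread_release_resources() once the future completes, not by __exit__.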

Finally, here is what a direct instantiation looks like:

# Assume the request context is already available
context = ...  # OpenStack request context object

# Create a TaskManager instance. Note that the constructor takes a node ID
# or UUID (node_id), not a Node object; the node itself is fetched and
# locked inside __init__.
task = task_manager.TaskManager(
    context=context,
    node_id='1234-5678-90ab-cdef',
    shared=False,           # take an exclusive (non-shared) lock
    purpose='cleaning'      # human-readable description of the task purpose
)

# Conceptually, the resulting task holds the following (TaskManager does not
# define __repr__, so this is an illustration rather than real print output):
#
# TaskManager(
#     context=<RequestContext ...>,
#     node=<Node uuid=1234-5678-90ab-cdef driver=ipmi ...>,
#     driver=<driver composed by driver_factory.build_driver_for_task>,
#     shared=False,
#     purpose='cleaning',
#     fsm=<state machine initialized from the node's provision state>,
#     ports / portgroups / volume_* = loaded lazily on first access,
#     ...
# )
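
The example above takes an exclusive lock straight away. For read-only work the cheaper pattern is a shared lock that is upgraded only when a write turns out to be necessary; a minimal sketch (node UUID reused from above for illustration):

with task_manager.acquire(context, '1234-5678-90ab-cdef',
                          shared=True, purpose='inspecting node') as task:
    # A shared lock is enough for read-only access to task.node, task.ports, etc.
    if task.node.maintenance:
        # About to write to the node, so upgrade to an exclusive lock first.
        task.upgrade_lock(purpose='clearing maintenance')
        task.node.maintenance = False
        task.node.save()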