TaskManager Introduction
The TaskManager class is the core class in Ironic for managing work on a node. It provides a context manager that wraps node locking, driver loading, and the acquisition of related resources (e.g. Node and Ports) when beginning a unit of work. Below is a walkthrough of the TaskManager class and its main methods.
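In practice the class is rarely instantiated directly: conductors normally go through the module-level acquire() helper inside a with statement, which builds a TaskManager and guarantees the lock is dropped on exit. A minimal sketch of that pattern, assuming the standard acquire() helper from ironic.conductor.task_manager (the node UUID and the power call are only illustrative):
from ironic.common import states
from ironic.conductor import task_manager

# 'context' is an existing request context.
# Take an exclusive lock on the node; the lock and any lazily loaded
# resources are released automatically when the with block exits.
with task_manager.acquire(context, '1234-5678-90ab-cdef',
                          shared=False, purpose='power on') as task:
    task.driver.power.set_power_state(task, states.POWER_ON)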
class TaskManager(object):
"""Context manager for tasks.
This class wraps the locking, driver loading, and acquisition
of related resources (eg, Node and Ports) when beginning a unit of work.
"""
def __init__(self, context, node_id, shared=False,
purpose='unspecified action', retry=True, patient=False,
load_driver=True):
self._spawn_method = None
self._on_error_method = None
self.context = context
self._node = None
self._ports = None
self._portgroups = None
self._volume_connectors = None
self._volume_targets = None
self.node_id = node_id
self.shared = shared
self._retry = retry
self._patient = patient
self.fsm = states.machine.copy()
self._purpose = purpose
self._debug_timer = timeutils.StopWatch()
# states and event for notification
self._prev_provision_state = None
self._prev_target_provision_state = None
self._event = None
self._saved_node = None
try:
node = objects.Node.get(context, node_id)
LOG.debug("Attempting to get %(type)s lock on node %(node)s (for "
"%(purpose)s)",
{'type': 'shared' if shared else 'exclusive',
'node': node.uuid, 'purpose': purpose})
if not self.shared:
self._lock()
else:
self._debug_timer.restart()
self.node = node
if load_driver:
self.driver = driver_factory.build_driver_for_task(self)
else:
self.driver = None
except Exception:
with excutils.save_and_reraise_exception():
self.release_resources()
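    # NOTE: the properties below lazily load the node's related resources
    # (ports, portgroups, volume connectors and volume targets) from the
    # database on first access; if the lookup fails, release_resources() is
    # called before the exception is re-raised.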
@property
def node(self):
return self._node
@node.setter
def node(self, node):
self._node = node
if node is not None:
self.fsm.initialize(start_state=self.node.provision_state,
target_state=self.node.target_provision_state)
@property
def ports(self):
try:
if self._ports is None:
self._ports = objects.Port.list_by_node_id(self.context,
self.node.id)
except Exception:
with excutils.save_and_reraise_exception():
self.release_resources()
return self._ports
@ports.setter
def ports(self, ports):
self._ports = ports
@property
def portgroups(self):
try:
if self._portgroups is None:
self._portgroups = objects.Portgroup.list_by_node_id(
self.context, self.node.id)
except Exception:
with excutils.save_and_reraise_exception():
self.release_resources()
return self._portgroups
@portgroups.setter
def portgroups(self, portgroups):
self._portgroups = portgroups
@property
def volume_connectors(self):
try:
if self._volume_connectors is None:
self._volume_connectors = \
objects.VolumeConnector.list_by_node_id(
self.context, self.node.id)
except Exception:
with excutils.save_and_reraise_exception():
self.release_resources()
return self._volume_connectors
@volume_connectors.setter
def volume_connectors(self, volume_connectors):
self._volume_connectors = volume_connectors
@property
def volume_targets(self):
try:
if self._volume_targets is None:
self._volume_targets = objects.VolumeTarget.list_by_node_id(
self.context, self.node.id)
except Exception:
with excutils.save_and_reraise_exception():
self.release_resources()
return self._volume_targets
@volume_targets.setter
def volume_targets(self, volume_targets):
self._volume_targets = volume_targets
def load_driver(self):
if self.driver is None:
self.driver = driver_factory.build_driver_for_task(self)
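    # NOTE: _lock() reserves the node in the database and retries on
    # NodeLocked via tenacity: forever when patient=True, up to
    # CONF.conductor.node_locked_retry_attempts when retry=True, and only
    # once otherwise, waiting node_locked_retry_interval seconds between
    # attempts. A warning is logged once the elapsed time exceeds
    # interval * attempts.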
def _lock(self):
self._debug_timer.restart()
if self._patient:
stop_after = tenacity.stop_never
elif self._retry:
stop_after = tenacity.stop_after_attempt(
CONF.conductor.node_locked_retry_attempts)
else:
stop_after = tenacity.stop_after_attempt(1)
max_lock_time = \
CONF.conductor.node_locked_retry_interval * \
CONF.conductor.node_locked_retry_attempts
@tenacity.retry(
retry=tenacity.retry_if_exception_type(exception.NodeLocked),
stop=stop_after,
wait=tenacity.wait_fixed(
CONF.conductor.node_locked_retry_interval),
reraise=True)
def reserve_node():
if self._debug_timer.elapsed() > max_lock_time:
LOG.warning('We have exceeded the normal maximum time window '
'to complete a node lock attempting to reserve '
'node %(node)s for purpose %(purpose)s. At '
'%(time).2f seconds.',
{'node': self.node_id, 'purpose': self._purpose,
'time': self._debug_timer.elapsed()})
self.node = objects.Node.reserve(self.context, CONF.host,
self.node_id)
LOG.debug("Node %(node)s successfully reserved for %(purpose)s "
"(took %(time).2f seconds)",
{'node': self.node.uuid, 'purpose': self._purpose,
'time': self._debug_timer.elapsed()})
self._debug_timer.restart()
reserve_node()
def upgrade_lock(self, purpose=None, retry=None):
if purpose is not None:
self._purpose = purpose
if retry is not None:
self._retry = retry
if self.shared:
LOG.debug('Upgrading shared lock on node %(uuid)s for %(purpose)s '
'to an exclusive one (shared lock was held %(time).2f '
'seconds)',
{'uuid': self.node.uuid, 'purpose': self._purpose,
'time': self._debug_timer.elapsed()})
self._lock()
self.shared = False
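    # NOTE: spawn_after() and set_spawn_error_hook() only record a callable
    # and its arguments. The actual spawn happens in __exit__ once the task
    # body has completed without raising; the error hook is invoked only if
    # that spawn itself fails.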
def spawn_after(self, _spawn_method, *args, **kwargs):
self._spawn_method = _spawn_method
self._spawn_args = args
self._spawn_kwargs = kwargs
def set_spawn_error_hook(self, _on_error_method, *args, **kwargs):
self._on_error_method = _on_error_method
self._on_error_args = args
self._on_error_kwargs = kwargs
def downgrade_lock(self):
"""Downgrade the lock to a shared one."""
if self.node is None:
raise RuntimeError("Cannot downgrade an already released lock")
if not self.shared:
objects.Node.release(self.context, CONF.host, self.node.id)
self.shared = True
self.node.refresh()
LOG.debug("Successfully downgraded lock for %(purpose)s "
"on node %(node)s",
{'purpose': self._purpose, 'node': self.node.uuid})
def release_resources(self):
if not self.shared:
try:
if self.node:
objects.Node.release(self.context, CONF.host, self.node.id)
except exception.NodeNotFound:
# squelch the exception if the node was deleted
# within the task's context.
pass
if self.node:
LOG.debug("Successfully released %(type)s lock for %(purpose)s "
"on node %(node)s (lock was held %(time).2f sec)",
{'type': 'shared' if self.shared else 'exclusive',
'purpose': self._purpose, 'node': self.node.uuid,
'time': self._debug_timer.elapsed()})
self.node = None
self.driver = None
self.ports = None
self.portgroups = None
self.volume_connectors = None
self.volume_targets = None
self.fsm = None
def _write_exception(self, future):
"""Set node last_error if exception raised in thread."""
node = self.node
# do not rewrite existing error
if node and node.last_error is None:
method = self._spawn_args[0].__name__
try:
exc = future.exception()
except futurist.CancelledError:
LOG.exception("Execution of %(method)s for node %(node)s "
"was canceled.", {'method': method,
'node': node.uuid})
else:
if exc is not None:
msg = _("Async execution of %(method)s failed with error: "
"%(error)s") % {'method': method,
'error': str(exc)}
node.last_error = msg
try:
node.save()
except exception.NodeNotFound:
pass
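    # NOTE: _notify_provision_state_change() emits a provision-state
    # notification for the event saved by process_event(). If the task's
    # resources were already released, it works on a shallow copy of the
    # task that is re-pointed at the saved node so the notification still
    # carries node data.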
def _notify_provision_state_change(self):
"""Emit notification about change of the node provision state."""
if self._event is None:
return
if self.node is None:
# Rare case if resource released before notification
task = copy.copy(self)
task.fsm = states.machine.copy()
task.node = self._saved_node
else:
task = self
node = task.node
state = node.provision_state
prev_state = self._prev_provision_state
new_unstable = state in states.UNSTABLE_STATES
prev_unstable = prev_state in states.UNSTABLE_STATES
level = fields.NotificationLevel.INFO
if self._event in ('fail', 'error'):
status = fields.NotificationStatus.ERROR
level = fields.NotificationLevel.ERROR
elif (prev_unstable, new_unstable) == (False, True):
status = fields.NotificationStatus.START
elif (prev_unstable, new_unstable) == (True, False):
status = fields.NotificationStatus.END
else:
status = fields.NotificationStatus.SUCCESS
notify.emit_provision_set_notification(
task, level, status, self._prev_provision_state,
self._prev_target_provision_state, self._event)
# reset saved event, avoiding duplicate notification
self._event = None
def _thread_release_resources(self, fut):
"""Thread callback to release resources."""
try:
self._write_exception(fut)
finally:
self.release_resources()
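    # NOTE: process_event() drives the provision-state machine: it records
    # the previous states for notifications, advances the FSM, updates and
    # saves the node, and either defers the real work through spawn_after()
    # (when a callback is given) or emits the notification immediately.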
def process_event(self, event, callback=None, call_args=None,
call_kwargs=None, err_handler=None, target_state=None,
last_error=None):
# save previous states and event
self._prev_provision_state = self.node.provision_state
self._prev_target_provision_state = self.node.target_provision_state
        self._event = event

        # Advance the state machine for the given event; this may raise
        # InvalidState if the event is not allowed in the current state.
        self.fsm.process_event(event, target_state=target_state)
if err_handler and callback:
self.set_spawn_error_hook(err_handler, self.node,
self.node.provision_state,
self.node.target_provision_state)
self.node.provision_state = self.fsm.current_state
if not callback and self.fsm.is_stable(self.node.provision_state):
self.node.target_provision_state = states.NOSTATE
else:
self.node.target_provision_state = self.fsm.target_state
# set up the async worker
if callback:
# update the error if we're going to start work in a callback
self.node.last_error = last_error
if call_args is None:
call_args = ()
if call_kwargs is None:
call_kwargs = {}
self.spawn_after(callback, *call_args, **call_kwargs)
elif last_error is not None:
self.node.last_error = last_error
# publish the state transition by saving the Node
self.node.save()
log_message = ('Node %(node)s moved to provision state "%(state)s" '
'from state "%(previous)s"; target provision state is '
'"%(target)s"' %
{'node': self.node.uuid,
'state': self.node.provision_state,
'target': self.node.target_provision_state,
'previous': self._prev_provision_state})
if (self.node.provision_state.endswith('failed')
or self.node.provision_state == 'error'):
LOG.error(log_message)
else:
LOG.info(log_message)
if callback is None:
self._notify_provision_state_change()
else:
self._saved_node = self.node
def resume_cleaning(self):
"""A helper to resume cleaning with the right target state."""
if self.node.target_provision_state == states.MANAGEABLE:
target_state = states.MANAGEABLE
else:
target_state = None
self.process_event('resume', target_state=target_state)
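    # NOTE: when the with block exits cleanly and spawn_after() was used,
    # __exit__ spawns the recorded method, attaches
    # _thread_release_resources() as a done-callback so the lock is released
    # when the worker finishes, and emits the pending notification. If the
    # spawn fails, it runs the error hook, cancels the future and releases
    # the resources itself.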
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
if exc_type is None and self._spawn_method is not None:
fut = None
try:
fut = self._spawn_method(*self._spawn_args,
**self._spawn_kwargs)
fut.add_done_callback(self._thread_release_resources)
self._notify_provision_state_change()
return
except Exception as e:
with excutils.save_and_reraise_exception():
try:
# Execute the on_error hook if set
if self._on_error_method:
self._on_error_method(e, *self._on_error_args,
**self._on_error_kwargs)
except Exception:
LOG.warning("Task's on_error hook failed to "
"call %(method)s on node %(node)s",
{'method': self._on_error_method.__name__,
'node': self.node.uuid})
if fut is not None:
# This means the add_done_callback() failed for some
# reason. Nuke the thread.
fut.cancel()
self.release_resources()
self.release_resources()
Finally, direct instantiation looks like this:
# Assume the request context is ready and the node already exists in the DB
context = ...  # OpenStack request context object
node_uuid = '1234-5678-90ab-cdef'
# Create a TaskManager instance; the constructor looks the node up itself
task = task_manager.TaskManager(
    context=context,
    node_id=node_uuid,     # node UUID or database id
    shared=False,          # exclusive lock (True would take a shared lock)
    purpose='cleaning'     # human-readable description of the task
)
# After construction the task carries roughly the following state
# task.context           -> <RequestContext ...>, the request context
# task.node              -> <Node uuid=1234-5678-90ab-cdef ...>, the reserved node
# task.driver            -> the hardware interfaces built by driver_factory
# task.shared            -> False, i.e. an exclusive lock is held
# task.fsm               -> a copy of the provision-state machine, initialized
#                           from the node's current and target provision states
# task.ports, task.portgroups, task.volume_connectors, task.volume_targets
#                        -> loaded lazily from the database on first access
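For completeness, here is a hedged sketch of how a conductor typically combines acquire(), process_event() and a deferred worker. The names spawn_worker and _do_deploy are placeholders (not necessarily the exact Ironic helpers); spawn_worker is assumed to submit the callable to a thread pool and return a future:
def _do_deploy(task):
    # Illustrative long-running work executed by the background worker.
    task.driver.deploy.deploy(task)

with task_manager.acquire(context, node_uuid, shared=False,
                          purpose='node deployment') as task:
    # Advance the FSM with the 'deploy' event and defer _do_deploy;
    # __exit__ spawns the worker and the node lock is released only when
    # that worker finishes (via _thread_release_resources).
    task.process_event('deploy',
                       callback=spawn_worker,        # placeholder spawner
                       call_args=(_do_deploy, task))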