# 安装bootloader
function install_bootloader {
#此处省略很多
...
}
#向Ironic Condutor发送消息,开启socket端口10000等待通知PXE结束
function do_vendor_passthru_and_wait {
local data=$1
local vendor_passthru_name=$2
eval curl -i -X POST \
"$TOKEN_HEADER" \
"-H 'Accept: application/json'" \
"-H 'Content-Type: application/json'" \
-d "$data" \
"$IRONIC_API_URL/v1/nodes/$DEPLOYMENT_ID/vendor_passthru/$vendor_passthru_name"
echo "Waiting for notice of complete"
nc -l -p 10000
}
readonly IRONIC_API_URL=$(get_kernel_parameter ironic_api_url)
readonly IRONIC_BOOT_OPTION=$(get_kernel_parameter boot_option)
readonly IRONIC_BOOT_MODE=$(get_kernel_parameter boot_mode)
readonly ROOT_DEVICE=$(get_kernel_parameter root_device)
if [ -z "$ISCSI_TARGET_IQN" ]; then
err_msg "iscsi_target_iqn is not defined"
troubleshoot
fi
#获取当前linux的本地硬盘
target_disk=
if [[ $ROOT_DEVICE ]]; then
target_disk="$(get_root_device)"
else
t=0
while ! target_disk=$(find_disk "$DISK"); do
if [ $t -eq 60 ]; then
break
fi
t=$(($t + 1))
sleep 1
done
fi
if [ -z "$target_disk" ]; then
err_msg "Could not find disk to use."
troubleshoot
fi
#将找到的本地磁盘作为iSCSI磁盘启动,暴露给Ironic Condutor
echo "start iSCSI target on $target_disk"
start_iscsi_target "$ISCSI_TARGET_IQN" "$target_disk" ALL
if [ $? -ne 0 ]; then
err_msg "Failed to start iscsi target."
troubleshoot
fi
#获取到相关的token文件,从tftp服务器上获取,token文件在ironic在prepare阶段就生成好的。
if [ "$BOOT_METHOD" = "$VMEDIA_BOOT_TAG" ]; then
TOKEN_FILE="$VMEDIA_DIR/token"
if [ -f "$TOKEN_FILE" ]; then
TOKEN_HEADER="-H 'X-Auth-Token: $(cat $TOKEN_FILE)'"
else TOKEN_HEADER=""
fi
else
TOKEN_FILE=token-$DEPLOYMENT_ID
# Allow multiple versions of the tftp client
if tftp -r $TOKEN_FILE -g $BOOT_SERVER || tftp $BOOT_SERVER -c get $TOKEN_FILE; then
TOKEN_HEADER="-H 'X-Auth-Token: $(cat $TOKEN_FILE)'"
else
TOKEN_HEADER=""
fi
fi
#向Ironic请求部署镜像,POST node的/vendor_passthru/pass_deploy_info请求
echo "Requesting Ironic API to deploy image"
deploy_data="'{\"address\":\"$BOOT_IP_ADDRESS\",\"key\":\"$DEPLOYMENT_KEY\",\"iqn\":\"$ISCSI_TARGET_IQN\",\"error\":\"$FIRST_ERR_MSG\"}'"
do_vendor_passthru_and_wait "$deploy_data" "pass_deploy_info"
#部署镜像下载结束,停止iSCSI设备
echo "Stopping iSCSI target on $target_disk"
stop_iscsi_target
#如果是本地启动,安装bootloarder
# If localboot is set, install a bootloader
if [ "$IRONIC_BOOT_OPTION" = "local" ]; then
echo "Installing bootloader"
error_msg=$(install_bootloader)
if [ $? -eq 0 ]; then
status=SUCCEEDED
else
status=FAILED
fi
echo "Requesting Ironic API to complete the deploy"
bootloader_install_data="'{\"address\":\"$BOOT_IP_ADDRESS\",\"status\":\"$status\",\"key\":\"$DEPLOYMENT_KEY\",\"error\":\"$error_msg\"}'"
do_vendor_passthru_and_wait "$bootloader_install_data" "pass_bootloader_install_info"
fi
# 安装bootloader
function install_bootloader {
#此处省略很多
...
}
#向Ironic Condutor发送消息,开启socket端口10000等待通知PXE结束
function do_vendor_passthru_and_wait {
local data=$1
local vendor_passthru_name=$2
eval curl -i -X POST \
"$TOKEN_HEADER" \
"-H 'Accept: application/json'" \
"-H 'Content-Type: application/json'" \
-d "$data" \
"$IRONIC_API_URL/v1/nodes/$DEPLOYMENT_ID/vendor_passthru/$vendor_passthru_name"
echo "Waiting for notice of complete"
nc -l -p 10000
}
readonly IRONIC_API_URL=$(get_kernel_parameter ironic_api_url)
readonly IRONIC_BOOT_OPTION=$(get_kernel_parameter boot_option)
readonly IRONIC_BOOT_MODE=$(get_kernel_parameter boot_mode)
readonly ROOT_DEVICE=$(get_kernel_parameter root_device)
if [ -z "$ISCSI_TARGET_IQN" ]; then
err_msg "iscsi_target_iqn is not defined"
troubleshoot
fi
#获取当前linux的本地硬盘
target_disk=
if [[ $ROOT_DEVICE ]]; then
target_disk="$(get_root_device)"
else
t=0
while ! target_disk=$(find_disk "$DISK"); do
if [ $t -eq 60 ]; then
break
fi
t=$(($t + 1))
sleep 1
done
fi
if [ -z "$target_disk" ]; then
err_msg "Could not find disk to use."
troubleshoot
fi
#将找到的本地磁盘作为iSCSI磁盘启动,暴露给Ironic Condutor
echo "start iSCSI target on $target_disk"
start_iscsi_target "$ISCSI_TARGET_IQN" "$target_disk" ALL
if [ $? -ne 0 ]; then
err_msg "Failed to start iscsi target."
troubleshoot
fi
#获取到相关的token文件,从tftp服务器上获取,token文件在ironic在prepare阶段就生成好的。
if [ "$BOOT_METHOD" = "$VMEDIA_BOOT_TAG" ]; then
TOKEN_FILE="$VMEDIA_DIR/token"
if [ -f "$TOKEN_FILE" ]; then
TOKEN_HEADER="-H 'X-Auth-Token: $(cat $TOKEN_FILE)'"
else TOKEN_HEADER=""
fi
else
TOKEN_FILE=token-$DEPLOYMENT_ID
# Allow multiple versions of the tftp client
if tftp -r $TOKEN_FILE -g $BOOT_SERVER || tftp $BOOT_SERVER -c get $TOKEN_FILE; then
TOKEN_HEADER="-H 'X-Auth-Token: $(cat $TOKEN_FILE)'"
else
TOKEN_HEADER=""
fi
fi
#向Ironic请求部署镜像,POST node的/vendor_passthru/pass_deploy_info请求
echo "Requesting Ironic API to deploy image"
deploy_data="'{\"address\":\"$BOOT_IP_ADDRESS\",\"key\":\"$DEPLOYMENT_KEY\",\"iqn\":\"$ISCSI_TARGET_IQN\",\"error\":\"$FIRST_ERR_MSG\"}'"
do_vendor_passthru_and_wait "$deploy_data" "pass_deploy_info"
#部署镜像下载结束,停止iSCSI设备
echo "Stopping iSCSI target on $target_disk"
stop_iscsi_target
#如果是本地启动,安装bootloarder
# If localboot is set, install a bootloader
if [ "$IRONIC_BOOT_OPTION" = "local" ]; then
echo "Installing bootloader"
error_msg=$(install_bootloader)
if [ $? -eq 0 ]; then
status=SUCCEEDED
else
status=FAILED
fi
echo "Requesting Ironic API to complete the deploy"
bootloader_install_data="'{\"address\":\"$BOOT_IP_ADDRESS\",\"status\":\"$status\",\"key\":\"$DEPLOYMENT_KEY\",\"error\":\"$error_msg\"}'"
do_vendor_passthru_and_wait "$bootloader_install_data" "pass_bootloader_install_info"
fi
LOG.error(_LE('Deploy failed for instance %(instance)s. '
'Error: %(error)s'),
{'instance': node.instance_uuid, 'error': e})
msg = _('Failed to continue iSCSI deployment.')
deploy_utils.set_failed_state(task, msg)
else:
#结束部署,通知ramdisk重启,将物理机设置为ative
finish_deploy(task, kwargs.get('address'))
@base.passthru(['POST'])
@task_manager.require_exclusive_lock
def pass_deploy_info(self, task, **kwargs):
"""Continues the deployment of baremetal node over iSCSI.
This method continues the deployment of the baremetal node over iSCSI
from where the deployment ramdisk has left off.
:param task: a TaskManager instance containing the node to act on.
:param kwargs: kwargs for performing iscsi deployment.
:raises: InvalidState
"""
node = task.node
LOG.warning(_LW("The node %s is using the bash deploy ramdisk for "
"its deployment. This deploy ramdisk has been "
"deprecated. Please use the ironic-python-agent "
"(IPA) ramdisk instead."), node.uuid)
task.process_event('resume') #设置任务状态
LOG.debug('Continuing the deployment on node %s', node.uuid)
is_whole_disk_image = node.driver_internal_info['is_whole_disk_image']
#继续部署的函数,连接到iSCSI设备,将用户镜像写到iSCSI设备上,退出删除iSCSI设备,
#然后在Condutor上删除镜像文件
uuid_dict_returned = continue_deploy(task, **kwargs)
root_uuid_or_disk_id = uuid_dict_returned.get(
'root uuid', uuid_dict_returned.get('disk identifier'))
# save the node's root disk UUID so that another conductor could
# rebuild the PXE config file. Due to a shortcoming in Nova objects,
# we have to assign to node.driver_internal_info so the node knows it
# has changed.
driver_internal_info = node.driver_internal_info
driver_internal_info['root_uuid_or_disk_id'] = root_uuid_or_disk_id
node.driver_internal_info = driver_internal_info
node.save()
try:
#再一次设置PXE引导,为准备进入用户系统做准备
task.driver.boot.prepare_instance(task)
if deploy_utils.get_boot_option(node) == "local":
if not is_whole_disk_image:
LOG.debug('Installing the bootloader on node %s',
node.uuid)
deploy_utils.notify_ramdisk_to_proceed(kwargs['address'])
task.process_event('wait')
return
except Exception as e:
LOG.error(_LE('Deploy failed for instance %(instance)s. '
'Error: %(error)s'),
{'instance': node.instance_uuid, 'error': e})
msg = _('Failed to continue iSCSI deployment.')
deploy_utils.set_failed_state(task, msg)
else:
#结束部署,通知ramdisk重启,将物理机设置为ative
finish_deploy(task, kwargs.get('address'))
@base.passthru(['POST'])
@task_manager.require_exclusive_lock
def pass_deploy_info(self, task, **kwargs):
"""Continues the deployment of baremetal node over iSCSI.
This method continues the deployment of the baremetal node over iSCSI
from where the deployment ramdisk has left off.
:param task: a TaskManager instance containing the node to act on.
:param kwargs: kwargs for performing iscsi deployment.
:raises: InvalidState
"""
node = task.node
LOG.warning(_LW("The node %s is using the bash deploy ramdisk for "
"its deployment. This deploy ramdisk has been "
"deprecated. Please use the ironic-python-agent "
"(IPA) ramdisk instead."), node.uuid)
task.process_event('resume') #设置任务状态
LOG.debug('Continuing the deployment on node %s', node.uuid)
is_whole_disk_image = node.driver_internal_info['is_whole_disk_image']
#继续部署的函数,连接到iSCSI设备,将用户镜像写到iSCSI设备上,退出删除iSCSI设备,
#然后在Condutor上删除镜像文件
uuid_dict_returned = continue_deploy(task, **kwargs)
root_uuid_or_disk_id = uuid_dict_returned.get(
'root uuid', uuid_dict_returned.get('disk identifier'))
# save the node's root disk UUID so that another conductor could
# rebuild the PXE config file. Due to a shortcoming in Nova objects,
# we have to assign to node.driver_internal_info so the node knows it
# has changed.
driver_internal_info = node.driver_internal_info
driver_internal_info['root_uuid_or_disk_id'] = root_uuid_or_disk_id
node.driver_internal_info = driver_internal_info
node.save()
try:
#再一次设置PXE引导,为准备进入用户系统做准备
task.driver.boot.prepare_instance(task)
if deploy_utils.get_boot_option(node) == "local":
if not is_whole_disk_image:
LOG.debug('Installing the bootloader on node %s',
node.uuid)
deploy_utils.notify_ramdisk_to_proceed(kwargs['address'])
task.process_event('wait')
return
except Exception as e:
LOG.error(_LE('Deploy failed for instance %(instance)s. '
'Error: %(error)s'),
{'instance': node.instance_uuid, 'error': e})
msg = _('Failed to continue iSCSI deployment.')
deploy_utils.set_failed_state(task, msg)
else:
#结束部署,通知ramdisk重启,将物理机设置为ative
finish_deploy(task, kwargs.get('address'))
# Don't start heartbeating until the server is listening
self.heartbeater.start()
try:
wsgi.serve_forever()
except BaseException:
self.log.exception('shutting down')
#部署完成后停止心跳包
if not self.standalone:
self.heartbeater.stop()
def run(self):
"""Run the Ironic Python Agent."""
# Get the UUID so we can heartbeat to Ironic. Raises LookupNodeError
# if there is an issue (uncaught, restart agent)
self.started_at = _time()
#加载hardware manager
# Cached hw managers at runtime, not load time. See bug 1490008.
hardware.load_managers()
if not self.standalone:
# Inspection should be started before call to lookup, otherwise
# lookup will fail due to unknown MAC.
uuid = inspector.inspect()
#利用Ironic API给Condutor发送lookup()请求,用户获取UUID,相当于自发现
content = self.api_client.lookup_node(
hardware_info=hardware.dispatch_to_managers(
'list_hardware_info'),
timeout=self.lookup_timeout,
starting_interval=self.lookup_interval,
node_uuid=uuid)
self.node = content['node']
self.heartbeat_timeout = content['heartbeat_timeout']
wsgi = simple_server.make_server(
self.listen_address[0],
self.listen_address[1],
self.api,
server_class=simple_server.WSGIServer)
#发送心跳包
if not self.standalone:
# Don't start heartbeating until the server is listening
self.heartbeater.start()
try:
wsgi.serve_forever()
except BaseException:
self.log.exception('shutting down')
#部署完成后停止心跳包
if not self.standalone:
self.heartbeater.stop()
def run(self):
"""Run the Ironic Python Agent."""
# Get the UUID so we can heartbeat to Ironic. Raises LookupNodeError
# if there is an issue (uncaught, restart agent)
self.started_at = _time()
#加载hardware manager
# Cached hw managers at runtime, not load time. See bug 1490008.
hardware.load_managers()
if not self.standalone:
# Inspection should be started before call to lookup, otherwise
# lookup will fail due to unknown MAC.
uuid = inspector.inspect()
#利用Ironic API给Condutor发送lookup()请求,用户获取UUID,相当于自发现
content = self.api_client.lookup_node(
hardware_info=hardware.dispatch_to_managers(
'list_hardware_info'),
timeout=self.lookup_timeout,
starting_interval=self.lookup_interval,
node_uuid=uuid)
self.node = content['node']
self.heartbeat_timeout = content['heartbeat_timeout']
wsgi = simple_server.make_server(
self.listen_address[0],
self.listen_address[1],
self.api,
server_class=simple_server.WSGIServer)
#发送心跳包
if not self.standalone:
# Don't start heartbeating until the server is listening
self.heartbeater.start()
try:
wsgi.serve_forever()
except BaseException:
self.log.exception('shutting down')
#部署完成后停止心跳包
if not self.standalone:
self.heartbeater.stop()
@base.passthru(['POST'])
def heartbeat(self, task, **kwargs):
"""Method for agent to periodically check in.
The agent should be sending its agent_url (so Ironic can talk back)
as a kwarg. kwargs should have the following format::
{
'agent_url': 'http://AGENT_HOST:AGENT_PORT'
}
AGENT_PORT defaults to 9999.
"""
node = task.node
driver_internal_info = node.driver_internal_info
LOG.debug(
'Heartbeat from %(node)s, last heartbeat at %(heartbeat)s.',
{'node': node.uuid,
'heartbeat': driver_internal_info.get('agent_last_heartbeat')})
driver_internal_info['agent_last_heartbeat'] = int(_time())
try:
driver_internal_info['agent_url'] = kwargs['agent_url']
except KeyError:
raise exception.MissingParameterValue(_('For heartbeat operation, '
'"agent_url" must be '
'specified.'))
node.driver_internal_info = driver_internal_info
node.save()
# Async call backs don't set error state on their own
# TODO(jimrollenhagen) improve error messages here
msg = _('Failed checking if deploy is done.')
try:
if node.maintenance:
# this shouldn't happen often, but skip the rest if it does.
LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
'not taking any action.', {'node': node.uuid})
return
elif (node.provision_state == states.DEPLOYWAIT and
not self.deploy_has_started(task)):
msg = _('Node failed to get image for deploy.')
self.continue_deploy(task, **kwargs) #调用continue_deploy函数,下载镜像
elif (node.provision_state == states.DEPLOYWAIT and
self.deploy_is_done(task)): #查看IPA执行下载镜像是否结束
msg = _('Node failed to move to active state.')
self.reboot_to_instance(task, **kwargs) #如果镜像已经下载完成,即部署完成,设置从disk启动,重启进入用户系统,
elif (node.provision_state == states.DEPLOYWAIT and
self.deploy_has_started(task)):
node.touch_provisioning() #更新数据库,将节点的设置为alive
# TODO(lucasagomes): CLEANING here for backwards compat
# with previous code, otherwise nodes in CLEANING when this
# is deployed would fail. Should be removed once the Mitaka
# release starts.
elif node.provision_state in (states.CLEANWAIT, states.CLEANING):
node.touch_provisioning()
if not node.clean_step:
LOG.debug('Node %s just booted to start cleaning.',
node.uuid)
msg = _('Node failed to start the next cleaning step.')
manager.set_node_cleaning_steps(task)
self._notify_conductor_resume_clean(task)
else:
msg = _('Node failed to check cleaning progress.')
self.continue_cleaning(task, **kwargs)
except Exception as e:
err_info = {'node': node.uuid, 'msg': msg, 'e': e}
last_error = _('Asynchronous exception for node %(node)s: '
'%(msg)s exception: %(e)s') % err_info
LOG.exception(last_error)
if node.provision_state in (states.CLEANING, states.CLEANWAIT):
manager.cleaning_error_handler(task, last_error)
elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
deploy_utils.set_failed_state(task, last_error)
@base.passthru(['POST'])
def heartbeat(self, task, **kwargs):
"""Method for agent to periodically check in.
The agent should be sending its agent_url (so Ironic can talk back)
as a kwarg. kwargs should have the following format::
{
'agent_url': 'http://AGENT_HOST:AGENT_PORT'
}
AGENT_PORT defaults to 9999.
"""
node = task.node
driver_internal_info = node.driver_internal_info
LOG.debug(
'Heartbeat from %(node)s, last heartbeat at %(heartbeat)s.',
{'node': node.uuid,
'heartbeat': driver_internal_info.get('agent_last_heartbeat')})
driver_internal_info['agent_last_heartbeat'] = int(_time())
try:
driver_internal_info['agent_url'] = kwargs['agent_url']
except KeyError:
raise exception.MissingParameterValue(_('For heartbeat operation, '
'"agent_url" must be '
'specified.'))
node.driver_internal_info = driver_internal_info
node.save()
# Async call backs don't set error state on their own
# TODO(jimrollenhagen) improve error messages here
msg = _('Failed checking if deploy is done.')
try:
if node.maintenance:
# this shouldn't happen often, but skip the rest if it does.
LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
'not taking any action.', {'node': node.uuid})
return
elif (node.provision_state == states.DEPLOYWAIT and
not self.deploy_has_started(task)):
msg = _('Node failed to get image for deploy.')
self.continue_deploy(task, **kwargs) #调用continue_deploy函数,下载镜像
elif (node.provision_state == states.DEPLOYWAIT and
self.deploy_is_done(task)): #查看IPA执行下载镜像是否结束
msg = _('Node failed to move to active state.')
self.reboot_to_instance(task, **kwargs) #如果镜像已经下载完成,即部署完成,设置从disk启动,重启进入用户系统,
elif (node.provision_state == states.DEPLOYWAIT and
self.deploy_has_started(task)):
node.touch_provisioning() #更新数据库,将节点的设置为alive
# TODO(lucasagomes): CLEANING here for backwards compat
# with previous code, otherwise nodes in CLEANING when this
# is deployed would fail. Should be removed once the Mitaka
# release starts.
elif node.provision_state in (states.CLEANWAIT, states.CLEANING):
node.touch_provisioning()
if not node.clean_step:
LOG.debug('Node %s just booted to start cleaning.',
node.uuid)
msg = _('Node failed to start the next cleaning step.')
manager.set_node_cleaning_steps(task)
self._notify_conductor_resume_clean(task)
else:
msg = _('Node failed to check cleaning progress.')
self.continue_cleaning(task, **kwargs)
except Exception as e:
err_info = {'node': node.uuid, 'msg': msg, 'e': e}
last_error = _('Asynchronous exception for node %(node)s: '
'%(msg)s exception: %(e)s') % err_info
LOG.exception(last_error)
if node.provision_state in (states.CLEANING, states.CLEANWAIT):
manager.cleaning_error_handler(task, last_error)
elif node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
deploy_utils.set_failed_state(task, last_error)
# Tell the client to download and write the image with the given args
self._client.prepare_image(node, image_info)
task.process_event('wait')
@task_manager.require_exclusive_lock
def continue_deploy(self, task, **kwargs):
task.process_event('resume')
node = task.node
image_source = node.instance_info.get('image_source')
LOG.debug('Continuing deploy for node %(node)s with image %(img)s',
{'node': node.uuid, 'img': image_source})
image_info = {
'id': image_source.split('/')[-1],
'urls': [node.instance_info['image_url']],
'checksum': node.instance_info['image_checksum'],
# NOTE(comstud): Older versions of ironic do not set
# 'disk_format' nor 'container_format', so we use .get()
# to maintain backwards compatibility in case code was
# upgraded in the middle of a build request.
'disk_format': node.instance_info.get('image_disk_format'),
'container_format': node.instance_info.get(
'image_container_format')
}
#通知IPA下载swift上的镜像,并写入本地磁盘
# Tell the client to download and write the image with the given args
self._client.prepare_image(node, image_info)
task.process_event('wait')
@task_manager.require_exclusive_lock
def continue_deploy(self, task, **kwargs):
task.process_event('resume')
node = task.node
image_source = node.instance_info.get('image_source')
LOG.debug('Continuing deploy for node %(node)s with image %(img)s',
{'node': node.uuid, 'img': image_source})
image_info = {
'id': image_source.split('/')[-1],
'urls': [node.instance_info['image_url']],
'checksum': node.instance_info['image_checksum'],
# NOTE(comstud): Older versions of ironic do not set
# 'disk_format' nor 'container_format', so we use .get()
# to maintain backwards compatibility in case code was
# upgraded in the middle of a build request.
'disk_format': node.instance_info.get('image_disk_format'),
'container_format': node.instance_info.get(
'image_container_format')
}
#通知IPA下载swift上的镜像,并写入本地磁盘
# Tell the client to download and write the image with the given args
self._client.prepare_image(node, image_info)
task.process_event('wait')