# ---LICENSE-BEGIN - DO NOT CHANGE OR MOVE THIS HEADER
# This file is part of the Neurorobotics Platform software
# Copyright (C) 2014,2015,2016,2017 Human Brain Project
# https://www.humanbrainproject.eu
#
# The Human Brain Project is a European Commission funded project
# in the frame of the Horizon2020 FET Flagship plan.
# http://ec.europa.eu/programmes/horizon2020/en/h2020-section/fet-flagships
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# ---LICENSE-END
#!/usr/bin/env python
"""
This module contains the classes needed to have all gazebo services running on the Lugano viz
cluster.
"""
from builtins import range
import pexpect
import time
import logging
import os
import sys
import netifaces
from hbp_nrp_commons.cluster.LuganoVizCluster import LuganoVizCluster, notificator, logger
from hbp_nrp_cleserver.server.GazeboInterface import IGazeboServerInstance
from hbp_nrp_watchdog.WatchdogServer import WatchdogClient
class LuganoVizClusterGazebo(LuganoVizCluster, IGazeboServerInstance):
    """
    Represents an instance of gzserver running on the Lugano viz cluster.

    There is a wide usage of pexpect in this class because of the remote connections via
    ssh needed to access the Lugano machines. Every time a remote command is launched,
    expect() is used to match strings with the shell output in order to understand its status.

    Attributes such as ``_node``, ``_allocation_process``, ``_allocation_time``, ``_tmp_dir``,
    ``TIMEOUT``, ``SMALL_TIMEOUT`` and ``NODE_DOMAIN`` are not defined in this class; they are
    presumably provided by :class:`LuganoVizCluster` (verify against that class).
    """

    # Port used to build the gzserver master URI (see gazebo_master_uri)
    DEFAULT_GZSERVER_PORT = 11345
    # Values handed to the LuganoVizCluster constructor; presumably the number of processes
    # and GPUs requested for the cluster job (confirm against LuganoVizCluster.__init__)
    GAZEBO_PROCESSES = 4
    GAZEBO_GPUS = 1
    # Command template used to open a vglconnect session to the allocated node
    VGLCONNECT_CMD = 'vglconnect bbpnrsoa@{node}.cscs.ch -M -K'

    def __init__(self, timezone=None, reservation=None):
        """
        Initialize both parents and reset all gazebo specific process handles.

        :param timezone: forwarded unchanged to the LuganoVizCluster constructor
        :param reservation: forwarded unchanged to the LuganoVizCluster constructor
        """
        # direct first parent is LuganoVizCluster, reuse the CLE user notifier logger
        super(LuganoVizClusterGazebo, self).__init__(LuganoVizClusterGazebo.GAZEBO_PROCESSES,
                                                     LuganoVizClusterGazebo.GAZEBO_GPUS,
                                                     timezone, reservation)

        # secondary parent is the IGazeboServerInstance interface
        IGazeboServerInstance.__init__(self)

        # local gazebo specific allocation information
        self._x_server_process = None
        self._remote_xvnc_process = None
        self._gazebo_remote_process = None
        self._remote_display_port = -1
        self._watchdog_client = None

    def _start_fake_X(self):
        """
        Start an in memory graphical server. Xvfb or X virtual framebuffer is a display server
        implementing the X11 display server protocol. In contrast to other display servers Xvfb
        performs all graphical operations in memory without showing any screen output. The goal
        is to be able to use vglconnect from the local node to the remote viz cluster node. For
        that, we do need an XServer.

        Xvfb startup should generally produce no output, but handle the following cases:

        - server is already running (e.g. started by root and could not be killed, valid)
        - server needs to initialize GPU/hardware extensions (valid but strange configuration)
        - no output (expected, handled by short timeout with no output)
        - failure to launch (EOF when process reports failure, invalid so abort)

        :raises XvfbXvnError: if the Xvfb process ends (EOF) before any other pattern matches
        """
        notificator.info('Starting backend graphics server')
        self._x_server_process = pexpect.spawn('Xvfb :1', logfile=logger)
        result = self._x_server_process.expect(
            [
                'Server is already active for display',
                'Initializing built-in extension',
                pexpect.TIMEOUT,  # no output (expected)
                pexpect.EOF,      # crash/failed launch
            ],
            self.SMALL_TIMEOUT
        )
        # index 3 is pexpect.EOF in the pattern list above
        if result == 3:
            raise XvfbXvnError("Cannot start Xvfb")

    def _spawn_vglconnect(self):
        """
        Return a pexpect object connected to the allocated viz cluster node.

        :return: the pexpect process running the vglconnect session; the caller has to keep
                 it alive for as long as the remote session is needed
        :raises Exception: if no node/job is allocated, if the node asks for a password or if
                           no prompt shows up before the pexpect timeout
        """
        if self._node is None or self._allocation_process is None:
            raise Exception("Cannot connect to a cluster node without a proper Job allocation.")

        # Ensure $HOME is valid otherwise vglconnect will fail to write session data, this depends
        # on if we are launching from a backend VM or from another cluster node via SSH session
        env = dict(os.environ, DISPLAY=':1')
        home = os.environ.get('HOME')
        # HOME may be missing altogether; os.path.exists(None) would raise a TypeError
        if not home or not os.path.exists(home):
            env['HOME'] = '/gpfs/bbp.cscs.ch/home/bbpnrsoa'

        # Launch a clean bash session and vglconnect with reused ssh authentication
        vglconnect_process = pexpect.spawn('bash', env=env, logfile=logger)
        vglconnect_process.sendline(self.VGLCONNECT_CMD.format(node=self._node))

        # We do expect a prompt here
        result = vglconnect_process.expect([r'\[bbpnrsoa@' + self._node + r'\ ~\]\$',
                                            'password',
                                            pexpect.TIMEOUT])
        if result != 0:
            # nobody else holds a reference to this process yet, do not leak the bash/ssh child
            vglconnect_process.terminate(force=True)
            if result == 1:
                raise Exception("Viz cluster node can't be used without password.")
            raise Exception("Cannot connect to node.")

        # Always load virtualgl on vgl connections to be able to use a 3D display
        vglconnect_process.sendline('module load virtualgl')
        vglconnect_process.sendline('export VGL_FORCEALPHA=1')  # force 32-bit buffers

        return vglconnect_process  # This object has to live until the end.

    def _start_xvnc(self):
        """
        Start a remote Xvnc server. This is the only (known to us) way to have Gazebo using
        the graphic card.

        On success the display number in use is stored in ``_remote_display_port``.

        :raises Exception: if no node/job is allocated
        :raises XvfbXvnError: if Xvnc does not answer in time or no display in 10..99 is free
        """
        if self._node is None or self._allocation_process is None:
            raise Exception("Cannot connect to a cluster node without a proper Job allocation.")

        notificator.info('Starting cluster node graphics server')
        self._remote_xvnc_process = self._spawn_vglconnect()

        # Cleanup any leftover Xvnc sessions from our user on this cluster node from failed
        # sessions (e.g. crashes or other rare network issues). This won't impact other cluster
        # node users or their running Xvnc sessions.
        self._remote_xvnc_process.sendline('killall -9 Xvnc')

        # Find the first available Xvnc port to use, we are not the only cluster user so we cannot
        # guarantee that no other Xvnc is running on a port or that we have access to running
        # sessions. Other users may also spawn instances arbitrarily, this ensures a valid session.
        for p in range(10, 100):
            self._remote_xvnc_process.sendline('Xvnc :' + str(p))
            result = self._remote_xvnc_process.expect([
                'created VNC server for screen 0',
                'Server is already active for display',
                'server already running',
                pexpect.TIMEOUT], self.TIMEOUT)

            # valid Xvnc session spawned on port, stop searching
            if result == 0:
                self._remote_display_port = p
                return

            # timeout while trying to start Xvnc, abort
            elif result == 3:
                raise XvfbXvnError("Cannot start Xvnc, unknown error.")

            # results 1 and 2: the display is already taken, try the next one

        # unable to find an open Xvnc port (very unlikely), abort
        raise XvfbXvnError("Cannot start Xvnc, no open display ports.")

    def _start_gazebo(self, ros_master_uri, models_path=None, gzserver_args=None):
        """
        Start gazebo on the remote server

        :param ros_master_uri: value exported as ROS_MASTER_URI in the remote shell
        :param models_path: optional local path handed to ``_copy_to_remote`` before launch
        :param gzserver_args: optional string exported as GZSERVER_ARGS and expanded on the
                              gzserver command line
        :raises Exception: if no node/job is allocated, if no remote display was started, if
                           the NRP modules cannot be loaded or if gzserver does not start
        """
        if self._node is None or self._allocation_process is None:
            raise Exception("Cannot connect to a cluster node without a proper Job allocation.")

        if self._remote_display_port == -1:
            raise Exception("Gazebo needs a remote X Server running")

        notificator.info('Configuring the cluster node environment')
        self._gazebo_remote_process = self._spawn_vglconnect()

        # Kill any active gzservers (this should never happen).
        self._gazebo_remote_process.sendline('killall -9 gzserver')

        # source environment modules init file
        self._gazebo_remote_process.sendline('source /opt/rh/python27/enable')
        self._gazebo_remote_process.sendline('source /usr/share/Modules/init/bash 2> /dev/null')

        # configure environment variables and get project path
        proj_path = self._configure_environment(self._gazebo_remote_process)

        # loading the environment modules configuration files
        modules_path = proj_path + 'server-scripts/nrp-services-modules.sh'
        self._gazebo_remote_process.sendline('source %s' % modules_path)
        result = self._gazebo_remote_process.expect(['NRP modules loaded.',
                                                     pexpect.TIMEOUT])
        if result == 1:
            raise Exception("Error while configuring cluster node, gpfs may not be mounted."
                            + str(self._gazebo_remote_process.after))

        # configure variables after all module loads (that could overwrite values)
        self._gazebo_remote_process.sendline('export DISPLAY=:' + str(self._remote_display_port))
        self._gazebo_remote_process.sendline('export ROS_MASTER_URI=' + ros_master_uri)

        # Use the appropriate dev or staging models based on this backend version
        self._gazebo_remote_process.sendline('export GAZEBO_MODEL_PATH=%s/models' % proj_path)

        # disable online (unreachable) model searching, only use local NRP models
        self._gazebo_remote_process.sendline('export GAZEBO_MODEL_DATABASE_URI=')

        # copy robot, if needed
        if models_path is not None:
            notificator.info("Copy robot to remote server")
            self._copy_to_remote(models_path)
            # NOTE(review): this exports GAZEBO_MODELS_PATH while the variable exported above
            # is GAZEBO_MODEL_PATH -- confirm which name the remote scripts/plugins read.
            self._gazebo_remote_process.sendline('export GAZEBO_MODELS_PATH='
                                                 '{trg}:$GAZEBO_MODELS_PATH'
                                                 .format(trg=self._tmp_dir))

        # launch the watchdog inside the NRP virtualenv
        self._gazebo_remote_process.sendline('source %s/platform_venv/bin/activate' % proj_path)
        self._gazebo_remote_process.sendline('sleep 10 && python '
                                             '-m hbp_nrp_watchdog.WatchdogServer -n Watchdog '
                                             '-p gzserver -t /gazebo/health &')
        # remember the background job PID so that stop() can kill it
        self._gazebo_remote_process.sendline('export WATCHDOG_PID=$!')
        self._gazebo_remote_process.sendline('deactivate')

        # activate ROS python venv to launch Gazebo
        self._gazebo_remote_process.sendline('source $ROS_PYTHON_VENV/bin/activate')

        # configure command line arguments for gzserver if provided
        if gzserver_args is not None:
            self._gazebo_remote_process.sendline('export GZSERVER_ARGS="%s"' % gzserver_args)

        # launch Gazebo with virtualgl, use -nodl to redirect native opengl calls to virtualgl
        notificator.info('Starting Gazebo server on the cluster node')
        lib_dir_env = "$ROS_HBP_PACKAGES_LIB_DIR"
        gzserver_cmd = ('vglrun -nodl $GAZEBO_BIN_DIR/gzserver '
                        '$GZSERVER_ARGS '
                        '--pause '
                        '-s <lib_env>/libgazebo_ros_api_plugin.so '
                        '-s <lib_env>/libgazebo_ros_paths_plugin.so '
                        '-s <lib_env>/libgazebo_ros_recording_plugin.so '
                        '-s <lib_env>/libgazebo_ros_playback_plugin.so '
                        '--verbose').replace("<lib_env>", lib_dir_env)
        self._gazebo_remote_process.sendline(gzserver_cmd)
        result = self._gazebo_remote_process.expect(['Gazebo multi-robot simulator',
                                                     pexpect.TIMEOUT])

        if result == 1:
            raise Exception("Error while starting gazebo: %s"
                            % str(self._gazebo_remote_process.after))

    def _start_watchdog_client(self):
        """
        Starts the watchdog client
        """
        self._watchdog_client = WatchdogClient("/gazebo/health", self._raise_gazebo_died)
        # Delay starting the watchdog client by 10s (because we delay the start of the watchdog 10s)
        self._watchdog_client.start(delay=10)

    def start(self, ros_master_uri, models_path=None, gzserver_args=None):
        """
        Start gzserver on the Lugano viz cluster

        Allocates a job, starts the local Xvfb, the remote Xvnc, the remote gzserver and
        finally the local watchdog client. If any step fails, stop() is invoked and the
        original failure is re-raised.

        :param ros_master_uri: value exported as ROS_MASTER_URI on the cluster node
        :param models_path: optional local path to copy to the cluster node
        :param gzserver_args: optional extra command line arguments for gzserver
        """
        try:
            self._allocate_job(reuse_nodes=False)  # only one gzserver per cluster node
            self._start_fake_X()
            self._start_xvnc()
            self._start_gazebo(ros_master_uri, models_path, gzserver_args)
            self._start_watchdog_client()

        # pylint: disable=broad-except
        except Exception:
            logger.exception('Failure launching gzserver on remote node.')

            # always cleanup, but only raise the start exception up, not any shutdown errors
            try:
                self.stop()
            # pylint: disable=broad-except
            except Exception:
                # keep a trace of the shutdown problem instead of dropping it silently
                logger.exception('Error during cleanup after the failed gzserver launch.')

            raise

    def try_extend(self, new_timeout):
        """
        Verifies that the gazebo can accept the new simulation timeout

        :param new_timeout: the requested timeout, compared against ``_allocation_time``
        :return: whether the timeout is accepted (always True if no allocation time is set)
        """
        return self._allocation_time is None or new_timeout <= self._allocation_time

    @property
    def gazebo_master_uri(self):
        """
        Returns a string containing the gzserver master
        URI (like:'http://bbpviz001.cscs.ch:11345'), or None if no node is allocated
        """
        if self._node is not None:
            return ('http://' + self._node + self.NODE_DOMAIN + ':'
                    + str(self.DEFAULT_GZSERVER_PORT))
        else:
            return None

    def stop(self):
        """
        Stop the watchdog client, the remote gzserver and Xvnc, release the cluster job via
        LuganoVizCluster.stop and finally terminate the local Xvfb.

        Errors while stopping the watchdog client or cleaning up the cluster node are logged
        and do not prevent the job release. An error raised by LuganoVizCluster.stop is
        propagated, but only after the local Xvfb has been terminated.
        """
        # stop the local watchdog client before terminating the remote side
        if self._watchdog_client is not None:
            try:
                self._watchdog_client.stop()
            # pylint: disable=broad-except
            except Exception:
                # a failing client must not keep us from releasing the cluster job below
                logger.exception('Error stopping the watchdog client.')
            finally:
                self._watchdog_client = None

        # cluster node cleanup (this can fail, but make sure we always release the job below)
        try:

            # terminate running remote watchdog, gzserver, and invoking bash shell
            if self._gazebo_remote_process:
                notificator.info('Stopping Gazebo server on the cluster node')
                # Ctrl-Z suspends the foreground gzserver so the shell accepts the kill commands
                self._gazebo_remote_process.sendcontrol('z')
                self._gazebo_remote_process.sendline('kill -v -n 9 $WATCHDOG_PID')
                self._gazebo_remote_process.sendline('killall -v -9 gzserver')
                self._gazebo_remote_process.expect([pexpect.TIMEOUT,
                                                    'Killed',
                                                    'gzserver: no process killed'], self.TIMEOUT)
                self._gazebo_remote_process.terminate()

            # directly terminate Xvnc process (not invoked via bash)
            if self._remote_xvnc_process:
                notificator.info('Stopping cluster node graphics server')
                self._remote_xvnc_process.terminate()

        # pylint: disable=broad-except
        except Exception:
            logger.exception('Error cleaning up cluster node.')

        finally:
            self._gazebo_remote_process = None
            self._remote_xvnc_process = None
            self._remote_display_port = -1

        try:
            # SLURM cleanup and temporary folder deletion (must happen after any cluster
            # cleanup as this will deallocate the process)
            LuganoVizCluster.stop(self)
        finally:
            # cleserver cleanup Xvfb, this is not critical
            if self._x_server_process:
                notificator.info('Stopping backend graphics server')
                self._x_server_process.terminate()
                self._x_server_process = None

    def restart(self, ros_master_uri, models_path=None, gzserver_args=None):
        """
        Stop the running gzserver (if any) and start a new one on the cluster.

        :param ros_master_uri: value exported as ROS_MASTER_URI on the cluster node
        :param models_path: optional local path to copy to the cluster node
        :param gzserver_args: optional extra command line arguments for gzserver
        """
        notificator.info("Restarting Gazebo server on the cluster")
        self.stop()
        self.start(ros_master_uri, models_path, gzserver_args)
class XvfbXvnError(Exception):
    """
    This exception class is a marker for errors coming from Xvfb or Xvnc.
    """
def _get_roscore_master_uri(interface='eth0'):
    """
    Return roscore master URI. If the env variable ROS_MASTER_URI is not set (or is empty),
    then construct it like this: http:// + local_ip + :11311

    :param interface: name of the network interface whose first IPv4 address is used when
                      ROS_MASTER_URI is not set (defaults to 'eth0')
    :return: the roscore master URI as a string
    :raises: whatever the netifaces lookup raises when the interface is unknown or has no
             IPv4 address (presumably ValueError / KeyError -- confirm against netifaces)
    """
    master_uri = os.environ.get("ROS_MASTER_URI")
    if master_uri:
        return master_uri

    # fall back to the first IPv4 address of the requested interface
    local_ip = netifaces.ifaddresses(interface)[netifaces.AF_INET][0]['addr']
    return 'http://' + local_ip + ':11311'
# Useful to test out the code.
if __name__ == '__main__':  # pragma: no cover
    # Log everything from the cluster logger to stdout while running by hand
    log_format = '%(asctime)s [%(threadName)-12.12s] [%(name)-12.12s] [%(levelname)s] %(message)s'
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(logging.Formatter(log_format))
    logger.setLevel(logging.DEBUG)
    logger.addHandler(console_handler)

    gazebo = LuganoVizClusterGazebo()
    # start() already cleans up after itself when it fails
    gazebo.start(_get_roscore_master_uri())
    try:
        time.sleep(100)
    finally:
        # release the cluster job even if the sleep is interrupted (e.g. Ctrl-C)
        gazebo.stop()