Source code for hbp_nrp_cleserver.server.LuganoVizClusterGazebo

# ---LICENSE-BEGIN - DO NOT CHANGE OR MOVE THIS HEADER
# This file is part of the Neurorobotics Platform software
# Copyright (C) 2014,2015,2016,2017 Human Brain Project
# https://www.humanbrainproject.eu
#
# The Human Brain Project is a European Commission funded project
# in the frame of the Horizon2020 FET Flagship plan.
# http://ec.europa.eu/programmes/horizon2020/en/h2020-section/fet-flagships
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
# ---LICENSE-END
#!/usr/bin/env python
"""
This module contains the classes needed to have all gazebo services running on the Lugano viz
cluster.
"""

from builtins import range

import pexpect
import time
import logging
import os
import sys
import netifaces

from hbp_nrp_commons.cluster.LuganoVizCluster import LuganoVizCluster, notificator, logger
from hbp_nrp_cleserver.server.GazeboInterface import IGazeboServerInstance
from hbp_nrp_watchdog.WatchdogServer import WatchdogClient


class LuganoVizClusterGazebo(LuganoVizCluster, IGazeboServerInstance):
    """
    Represents an instance of gzserver running on the Lugano viz cluster.

    There is a wide usage of pexpect in this class because of the remote connections via ssh
    needed to access the Lugano machines. Every time a remote command is launched, expect() is
    used to match strings with the shell output in order to understand its status.

    Attributes such as ``_node``, ``_allocation_process``, ``_allocation_time``, ``_tmp_dir``,
    ``TIMEOUT``, ``SMALL_TIMEOUT`` and ``NODE_DOMAIN`` are not defined in this class; they are
    presumably provided by LuganoVizCluster (verify against the parent class).
    """

    DEFAULT_GZSERVER_PORT = 11345
    GAZEBO_PROCESSES = 4
    GAZEBO_GPUS = 1
    VGLCONNECT_CMD = 'vglconnect bbpnrsoa@{node}.cscs.ch -M -K'

    def __init__(self, timezone=None, reservation=None):
        """
        Create an (unstarted) remote gzserver instance.

        :param timezone: forwarded unchanged to the LuganoVizCluster constructor
        :param reservation: forwarded unchanged to the LuganoVizCluster constructor
        """
        # direct first parent is LuganoVizCluster, reuse the CLE user notifier logger
        super(LuganoVizClusterGazebo, self).__init__(LuganoVizClusterGazebo.GAZEBO_PROCESSES,
                                                     LuganoVizClusterGazebo.GAZEBO_GPUS,
                                                     timezone, reservation)

        # secondary parent is the IGazeboServerInstance interface
        IGazeboServerInstance.__init__(self)

        # local gazebo specific allocation information
        self._x_server_process = None
        self._remote_xvnc_process = None
        self._gazebo_remote_process = None
        self._remote_display_port = -1  # -1 means "no remote Xvnc display started"
        self._watchdog_client = None

    def _start_fake_X(self):
        """
        Start an in memory graphical server.

        Xvfb or X virtual framebuffer is a display server implementing the X11 display server
        protocol. In contrast to other display servers Xvfb performs all graphical operations in
        memory without showing any screen output.

        The goal is to be able to use vglconnect from the local node to the remote viz cluster
        node. For that, we do need an XServer.

        Xvfb startup should generally produce no output, but handle the following cases:

            - server is already running (e.g. started by root and could not be killed, valid)
            - server needs to initialize GPU/hardware extensions (valid but strange
              configuration)
            - no output (expected, handled by short timeout with no output)
            - failure to launch (EOF when process reports failure, invalid so abort)

        :raises XvfbXvnError: if the Xvfb process terminates right after being spawned
        """
        notificator.info('Starting backend graphics server')
        self._x_server_process = pexpect.spawn('Xvfb :1', logfile=logger)
        result = self._x_server_process.expect(
            ['Server is already active for display',
             'Initializing built-in extension',
             pexpect.TIMEOUT,  # no output (expected)
             pexpect.EOF],     # crash/failed launch
            self.SMALL_TIMEOUT)

        # index 3 is pexpect.EOF: the process ended, every other outcome is acceptable
        if result == 3:
            raise XvfbXvnError("Cannot start Xvfb")

    def _spawn_vglconnect(self):
        """
        Return a pexpect object connected to the allocated viz cluster node.

        The returned process has to be kept alive by the caller for as long as the remote
        session is needed.

        :return: the pexpect process running the vglconnect session
        :raises Exception: if no job is allocated, if the node asks for a password or if the
                           expected remote prompt does not show up in time
        """
        if self._node is None or self._allocation_process is None:
            raise Exception("Cannot connect to a cluster node without a proper Job allocation.")

        # Ensure $HOME is valid otherwise vglconnect will fail to write session data, this depends
        # on if we are launching from a backend VM or from another cluster node via SSH session
        env = dict(os.environ, DISPLAY=':1')
        home = os.environ.get('HOME')
        # an unset HOME must take the fallback too, os.path.exists(None) would raise a TypeError
        if not home or not os.path.exists(home):
            env['HOME'] = '/gpfs/bbp.cscs.ch/home/bbpnrsoa'

        # Launch a clean bash session and vglconnect with reused ssh authentication
        vglconnect_process = pexpect.spawn('bash', env=env, logfile=logger)
        vglconnect_process.sendline(self.VGLCONNECT_CMD.format(node=self._node))

        # We do expect a prompt here
        result = vglconnect_process.expect([r'\[bbpnrsoa@' + self._node + r'\ ~\]\$',
                                            'password',
                                            pexpect.TIMEOUT])
        if result == 1:
            raise Exception("Viz cluster node can't be used without password.")
        if result == 2:
            raise Exception("Cannot connect to node.")

        # Always load virtualgl on vgl connections to be able to use a 3D display
        vglconnect_process.sendline('module load virtualgl')
        vglconnect_process.sendline('export VGL_FORCEALPHA=1')  # force 32-bit buffers

        return vglconnect_process  # This object has to live until the end.

    def _start_xvnc(self):
        """
        Start a remote Xvnc server. This is the only (known to us) way to have Gazebo using
        the graphic card.

        On success the display number in use is stored in ``_remote_display_port``.

        :raises Exception: if no job is allocated
        :raises XvfbXvnError: if Xvnc does not answer in time or no display number is free
        """
        if self._node is None or self._allocation_process is None:
            raise Exception("Cannot connect to a cluster node without a proper Job allocation.")

        notificator.info('Starting cluster node graphics server')
        self._remote_xvnc_process = self._spawn_vglconnect()

        # Cleanup any leftover Xvnc sessions from our user on this cluster node from failed
        # sessions (e.g. crashes or other rare network issues). This won't impact other cluster
        # node users or their running Xvnc sessions.
        self._remote_xvnc_process.sendline('killall -9 Xvnc')

        # Find the first available Xvnc port to use, we are not the only cluster user so we cannot
        # guarantee that no other Xvnc is running on a port or that we have access to running
        # sessions. Other users may also spawn instances arbitrarily, this ensures a valid session.
        for p in range(10, 100):
            self._remote_xvnc_process.sendline('Xvnc :' + str(p))
            result = self._remote_xvnc_process.expect(['created VNC server for screen 0',
                                                       'Server is already active for display',
                                                       'server already running',
                                                       pexpect.TIMEOUT], self.TIMEOUT)

            # valid Xvnc session spawned on port, stop searching
            if result == 0:
                self._remote_display_port = p
                return

            # timeout while trying to start Xvnc, abort
            elif result == 3:
                raise XvfbXvnError("Cannot start Xvnc, unknown error.")

            # results 1 and 2 mean the display is taken, try the next one

        # unable to find an open Xvnc port (very unlikely), abort
        raise XvfbXvnError("Cannot start Xvnc, no open display ports.")

    def _start_gazebo(self, ros_master_uri, models_path=None, gzserver_args=None):
        """
        Start gazebo on the remote server

        :param ros_master_uri: value exported as ROS_MASTER_URI in the remote shell
        :param models_path: optional local path handed to ``_copy_to_remote`` before launch
        :param gzserver_args: optional extra command line arguments for gzserver
        :raises Exception: if no job is allocated, if no remote X server is running, if the
                           NRP modules cannot be loaded or if gzserver does not report startup
        """
        if self._node is None or self._allocation_process is None:
            raise Exception("Cannot connect to a cluster node without a proper Job allocation.")

        if self._remote_display_port == -1:
            raise Exception("Gazebo needs a remote X Server running")

        notificator.info('Configuring the cluster node environment')
        self._gazebo_remote_process = self._spawn_vglconnect()

        # Kill any active gzservers (this should never happen).
        self._gazebo_remote_process.sendline('killall -9 gzserver')

        # source environment modules init file
        self._gazebo_remote_process.sendline('source /opt/rh/python27/enable')
        self._gazebo_remote_process.sendline('source /usr/share/Modules/init/bash 2> /dev/null')

        # configure environment variables and get project path
        proj_path = self._configure_environment(self._gazebo_remote_process)

        # loading the environment modules configuration files
        modules_path = proj_path + 'server-scripts/nrp-services-modules.sh'
        self._gazebo_remote_process.sendline('source %s' % modules_path)
        result = self._gazebo_remote_process.expect(['NRP modules loaded.',
                                                     pexpect.TIMEOUT])
        if result == 1:
            raise Exception("Error while configuring cluster node, gpfs may not be mounted." +
                            str(self._gazebo_remote_process.after))

        # configure variables after all module loads (that could overwrite values)
        self._gazebo_remote_process.sendline('export DISPLAY=:' + str(self._remote_display_port))
        self._gazebo_remote_process.sendline('export ROS_MASTER_URI=' + ros_master_uri)

        # Use the appropriate dev or staging models based on this backend version
        self._gazebo_remote_process.sendline('export GAZEBO_MODEL_PATH=%s/models' % proj_path)

        # disable online (unreachable) model searching, only use local NRP models
        self._gazebo_remote_process.sendline('export GAZEBO_MODEL_DATABASE_URI=')

        # copy robot, if needed
        if models_path is not None:
            notificator.info("Copy robot to remote server")
            self._copy_to_remote(models_path)
            # NOTE(review): this exports GAZEBO_MODELS_PATH while the export above uses
            # GAZEBO_MODEL_PATH - confirm that the different variable name is intended
            self._gazebo_remote_process.sendline('export GAZEBO_MODELS_PATH='
                                                 '{trg}:$GAZEBO_MODELS_PATH'
                                                 .format(trg=self._tmp_dir))

        # launch the watchdog inside the NRP virtualenv
        self._gazebo_remote_process.sendline('source %s/platform_venv/bin/activate' % proj_path)
        self._gazebo_remote_process.sendline('sleep 10 && python '
                                             '-m hbp_nrp_watchdog.WatchdogServer -n Watchdog '
                                             '-p gzserver -t /gazebo/health &')
        # remember the background job so that stop() can kill it
        self._gazebo_remote_process.sendline('export WATCHDOG_PID=$!')
        self._gazebo_remote_process.sendline('deactivate')

        # activate ROS python venv to launch Gazebo
        self._gazebo_remote_process.sendline('source $ROS_PYTHON_VENV/bin/activate')

        # configure command line arguments for gzserver if provided
        if gzserver_args is not None:
            self._gazebo_remote_process.sendline('export GZSERVER_ARGS="%s"' % gzserver_args)

        # launch Gazebo with virtualgl, use -nodl to redirect native opengl calls to virtualgl
        notificator.info('Starting Gazebo server on the cluster node')
        lib_dir_env = "$ROS_HBP_PACKAGES_LIB_DIR"
        # the adjacent literals are joined first, so replace() covers the whole command line
        self._gazebo_remote_process.sendline(
            'vglrun -nodl $GAZEBO_BIN_DIR/gzserver '
            '$GZSERVER_ARGS '
            '--pause '
            '-s <lib_env>/libgazebo_ros_api_plugin.so '
            '-s <lib_env>/libgazebo_ros_paths_plugin.so '
            '-s <lib_env>/libgazebo_ros_recording_plugin.so '
            '-s <lib_env>/libgazebo_ros_playback_plugin.so '
            '--verbose'.replace("<lib_env>", lib_dir_env))
        result = self._gazebo_remote_process.expect(['Gazebo multi-robot simulator',
                                                     pexpect.TIMEOUT])

        if result == 1:
            raise Exception("Error while starting gazebo: %s"
                            % str(self._gazebo_remote_process.after))

    def _start_watchdog_client(self):
        """
        Starts the watchdog client

        The client is created for the /gazebo/health topic with ``_raise_gazebo_died`` as its
        callback.
        """
        self._watchdog_client = WatchdogClient("/gazebo/health", self._raise_gazebo_died)
        # Delay starting the watchdog client by 10s (because we delay the start of the watchdog 10s)
        self._watchdog_client.start(delay=10)

    def _stop_after_failed_start(self):
        """
        Best-effort cleanup used by start() when the launch failed.

        Shutdown errors are logged and swallowed here, inside a separate function, so that the
        bare ``raise`` in start() keeps referring to the launch failure.
        """
        try:
            self.stop()
        # pylint: disable=broad-except
        except Exception:
            logger.exception('Error cleaning up after failed gzserver launch.')

    def start(self, ros_master_uri, models_path=None, gzserver_args=None):
        """
        Start gzserver on the Lugano viz cluster

        :param ros_master_uri: ROS master URI the remote gzserver has to connect to
        :param models_path: optional local models path to copy to the remote node
        :param gzserver_args: optional extra command line arguments for gzserver
        :raises Exception: the error of the failing launch step, after cleanup was attempted
        """
        try:
            self._allocate_job(reuse_nodes=False)  # only one gzserver per cluster node
            self._start_fake_X()
            self._start_xvnc()
            self._start_gazebo(ros_master_uri, models_path, gzserver_args)
            self._start_watchdog_client()

        # pylint: disable=broad-except
        except Exception:
            logger.exception('Failure launching gzserver on remote node.')

            # always cleanup, but only raise the start exception up, not any shutdown errors
            self._stop_after_failed_start()
            raise

    def try_extend(self, new_timeout):
        """
        Verifies that the gazebo can accept the new simulation timeout

        :param new_timeout: the requested timeout, compared against ``_allocation_time``
        :return: whether the timeout is accepted (always True when ``_allocation_time`` is None)
        """
        return self._allocation_time is None or new_timeout <= self._allocation_time

    @property
    def gazebo_master_uri(self):
        """
        Returns a string containing the gzserver master
        URI (like:'http://bbpviz001.cscs.ch:11345'), or None if no node is allocated
        """
        if self._node is None:
            return None

        return ('http://' + self._node + self.NODE_DOMAIN + ':' +
                str(self.DEFAULT_GZSERVER_PORT))

    def stop(self):
        """
        Stop gzserver and release everything start() set up.

        The steps run in this order: local watchdog client, remote watchdog/gzserver/Xvnc,
        LuganoVizCluster.stop() and finally the local Xvfb. Failures of the first two steps are
        logged and do not prevent the later ones, an error raised by LuganoVizCluster.stop() is
        propagated after Xvfb was terminated.
        """
        # stop the local watchdog client before terminating the remote side, a failure here
        # must not keep the cluster job allocated
        if self._watchdog_client is not None:
            try:
                self._watchdog_client.stop()
            # pylint: disable=broad-except
            except Exception:
                logger.exception('Error stopping the watchdog client.')
            finally:
                self._watchdog_client = None

        # cluster node cleanup (this can fail, but make sure we always release the job below)
        try:

            # terminate running remote watchdog, gzserver, and invoking bash shell
            if self._gazebo_remote_process:
                notificator.info('Stopping Gazebo server on the cluster node')
                # suspend the foreground gzserver to get the shell back for the kill commands
                self._gazebo_remote_process.sendcontrol('z')
                self._gazebo_remote_process.sendline('kill -v -n 9 $WATCHDOG_PID')
                self._gazebo_remote_process.sendline('killall -v -9 gzserver')
                self._gazebo_remote_process.expect([pexpect.TIMEOUT,
                                                    'Killed',
                                                    'gzserver: no process killed'],
                                                   self.TIMEOUT)
                self._gazebo_remote_process.terminate()

            # directly terminate Xvnc process (not invoked via bash)
            if self._remote_xvnc_process:
                notificator.info('Stopping cluster node graphics server')
                self._remote_xvnc_process.terminate()

        # pylint: disable=broad-except
        except Exception:
            logger.exception('Error cleaning up cluster node.')
        finally:
            self._gazebo_remote_process = None
            self._remote_xvnc_process = None
            self._remote_display_port = -1

        try:
            # SLURM cleanup and temporary folder deletion (must happen after any cluster cleanup
            # as this will deallocate the process)
            LuganoVizCluster.stop(self)
        finally:
            # cleserver cleanup Xvfb, this is not critical
            if self._x_server_process:
                notificator.info('Stopping backend graphics server')
                self._x_server_process.terminate()
                self._x_server_process = None

    def restart(self, ros_master_uri, models_path=None, gzserver_args=None):
        """
        Stop the remote gzserver and start it again.

        :param ros_master_uri: ROS master URI the remote gzserver has to connect to
        :param models_path: optional local models path, forwarded to start()
        :param gzserver_args: optional extra gzserver arguments, forwarded to start()
        """
        notificator.info("Restarting Gazebo server on the cluster")
        self.stop()
        self.start(ros_master_uri, models_path, gzserver_args)
class XvfbXvnError(Exception):
    """
    Marker exception raised when the Xvfb or Xvnc graphics servers cannot be started.

    It adds nothing to Exception, it only lets callers tell these failures apart.
    """
def _get_roscore_master_uri(interface='eth0'):
    """
    Return roscore master URI.

    The value of the env variable ROS_MASTER_URI is returned when it is set to a non-empty
    string. Otherwise the URI is constructed like this: http:// + local_ip + :11311, where
    local_ip is the first IPv4 address netifaces reports for the given interface.

    :param interface: name of the network interface the local IP is read from, only used when
                      ROS_MASTER_URI is not set (defaults to 'eth0')
    :return: the roscore master URI as a string
    """
    master_uri = os.environ.get("ROS_MASTER_URI")
    if master_uri:
        return master_uri

    local_ip = netifaces.ifaddresses(interface)[netifaces.AF_INET][0]['addr']
    return 'http://' + local_ip + ':11311'


# Useful to test out the code.
if __name__ == '__main__':  # pragma: no cover
    log_format = '%(asctime)s [%(threadName)-12.12s] [%(name)-12.12s] [%(levelname)s]  %(message)s'
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(logging.Formatter(log_format))
    logger.setLevel(logging.DEBUG)
    logger.addHandler(console_handler)

    gazebo = LuganoVizClusterGazebo()
    # start() cleans up after itself on failure, so only the running phase needs the guard
    gazebo.start(_get_roscore_master_uri())
    try:
        time.sleep(100)
    finally:
        # release the cluster allocation even if the sleep is interrupted (e.g. Ctrl-C)
        gazebo.stop()
Copy to clipboard