[ARVADOS] updated: c193d814c22e2a4227c7f49e76b0d9b589cff4be
Git user
git at public.curoverse.com
Tue May 17 16:59:43 EDT 2016
Summary of changes:
services/nodemanager/arvnodeman/baseactor.py | 12 +++++-------
services/nodemanager/arvnodeman/config.py | 2 +-
services/nodemanager/arvnodeman/launcher.py | 9 ++++++++-
services/nodemanager/tests/test_failure.py | 18 +++++++++---------
4 files changed, 23 insertions(+), 18 deletions(-)
via c193d814c22e2a4227c7f49e76b0d9b589cff4be (commit)
from 1fd5716e1714337b6ff96f6725e1f22c7a6ceb65 (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
commit c193d814c22e2a4227c7f49e76b0d9b589cff4be
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date: Tue May 17 16:59:20 2016 -0400
8236: Restore os.killpg(). Create a new process group so that it won't kill
the parent process by accident. The watchdog actor now monitors only the
specific actors passed to it.
diff --git a/services/nodemanager/arvnodeman/baseactor.py b/services/nodemanager/arvnodeman/baseactor.py
index 840ba4c..b48e007 100644
--- a/services/nodemanager/arvnodeman/baseactor.py
+++ b/services/nodemanager/arvnodeman/baseactor.py
@@ -84,7 +84,7 @@ class BaseNodeManagerActor(pykka.ThreadingActor):
if (exception_type in (threading.ThreadError, MemoryError) or
exception_type is OSError and exception_value.errno == errno.ENOMEM):
lg.critical("Unhandled exception is a fatal error, killing Node Manager")
- os.kill(os.getpid(), signal.SIGQUIT)
+ os.killpg(os.getpgid(0), signal.SIGKILL)
def ping(self):
return True
@@ -94,24 +94,22 @@ class WatchdogActor(pykka.ThreadingActor):
def __init__(self, timeout, *args, **kwargs):
super(pykka.ThreadingActor, self).__init__(*args, **kwargs)
self.timeout = timeout
+ self.actors = [a.proxy() for a in args]
self.actor_ref = TellableActorRef(self)
self._later = self.actor_ref.tell_proxy()
def kill_self(self, act):
lg = getattr(self, "_logger", logging)
lg.critical("Actor %s watchdog ping time out, killing Node Manager", act)
- os.kill(os.getpid(), signal.SIGQUIT)
+ os.killpg(os.getpgid(0), signal.SIGKILL)
def on_start(self):
self._later.run()
def run(self):
- actors = pykka.ActorRegistry.get_all()
- for a in actors:
- if a.actor_class is WatchdogActor:
- continue
+ for a in self.actors:
try:
- a.proxy().ping().get(self.timeout)
+ a.ping().get(self.timeout)
except pykka.ActorDeadError:
pass
except pykka.Timeout:
diff --git a/services/nodemanager/arvnodeman/config.py b/services/nodemanager/arvnodeman/config.py
index 30c82e7..b54461c 100644
--- a/services/nodemanager/arvnodeman/config.py
+++ b/services/nodemanager/arvnodeman/config.py
@@ -45,7 +45,7 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
'max_total_price': '0',
'boot_fail_after': str(sys.maxint),
'node_stale_after': str(60 * 60 * 2),
- 'watchdog': 600},
+ 'watchdog': '600'},
'Logging': {'file': '/dev/stderr',
'level': 'WARNING'},
}.iteritems():
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
index 4554c4c..833da5d 100644
--- a/services/nodemanager/arvnodeman/launcher.py
+++ b/services/nodemanager/arvnodeman/launcher.py
@@ -99,6 +99,9 @@ def main(args=None):
args = parse_cli(args)
config = load_config(args.config)
+ # Create a new process group.
+ os.setsid()
+
if not args.foreground:
daemon.DaemonContext().open()
for sigcode in [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM]:
@@ -126,7 +129,11 @@ def main(args=None):
node_setup, node_shutdown, node_monitor,
max_total_price=config.getfloat('Daemon', 'max_total_price')).tell_proxy()
- WatchdogActor.start(config.getint('Daemon', 'watchdog'))
+ WatchdogActor.start(config.getint('Daemon', 'watchdog'),
+ cloud_node_poller.actor_ref,
+ arvados_node_poller.actor_ref,
+ job_queue_poller.actor_ref,
+ node_daemon.actor_ref)
signal.pause()
daemon_stopped = node_daemon.actor_ref.actor_stopped.is_set
diff --git a/services/nodemanager/tests/test_failure.py b/services/nodemanager/tests/test_failure.py
index 0a121b6..f543f64 100644
--- a/services/nodemanager/tests/test_failure.py
+++ b/services/nodemanager/tests/test_failure.py
@@ -32,24 +32,24 @@ class BogusActor(arvnodeman.baseactor.BaseNodeManagerActor):
class ActorUnhandledExceptionTest(unittest.TestCase):
def test_fatal_error(self):
for e in (MemoryError(), threading.ThreadError(), OSError(errno.ENOMEM, "")):
- with mock.patch('os.kill') as kill_mock:
+ with mock.patch('os.killpg') as killpg_mock:
act = BogusActor.start(e).tell_proxy()
act.doStuff()
act.actor_ref.stop(block=True)
- self.assertTrue(kill_mock.called)
+ self.assertTrue(killpg_mock.called)
- @mock.patch('os.kill')
- def test_nonfatal_error(self, kill_mock):
+ @mock.patch('os.killpg')
+ def test_nonfatal_error(self, killpg_mock):
act = BogusActor.start(OSError(errno.ENOENT, "")).tell_proxy()
act.doStuff()
act.actor_ref.stop(block=True)
- self.assertFalse(kill_mock.called)
+ self.assertFalse(killpg_mock.called)
class WatchdogActorTest(unittest.TestCase):
- @mock.patch('os.kill')
- def test_time_timout(self, kill_mock):
+ @mock.patch('os.killpg')
+ def test_time_timout(self, killpg_mock):
act = BogusActor.start(OSError(errno.ENOENT, ""))
- watch = arvnodeman.baseactor.WatchdogActor.start(1)
+ watch = arvnodeman.baseactor.WatchdogActor.start(1, act)
watch.stop(block=True)
act.stop(block=True)
- self.assertTrue(kill_mock.called)
+ self.assertTrue(killpg_mock.called)
-----------------------------------------------------------------------
hooks/post-receive
--
More information about the arvados-commits
mailing list