[ARVADOS] updated: c193d814c22e2a4227c7f49e76b0d9b589cff4be

Git user git at public.curoverse.com
Tue May 17 16:59:43 EDT 2016


Summary of changes:
 services/nodemanager/arvnodeman/baseactor.py | 12 +++++-------
 services/nodemanager/arvnodeman/config.py    |  2 +-
 services/nodemanager/arvnodeman/launcher.py  |  9 ++++++++-
 services/nodemanager/tests/test_failure.py   | 18 +++++++++---------
 4 files changed, 23 insertions(+), 18 deletions(-)

       via  c193d814c22e2a4227c7f49e76b0d9b589cff4be (commit)
      from  1fd5716e1714337b6ff96f6725e1f22c7a6ceb65 (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.


commit c193d814c22e2a4227c7f49e76b0d9b589cff4be
Author: Peter Amstutz <peter.amstutz at curoverse.com>
Date:   Tue May 17 16:59:20 2016 -0400

    8236: Restore os.killpg().  Create a new process group so that it won't kill
    the parent process by accident.  Watchdog process now only monitors specific
    actors.

diff --git a/services/nodemanager/arvnodeman/baseactor.py b/services/nodemanager/arvnodeman/baseactor.py
index 840ba4c..b48e007 100644
--- a/services/nodemanager/arvnodeman/baseactor.py
+++ b/services/nodemanager/arvnodeman/baseactor.py
@@ -84,7 +84,7 @@ class BaseNodeManagerActor(pykka.ThreadingActor):
         if (exception_type in (threading.ThreadError, MemoryError) or
             exception_type is OSError and exception_value.errno == errno.ENOMEM):
             lg.critical("Unhandled exception is a fatal error, killing Node Manager")
-            os.kill(os.getpid(), signal.SIGQUIT)
+            os.killpg(os.getpgid(0), signal.SIGKILL)
 
     def ping(self):
         return True
@@ -94,24 +94,22 @@ class WatchdogActor(pykka.ThreadingActor):
     def __init__(self, timeout, *args, **kwargs):
          super(pykka.ThreadingActor, self).__init__(*args, **kwargs)
          self.timeout = timeout
+         self.actors = [a.proxy() for a in args]
          self.actor_ref = TellableActorRef(self)
          self._later = self.actor_ref.tell_proxy()
 
     def kill_self(self, act):
         lg = getattr(self, "_logger", logging)
         lg.critical("Actor %s watchdog ping time out, killing Node Manager", act)
-        os.kill(os.getpid(), signal.SIGQUIT)
+        os.killpg(os.getpgid(0), signal.SIGKILL)
 
     def on_start(self):
         self._later.run()
 
     def run(self):
-        actors = pykka.ActorRegistry.get_all()
-        for a in actors:
-            if a.actor_class is WatchdogActor:
-                continue
+        for a in self.actors:
             try:
-                a.proxy().ping().get(self.timeout)
+                a.ping().get(self.timeout)
             except pykka.ActorDeadError:
                 pass
             except pykka.Timeout:
diff --git a/services/nodemanager/arvnodeman/config.py b/services/nodemanager/arvnodeman/config.py
index 30c82e7..b54461c 100644
--- a/services/nodemanager/arvnodeman/config.py
+++ b/services/nodemanager/arvnodeman/config.py
@@ -45,7 +45,7 @@ class NodeManagerConfig(ConfigParser.SafeConfigParser):
                        'max_total_price': '0',
                        'boot_fail_after': str(sys.maxint),
                        'node_stale_after': str(60 * 60 * 2),
-                       'watchdog': 600},
+                       'watchdog': '600'},
             'Logging': {'file': '/dev/stderr',
                         'level': 'WARNING'},
         }.iteritems():
diff --git a/services/nodemanager/arvnodeman/launcher.py b/services/nodemanager/arvnodeman/launcher.py
index 4554c4c..833da5d 100644
--- a/services/nodemanager/arvnodeman/launcher.py
+++ b/services/nodemanager/arvnodeman/launcher.py
@@ -99,6 +99,9 @@ def main(args=None):
     args = parse_cli(args)
     config = load_config(args.config)
 
+    # Create a new process group.
+    os.setsid()
+
     if not args.foreground:
         daemon.DaemonContext().open()
     for sigcode in [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM]:
@@ -126,7 +129,11 @@ def main(args=None):
             node_setup, node_shutdown, node_monitor,
             max_total_price=config.getfloat('Daemon', 'max_total_price')).tell_proxy()
 
-        WatchdogActor.start(config.getint('Daemon', 'watchdog'))
+        WatchdogActor.start(config.getint('Daemon', 'watchdog'),
+                            cloud_node_poller.actor_ref,
+                            arvados_node_poller.actor_ref,
+                            job_queue_poller.actor_ref,
+                            node_daemon.actor_ref)
 
         signal.pause()
         daemon_stopped = node_daemon.actor_ref.actor_stopped.is_set
diff --git a/services/nodemanager/tests/test_failure.py b/services/nodemanager/tests/test_failure.py
index 0a121b6..f543f64 100644
--- a/services/nodemanager/tests/test_failure.py
+++ b/services/nodemanager/tests/test_failure.py
@@ -32,24 +32,24 @@ class BogusActor(arvnodeman.baseactor.BaseNodeManagerActor):
 class ActorUnhandledExceptionTest(unittest.TestCase):
     def test_fatal_error(self):
         for e in (MemoryError(), threading.ThreadError(), OSError(errno.ENOMEM, "")):
-            with mock.patch('os.kill') as kill_mock:
+            with mock.patch('os.killpg') as killpg_mock:
                 act = BogusActor.start(e).tell_proxy()
                 act.doStuff()
                 act.actor_ref.stop(block=True)
-                self.assertTrue(kill_mock.called)
+                self.assertTrue(killpg_mock.called)
 
-    @mock.patch('os.kill')
-    def test_nonfatal_error(self, kill_mock):
+    @mock.patch('os.killpg')
+    def test_nonfatal_error(self, killpg_mock):
         act = BogusActor.start(OSError(errno.ENOENT, "")).tell_proxy()
         act.doStuff()
         act.actor_ref.stop(block=True)
-        self.assertFalse(kill_mock.called)
+        self.assertFalse(killpg_mock.called)
 
 class WatchdogActorTest(unittest.TestCase):
-    @mock.patch('os.kill')
-    def test_time_timout(self, kill_mock):
+    @mock.patch('os.killpg')
+    def test_time_timout(self, killpg_mock):
         act = BogusActor.start(OSError(errno.ENOENT, ""))
-        watch = arvnodeman.baseactor.WatchdogActor.start(1)
+        watch = arvnodeman.baseactor.WatchdogActor.start(1, act)
         watch.stop(block=True)
         act.stop(block=True)
-        self.assertTrue(kill_mock.called)
+        self.assertTrue(killpg_mock.called)

-----------------------------------------------------------------------


hooks/post-receive
-- 




More information about the arvados-commits mailing list