<div dir="ltr"><div dir="ltr" class="gmail_signature" data-smartmail="gmail_signature"><div dir="ltr"><span><div dir="ltr" style="margin-left:0pt" align="left"><span><div dir="ltr" style="margin-left:0pt" align="left">From: Daniel Bailey <<a href="mailto:danielb@meshplusplus.com">danielb@meshplusplus.com</a>><br>Date: Fri, 29 May 2020 17:37:25 -0700<br>Subject: [PATCH] procd: add service instance watchdog<br><br>Added instance watchdog which will eventually either terminate<br>or respawn an instance depending on the instance respawn setting.<br><br>Added service ubus method 'watchdog' which services the watchdog<br>timer and allows updating the instance's watchdog mode.<br><br>Three modes: disabled, passive, active. Presently, only disabled<br>and passive modes are implemented.<br><br>Disabled: cancels the watchdog timer set for a given instance.<br><br>Passive: sets an instance timer which must be serviced or the<br>instance will be stopped/restarted depending upon the instance<br>respawn value when the timer expires.<br><br>Active (to be implemented): requires an additional service 'endpoint'<br>parameter. Upon watchdog timer expiry, procd will query the endpoint<br>to determine whether the instance is alive. 
If the instance does not<br>answer, procd will terminate or respawn the instance depending on<br>the instance respawn setting.<br><br>Signed-off-by: Daniel Bailey <<a href="mailto:danielb@meshplusplus.com">danielb@meshplusplus.com</a>><br>---<br> service/instance.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++<br> service/instance.h | 15 ++++++++++<br> service/service.c  | 68 ++++++++++++++++++++++++++++++++++++++++++++++<br> 3 files changed, 151 insertions(+)<br><br>diff --git a/service/instance.c b/service/instance.c<br>index 142208a..8560a95 100644<br>--- a/service/instance.c<br>+++ b/service/instance.c<br>@@ -65,6 +65,7 @@ enum {<br>       INSTANCE_ATTR_EXTROOT,<br>       INSTANCE_ATTR_OVERLAYDIR,<br>    INSTANCE_ATTR_TMPOVERLAYSIZE,<br>+        INSTANCE_ATTR_WATCHDOG,<br>      __INSTANCE_ATTR_MAX<br> };<br><br>@@ -95,6 +96,7 @@ static const struct blobmsg_policy instance_attr[__INSTANCE_ATTR_MAX] = {<br>     [INSTANCE_ATTR_EXTROOT] = { "extroot", BLOBMSG_TYPE_STRING },<br>      [INSTANCE_ATTR_OVERLAYDIR] = { "overlaydir", BLOBMSG_TYPE_STRING },<br>        [INSTANCE_ATTR_TMPOVERLAYSIZE] = { "tmpoverlaysize", BLOBMSG_TYPE_STRING },<br>+        [INSTANCE_ATTR_WATCHDOG] = { "watchdog", BLOBMSG_TYPE_ARRAY },<br> };<br><br> enum {<br>@@ -546,6 +548,11 @@ instance_start(struct service_instance *in)<br>                 fcntl(epipe[0], F_SETFD, FD_CLOEXEC);<br>        }<br><br>+  if (in->watchdog.mode != INSTANCE_WATCHDOG_MODE_DISABLED) {<br>+               uloop_timeout_set(&in->watchdog.timeout, in->watchdog.freq * 1000);<br>+                DEBUG(2, "Started instance %s::%s watchdog timer : timeout = %d\n", in->srv->name, in->name, in->watchdog.freq);<br>+       }<br>+<br>         service_event("instance.start", in->srv->name, in->name);<br> }<br><br>@@ -693,6 +700,7 @@ instance_exit(struct uloop_process *p, int ret)<br><br>   in->exit_code = instance_exit_code(ret);<br>  uloop_timeout_cancel(&in->timeout);<br>+   
uloop_timeout_cancel(&in->watchdog.timeout);<br>  service_event("instance.stop", in->srv->name, in->name);<br><br>        if (in->halt) {<br>@@ -752,6 +760,19 @@ instance_restart(struct service_instance *in)<br>       uloop_timeout_set(&in->timeout, in->term_timeout * 1000);<br> }<br><br>+static void<br>+instance_watchdog(struct uloop_timeout *t)<br>+{<br>+    struct service_instance *in = container_of(t, struct service_instance, watchdog.timeout);<br>+<br>+ DEBUG(3, "instance %s::%s watchdog timer expired\n", in->srv->name, in->name);<br>+<br>+ if (in->respawn)<br>+          instance_restart(in);<br>+        else<br>+         instance_stop(in, true);<br>+}<br>+<br> static bool string_changed(const char *a, const char *b)<br> {<br>     return !((!a && !b) || (a && b && !strcmp(a, b)));<br>@@ -817,6 +838,12 @@ instance_config_changed(struct service_instance *in, struct service_instance *in<br>    if (!blobmsg_list_equal(&in->errors, &in_new->errors))<br>                 return true;<br><br>+       if (in->watchdog.mode != in_new->watchdog.mode)<br>+                return true;<br>+<br>+      if (in->watchdog.freq != in_new->watchdog.freq)<br>+                return true;<br>+<br>      return false;<br> }<br><br>@@ -1170,6 +1197,36 @@ instance_config_parse(struct service_instance *in)<br>                      DEBUG(3, "unknown syslog facility '%s' given, using default (LOG_DAEMON)\n", blobmsg_get_string(tb[INSTANCE_ATTR_FACILITY]));<br>      }<br><br>+  if (tb[INSTANCE_ATTR_WATCHDOG]) {<br>+            int i = 0;<br>+           uint32_t vals[2] = { 0, 30 };<br>+<br>+             blobmsg_for_each_attr(cur2, tb[INSTANCE_ATTR_WATCHDOG], rem) {<br>+                       if (i >= 2)<br>+                               break;<br>+<br>+                    vals[i] = atoi(blobmsg_get_string(cur2));<br>+                    i++;<br>+         }<br>+<br>+         // TODO(danielb): change mode integers to strings (0 = disabled, 1 = passive, 2 = active)<br>+            if (vals[0] 
>= 0 && vals[0] < __INSTANCE_WATCHDOG_MODE_MAX) {<br>+                  in->watchdog.mode = vals[0];<br>+                      DEBUG(3, "setting watchdog mode (%d)\n", vals[0]);<br>+         } else {<br>+                     in->watchdog.mode = 0;<br>+                    DEBUG(3, "unknown watchdog mode (%d) given, using default (0)\n", vals[0]);<br>+                }<br>+<br>+         if (vals[1] > 0) {<br>+                        in->watchdog.freq = vals[1];<br>+                      DEBUG(3, "setting watchdog timeout (%d)\n", vals[0]);<br>+              } else {<br>+                     in->watchdog.freq = 30;<br>+                   DEBUG(3, "invalid watchdog timeout (%d) given, using default (30)\n", vals[1]);<br>+            }<br>+    }<br>+<br>         return true;<br> }<br><br>@@ -1255,6 +1312,7 @@ instance_free(struct service_instance *in)<br>        instance_free_stdio(in);<br>     uloop_process_delete(&in->proc);<br>      uloop_timeout_cancel(&in->timeout);<br>+   uloop_timeout_cancel(&in->watchdog.timeout);<br>  trigger_del(in);<br>     watch_del(in);<br>       instance_config_cleanup(in);<br>@@ -1308,6 +1366,9 @@ instance_init(struct service_instance *in, struct service *s, struct blob_attr *<br>         blobmsg_list_simple_init(&in->limits);<br>        blobmsg_list_simple_init(&in->errors);<br>        blobmsg_list_simple_init(&in->jail.mount);<br>+<br>+ in->watchdog.timeout.cb = instance_watchdog;<br>+<br>   in->valid = instance_config_parse(in);<br> }<br><br>@@ -1425,5 +1486,12 @@ void instance_dump(struct blob_buf *b, struct service_instance *in, int verbose)<br>    if (verbose && in->trigger)<br>               blobmsg_add_blob(b, in->trigger);<br><br>+       if (in->watchdog.mode != INSTANCE_WATCHDOG_MODE_DISABLED) {<br>+               void *r = blobmsg_open_table(b, "watchdog");<br>+               blobmsg_add_u32(b, "mode", in->watchdog.mode);<br>+          blobmsg_add_u32(b, "timeout", in->watchdog.freq);<br>+               blobmsg_close_table(b, 
r);<br>+   }<br>+<br>         blobmsg_close_table(b, i);<br> }<br>diff --git a/service/instance.h b/service/instance.h<br>index 4400cd4..590f931 100644<br>--- a/service/instance.h<br>+++ b/service/instance.h<br>@@ -23,6 +23,19 @@<br> #define RESPAWN_ERROR   (5 * 60)<br> #define SIGNALLED_OFFSET 128<br><br>+typedef enum instance_watchdog {<br>+        INSTANCE_WATCHDOG_MODE_DISABLED,<br>+     INSTANCE_WATCHDOG_MODE_PASSIVE,<br>+      INSTANCE_WATCHDOG_MODE_ACTIVE,<br>+       __INSTANCE_WATCHDOG_MODE_MAX,<br>+} instance_watchdog_mode_t;<br>+<br>+struct watchdog {<br>+   instance_watchdog_mode_t mode;<br>+       uint32_t freq;<br>+       struct uloop_timeout timeout;<br>+};<br>+<br> struct jail {<br>       bool procfs;<br>         bool sysfs;<br>@@ -94,6 +107,8 @@ struct service_instance {<br>    struct blobmsg_list file;<br>    struct blobmsg_list limits;<br>  struct blobmsg_list errors;<br>+<br>+       struct watchdog watchdog;<br> };<br><br> void instance_start(struct service_instance *in);<br>diff --git a/service/service.c b/service/service.c<br>index fcf0215..d9249a3 100644<br>--- a/service/service.c<br>+++ b/service/service.c<br>@@ -727,6 +727,73 @@ service_get_data(struct ubus_context *ctx, struct ubus_object *obj,<br>        return 0;<br> }<br><br>+enum {<br>+    SERVICE_WATCHDOG_MODE,<br>+       SERVICE_WATCHDOG_TIMEOUT,<br>+    SERVICE_WATCHDOG_NAME,<br>+       SERVICE_WATCHDOG_INSTANCE,<br>+   __SERVICE_WATCHDOG_MAX,<br>+};<br>+<br>+static const struct blobmsg_policy service_watchdog_policy[__SERVICE_WATCHDOG_MAX] = {<br>+     [SERVICE_WATCHDOG_MODE] = { "mode", BLOBMSG_TYPE_INT32 },<br>+  [SERVICE_WATCHDOG_NAME] = { "name", BLOBMSG_TYPE_STRING },<br>+ [SERVICE_WATCHDOG_TIMEOUT] = { "timeout", BLOBMSG_TYPE_INT32 },<br>+    [SERVICE_WATCHDOG_INSTANCE] = { "instance", BLOBMSG_TYPE_STRING },<br>+};<br>+<br>+static int<br>+service_handle_watchdog(struct ubus_context *ctx, struct ubus_object *obj,<br>+                   struct ubus_request_data *req, const char 
*method,<br>+                 struct blob_attr *msg)<br>+{<br>+     struct blob_attr *tb[__SERVICE_WATCHDOG_MAX] = {0};<br>+  struct service *s;<br>+   struct blob_attr *cur;<br>+       struct service_instance *in;<br>+<br>+      blobmsg_parse(service_watchdog_policy, __SERVICE_WATCHDOG_MAX, tb, blobmsg_data(msg), blobmsg_data_len(msg));<br>+        cur = tb[SERVICE_WATCHDOG_NAME];<br>+     if (!cur)<br>+            return UBUS_STATUS_NOT_FOUND;<br>+<br>+     s = avl_find_element(&services, blobmsg_data(cur), s, avl);<br>+      if (!s)<br>+              return UBUS_STATUS_NOT_FOUND;<br>+<br>+     cur = tb[SERVICE_WATCHDOG_INSTANCE];<br>+ if (!cur)<br>+            return UBUS_STATUS_NOT_FOUND;<br>+<br>+     in = vlist_find(&s->instances, blobmsg_data(cur), in, node);<br>+  if (!in) {<br>+           ERROR("instance %s not found\n", blobmsg_get_string(cur));<br>+         return UBUS_STATUS_NOT_FOUND;<br>+        }<br>+<br>+ // TODO(danielb): change mode from u32 to string (0 = disabled, 1 = passive, 2 = active)<br>+     if (tb[SERVICE_WATCHDOG_MODE])<br>+               in->watchdog.mode = blobmsg_get_u32(tb[SERVICE_WATCHDOG_MODE]);<br>+<br>+        if (tb[SERVICE_WATCHDOG_TIMEOUT])<br>+            in->watchdog.freq = blobmsg_get_u32(tb[SERVICE_WATCHDOG_TIMEOUT]);<br>+<br>+     if (in->watchdog.mode == INSTANCE_WATCHDOG_MODE_DISABLED)<br>+         uloop_timeout_cancel(&in->watchdog.timeout);<br>+  else<br>+         uloop_timeout_set(&in->watchdog.timeout, in->watchdog.freq * 1000);<br>+<br>+     blob_buf_init(&b, 0);<br>+    blobmsg_add_string(&b, "name", blobmsg_get_string(tb[SERVICE_WATCHDOG_NAME]));<br>+ blobmsg_add_string(&b, "instance", blobmsg_get_string(tb[SERVICE_WATCHDOG_INSTANCE]));<br>+ blobmsg_add_u32(&b, "mode", in->watchdog.mode);<br>+     blobmsg_add_u32(&b, "timeout", in->watchdog.freq);<br>+<br>+       ubus_send_reply(ctx, req, b.head);<br>+<br>+        return UBUS_STATUS_OK;<br>+}<br>+<br> static int<br> container_handle_console(struct ubus_context 
*ctx, struct ubus_object *obj,<br>                    struct ubus_request_data *req, const char *method,<br>@@ -797,6 +864,7 @@ static struct ubus_method main_object_methods[] = {<br>         UBUS_METHOD("validate", service_handle_validate, validate_policy),<br>         UBUS_METHOD("get_data", service_get_data, get_data_policy),<br>        UBUS_METHOD("state", service_handle_state, service_state_attrs),<br>+   UBUS_METHOD("watchdog", service_handle_watchdog, service_watchdog_policy),<br> };<br><br> static struct ubus_object_type main_object_type =<br>--<br>2.25.1<br><br></div></span><span></span></div></span></div></div></div>