root/daemons/controld/controld_join_client.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. update_dc_expected
  2. do_cl_join_query
  3. do_cl_join_announce
  4. do_cl_join_offer_respond
  5. join_query_callback
  6. set_join_state
  7. update_conn_host_cache
  8. do_cl_join_finalize_respond

   1 /*
   2  * Copyright 2004-2023 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 
  12 #include <crm/crm.h>
  13 #include <crm/cib.h>
  14 #include <crm/msg_xml.h>
  15 #include <crm/common/xml.h>
  16 
  17 #include <pacemaker-controld.h>
  18 
  19 void join_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data);
  20 
  21 extern ha_msg_input_t *copy_ha_msg_input(ha_msg_input_t * orig);
  22 
  23 /*!
  24  * \internal
  25  * \brief Remember if DC is shutting down as we join
  26  *
  27  * If we're joining while the current DC is shutting down, update its expected
  28  * state, so we don't fence it if we become the new DC. (We weren't a peer
  29  * when it broadcast its shutdown request.)
  30  *
  31  * \param[in] msg  A join message from the DC
  32  */
  33 static void
  34 update_dc_expected(const xmlNode *msg)
     /* [previous][next][first][last][top][bottom][index][help] */
  35 {
  36     if ((controld_globals.dc_name != NULL)
  37         && pcmk__xe_attr_is_true(msg, F_CRM_DC_LEAVING)) {
  38         crm_node_t *dc_node = crm_get_peer(0, controld_globals.dc_name);
  39 
  40         pcmk__update_peer_expected(__func__, dc_node, CRMD_JOINSTATE_DOWN);
  41     }
  42 }
  43 
  44 /*      A_CL_JOIN_QUERY         */
  45 /* is there a DC out there? */
  46 void
  47 do_cl_join_query(long long action,
     /* [previous][next][first][last][top][bottom][index][help] */
  48                  enum crmd_fsa_cause cause,
  49                  enum crmd_fsa_state cur_state,
  50                  enum crmd_fsa_input current_input, fsa_data_t * msg_data)
  51 {
  52     xmlNode *req = create_request(CRM_OP_JOIN_ANNOUNCE, NULL, NULL,
  53                                   CRM_SYSTEM_DC, CRM_SYSTEM_CRMD, NULL);
  54 
  55     sleep(1);                   // Give the cluster layer time to propagate to the DC
  56     update_dc(NULL);            /* Unset any existing value so that the result is not discarded */
  57     crm_debug("Querying for a DC");
  58     send_cluster_message(NULL, crm_msg_crmd, req, FALSE);
  59     free_xml(req);
  60 }
  61 
  62 /*       A_CL_JOIN_ANNOUNCE     */
  63 
  64 /* this is kind of a workaround for the fact that we may not be around or
  65  * are otherwise unable to reply when the DC sends out A_DC_JOIN_OFFER_ALL
  66  */
  67 void
  68 do_cl_join_announce(long long action,
     /* [previous][next][first][last][top][bottom][index][help] */
  69                     enum crmd_fsa_cause cause,
  70                     enum crmd_fsa_state cur_state,
  71                     enum crmd_fsa_input current_input, fsa_data_t * msg_data)
  72 {
  73     /* don't announce if we're in one of these states */
  74     if (cur_state != S_PENDING) {
  75         crm_warn("Not announcing cluster join because in state %s",
  76                  fsa_state2string(cur_state));
  77         return;
  78     }
  79 
  80     if (!pcmk_is_set(controld_globals.fsa_input_register, R_STARTING)) {
  81         /* send as a broadcast */
  82         xmlNode *req = create_request(CRM_OP_JOIN_ANNOUNCE, NULL, NULL,
  83                                       CRM_SYSTEM_DC, CRM_SYSTEM_CRMD, NULL);
  84 
  85         crm_debug("Announcing availability");
  86         update_dc(NULL);
  87         send_cluster_message(NULL, crm_msg_crmd, req, FALSE);
  88         free_xml(req);
  89 
  90     } else {
  91         /* Delay announce until we have finished local startup */
  92         crm_warn("Delaying announce of cluster join until local startup is complete");
  93         return;
  94     }
  95 }
  96 
  97 static int query_call_id = 0;
  98 
  99 /*       A_CL_JOIN_REQUEST      */
 100 /* aka. accept the welcome offer */
 101 void
 102 do_cl_join_offer_respond(long long action,
     /* [previous][next][first][last][top][bottom][index][help] */
 103                          enum crmd_fsa_cause cause,
 104                          enum crmd_fsa_state cur_state,
 105                          enum crmd_fsa_input current_input, fsa_data_t * msg_data)
 106 {
 107     cib_t *cib_conn = controld_globals.cib_conn;
 108 
 109     ha_msg_input_t *input = fsa_typed_data(fsa_dt_ha_msg);
 110     const char *welcome_from;
 111     const char *join_id;
 112 
 113     CRM_CHECK(input != NULL, return);
 114 
 115 #if 0
 116     if (we are sick) {
 117         log error;
 118 
 119         /* save the request for later? */
 120         return;
 121     }
 122 #endif
 123 
 124     welcome_from = crm_element_value(input->msg, F_CRM_HOST_FROM);
 125     join_id = crm_element_value(input->msg, F_CRM_JOIN_ID);
 126     crm_trace("Accepting cluster join offer from node %s "CRM_XS" join-%s",
 127               welcome_from, crm_element_value(input->msg, F_CRM_JOIN_ID));
 128 
 129     /* we only ever want the last one */
 130     if (query_call_id > 0) {
 131         crm_trace("Cancelling previous join query: %d", query_call_id);
 132         remove_cib_op_callback(query_call_id, FALSE);
 133         query_call_id = 0;
 134     }
 135 
 136     if (update_dc(input->msg) == FALSE) {
 137         crm_warn("Discarding cluster join offer from node %s (expected %s)",
 138                  welcome_from, controld_globals.dc_name);
 139         return;
 140     }
 141 
 142     update_dc_expected(input->msg);
 143 
 144     query_call_id = cib_conn->cmds->query(cib_conn, NULL, NULL,
 145                                           cib_scope_local|cib_no_children);
 146     fsa_register_cib_callback(query_call_id, strdup(join_id),
 147                               join_query_callback);
 148     crm_trace("Registered join query callback: %d", query_call_id);
 149 
 150     controld_set_fsa_action_flags(A_DC_TIMER_STOP);
 151     controld_trigger_fsa();
 152 }
 153 
 154 void
 155 join_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 156 {
 157     char *join_id = user_data;
 158     xmlNode *generation = create_xml_node(NULL, XML_CIB_TAG_GENERATION_TUPPLE);
 159 
 160     CRM_LOG_ASSERT(join_id != NULL);
 161 
 162     if (query_call_id != call_id) {
 163         crm_trace("Query %d superseded", call_id);
 164         goto done;
 165     }
 166 
 167     query_call_id = 0;
 168     if(rc != pcmk_ok || output == NULL) {
 169         crm_err("Could not retrieve version details for join-%s: %s (%d)",
 170                 join_id, pcmk_strerror(rc), rc);
 171         register_fsa_error_adv(C_FSA_INTERNAL, I_ERROR, NULL, NULL, __func__);
 172 
 173     } else if (controld_globals.dc_name == NULL) {
 174         crm_debug("Membership is in flux, not continuing join-%s", join_id);
 175 
 176     } else {
 177         xmlNode *reply = NULL;
 178 
 179         crm_debug("Respond to join offer join-%s from %s",
 180                   join_id, controld_globals.dc_name);
 181         copy_in_properties(generation, output);
 182 
 183         reply = create_request(CRM_OP_JOIN_REQUEST, generation,
 184                                controld_globals.dc_name, CRM_SYSTEM_DC,
 185                                CRM_SYSTEM_CRMD, NULL);
 186 
 187         crm_xml_add(reply, F_CRM_JOIN_ID, join_id);
 188         crm_xml_add(reply, XML_ATTR_CRM_VERSION, CRM_FEATURE_SET);
 189         send_cluster_message(crm_get_peer(0, controld_globals.dc_name),
 190                              crm_msg_crmd, reply, TRUE);
 191         free_xml(reply);
 192     }
 193 
 194   done:
 195     free_xml(generation);
 196 }
 197 
 198 static void
 199 set_join_state(const char * start_state)
     /* [previous][next][first][last][top][bottom][index][help] */
 200 {
 201     if (pcmk__str_eq(start_state, "standby", pcmk__str_casei)) {
 202         crm_notice("Forcing node %s to join in %s state per configured "
 203                    "environment", controld_globals.our_nodename, start_state);
 204         cib__update_node_attr(controld_globals.logger_out,
 205                               controld_globals.cib_conn, cib_sync_call,
 206                               XML_CIB_TAG_NODES, controld_globals.our_uuid,
 207                               NULL, NULL, NULL, "standby", "on", NULL, NULL);
 208 
 209     } else if (pcmk__str_eq(start_state, "online", pcmk__str_casei)) {
 210         crm_notice("Forcing node %s to join in %s state per configured "
 211                    "environment", controld_globals.our_nodename, start_state);
 212         cib__update_node_attr(controld_globals.logger_out,
 213                               controld_globals.cib_conn, cib_sync_call,
 214                               XML_CIB_TAG_NODES, controld_globals.our_uuid,
 215                               NULL, NULL, NULL, "standby", "off", NULL, NULL);
 216 
 217     } else if (pcmk__str_eq(start_state, "default", pcmk__str_casei)) {
 218         crm_debug("Not forcing a starting state on node %s",
 219                   controld_globals.our_nodename);
 220 
 221     } else {
 222         crm_warn("Unrecognized start state '%s', using 'default' (%s)",
 223                  start_state, controld_globals.our_nodename);
 224     }
 225 }
 226 
 227 static int
 228 update_conn_host_cache(xmlNode *node, void *userdata)
     /* [previous][next][first][last][top][bottom][index][help] */
 229 {
 230     const char *remote = crm_element_value(node, XML_ATTR_ID);
 231     const char *conn_host = crm_element_value(node, PCMK__XA_CONN_HOST);
 232     const char *state = crm_element_value(node, XML_CIB_TAG_STATE);
 233 
 234     crm_node_t *remote_peer = crm_remote_peer_get(remote);
 235 
 236     if (remote_peer == NULL) {
 237         return pcmk_rc_ok;
 238     }
 239 
 240     if (conn_host != NULL) {
 241         pcmk__str_update(&remote_peer->conn_host, conn_host);
 242     }
 243 
 244     if (state != NULL) {
 245         pcmk__update_peer_state(__func__, remote_peer, state, 0);
 246     }
 247 
 248     return pcmk_rc_ok;
 249 }
 250 
 251 /*      A_CL_JOIN_RESULT        */
 252 /* aka. this is notification that we have (or have not) been accepted */
 253 void
 254 do_cl_join_finalize_respond(long long action,
     /* [previous][next][first][last][top][bottom][index][help] */
 255                             enum crmd_fsa_cause cause,
 256                             enum crmd_fsa_state cur_state,
 257                             enum crmd_fsa_input current_input, fsa_data_t * msg_data)
 258 {
 259     xmlNode *tmp1 = NULL;
 260     gboolean was_nack = TRUE;
 261     static gboolean first_join = TRUE;
 262     ha_msg_input_t *input = fsa_typed_data(fsa_dt_ha_msg);
 263     const char *start_state = pcmk__env_option(PCMK__ENV_NODE_START_STATE);
 264 
 265     int join_id = -1;
 266     const char *op = crm_element_value(input->msg, F_CRM_TASK);
 267     const char *welcome_from = crm_element_value(input->msg, F_CRM_HOST_FROM);
 268 
 269     if (!pcmk__str_eq(op, CRM_OP_JOIN_ACKNAK, pcmk__str_casei)) {
 270         crm_trace("Ignoring op=%s message", op);
 271         return;
 272     }
 273 
 274     /* calculate if it was an ack or a nack */
 275     if (pcmk__xe_attr_is_true(input->msg, CRM_OP_JOIN_ACKNAK)) {
 276         was_nack = FALSE;
 277     }
 278 
 279     crm_element_value_int(input->msg, F_CRM_JOIN_ID, &join_id);
 280 
 281     if (was_nack) {
 282         crm_err("Shutting down because cluster join with leader %s failed "
 283                 CRM_XS" join-%d NACK'd", welcome_from, join_id);
 284         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 285         controld_set_fsa_input_flags(R_STAYDOWN);
 286         return;
 287     }
 288 
 289     if (!AM_I_DC
 290         && pcmk__str_eq(welcome_from, controld_globals.our_nodename,
 291                         pcmk__str_casei)) {
 292         crm_warn("Discarding our own welcome - we're no longer the DC");
 293         return;
 294     }
 295 
 296     if (update_dc(input->msg) == FALSE) {
 297         crm_warn("Discarding %s from node %s (expected from %s)",
 298                  op, welcome_from, controld_globals.dc_name);
 299         return;
 300     }
 301 
 302     update_dc_expected(input->msg);
 303 
 304     /* record the node's feature set as a transient attribute */
 305     update_attrd(controld_globals.our_nodename, CRM_ATTR_FEATURE_SET,
 306                  CRM_FEATURE_SET, NULL, FALSE);
 307 
 308     /* send our status section to the DC */
 309     tmp1 = controld_query_executor_state();
 310     if (tmp1 != NULL) {
 311         xmlNode *remotes = NULL;
 312         xmlNode *reply = create_request(CRM_OP_JOIN_CONFIRM, tmp1,
 313                                         controld_globals.dc_name, CRM_SYSTEM_DC,
 314                                         CRM_SYSTEM_CRMD, NULL);
 315 
 316         crm_xml_add_int(reply, F_CRM_JOIN_ID, join_id);
 317 
 318         crm_debug("Confirming join-%d: sending local operation history to %s",
 319                   join_id, controld_globals.dc_name);
 320 
 321         /*
 322          * If this is the node's first join since the controller started on it,
 323          * set its initial state (standby or member) according to the user's
 324          * preference.
 325          *
 326          * We do not clear the LRM history here. Even if the DC failed to do it
 327          * when we last left, removing them here creates a race condition if the
 328          * controller is being recovered. Instead of a list of active resources
 329          * from the executor, we may end up with a blank status section. If we
 330          * are _NOT_ lucky, we will probe for the "wrong" instance of anonymous
 331          * clones and end up with multiple active instances on the machine.
 332          */
 333         if (first_join
 334             && !pcmk_is_set(controld_globals.fsa_input_register, R_SHUTDOWN)) {
 335 
 336             first_join = FALSE;
 337             if (start_state) {
 338                 set_join_state(start_state);
 339             }
 340         }
 341 
 342         send_cluster_message(crm_get_peer(0, controld_globals.dc_name),
 343                              crm_msg_crmd, reply, TRUE);
 344         free_xml(reply);
 345 
 346         if (AM_I_DC == FALSE) {
 347             register_fsa_input_adv(cause, I_NOT_DC, NULL, A_NOTHING, TRUE,
 348                                    __func__);
 349         }
 350 
 351         free_xml(tmp1);
 352 
 353         /* Update the remote node cache with information about which node
 354          * is hosting the connection.
 355          */
 356         remotes = pcmk__xe_match(input->msg, XML_CIB_TAG_NODES, NULL, NULL);
 357         if (remotes != NULL) {
 358             pcmk__xe_foreach_child(remotes, XML_CIB_TAG_NODE, update_conn_host_cache, NULL);
 359         }
 360 
 361     } else {
 362         crm_err("Could not confirm join-%d with %s: Local operation history "
 363                 "failed", join_id, controld_globals.dc_name);
 364         register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL);
 365     }
 366 }

/* [previous][next][first][last][top][bottom][index][help] */