diff -urN Heartbeat-2-1-STABLE-2.1.4/configure.in Heartbeat-2-1-STABLE-2.1.4.mod/configure.in --- Heartbeat-2-1-STABLE-2.1.4/configure.in 2008-08-18 21:32:19.000000000 +0900 +++ Heartbeat-2-1-STABLE-2.1.4.mod/configure.in 2010-05-14 10:29:56.000000000 +0900 @@ -329,6 +329,23 @@ fi done +dnl This OS-based decision-making is poor autotools practice; +dnl feature-based mechanisms are strongly preferred. +dnl +dnl So keep this section to a bare minimum; regard as a "necessary evil". + +case "$host_os" in +*bsd*) + ;; +*solaris*) + ;; +*linux*) + AC_DEFINE_UNQUOTED(ON_LINUX, 1, Compiling for Linux platform) + ;; +darwin*) + ;; +esac + dnl The GNU conventions for installation directories don't always dnl sit well with this software. In particular, GNU's stated: @@ -3017,6 +3034,8 @@ tools/haresources2cib.py \ tools/hb_report \ tools/ocf-tester \ + tools/hb_vmmonitor/Makefile \ + tools/hb_vmmonitor/hb_vmmonitor.spec \ resources/Makefile \ resources/OCF/Makefile \ resources/OCF/.ocf-binaries \ diff -urN Heartbeat-2-1-STABLE-2.1.4/tools/Makefile.am Heartbeat-2-1-STABLE-2.1.4.mod/tools/Makefile.am --- Heartbeat-2-1-STABLE-2.1.4/tools/Makefile.am 2008-08-18 21:32:19.000000000 +0900 +++ Heartbeat-2-1-STABLE-2.1.4.mod/tools/Makefile.am 2010-05-14 10:29:56.000000000 +0900 @@ -31,6 +31,9 @@ hanoarchdir = @HA_NOARCHDATAHBDIR@ gliblib = @GLIBLIB@ +SUBDIRS = hb_vmmonitor +DIST_SUBDIRS = hb_vmmonitor + habin_PROGRAMS = cl_status cl_respawn halib_SCRIPTS = haresources2cib.py hanoarch_DATA = utillib.sh diff -urN Heartbeat-2-1-STABLE-2.1.4/tools/hb_vmmonitor/Makefile.am Heartbeat-2-1-STABLE-2.1.4.mod/tools/hb_vmmonitor/Makefile.am --- Heartbeat-2-1-STABLE-2.1.4/tools/hb_vmmonitor/Makefile.am 1970-01-01 09:00:00.000000000 +0900 +++ Heartbeat-2-1-STABLE-2.1.4.mod/tools/hb_vmmonitor/Makefile.am 2010-05-14 10:29:56.000000000 +0900 @@ -0,0 +1,70 @@ +# +# heartbeat: Linux-HA heartbeat code +# +# Copyright (C) 2001 Michael Moerz +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +MAINTAINERCLEANFILES = Makefile.in + +INCLUDES = -I$(top_builddir)/include -I$(top_srcdir)/include \ + -I$(top_builddir)/linux-ha -I$(top_srcdir)/linux-ha \ + -I$(top_builddir)/libltdl -I$(top_srcdir)/libltdl \ + -I$(top_builddir)/tools -I$(top_srcdir)/tools + +apigid = @HA_APIGID@ +hb_vmmonitordir = $(libdir)/@HB_PKG@/hb-vmmonitor +hanoarchdir = @HA_NOARCHDATAHBDIR@ +gliblib = @GLIBLIB@ + +hb_vmmonitor_PROGRAMS = hb_vmmonitor hb_vmmon_client pingd diskd +hb_vmmonitor_DATA = hb_vmmonitor.conf.sample + +## SOURCES +hb_vmmonitor_SOURCES = hb_vmmonitor.c +hb_vmmonitor_LDADD = \ + $(top_builddir)/lib/clplumbing/libplumb.la \ + $(top_builddir)/lib/crm/common/libcrmcommon.la \ + $(top_builddir)/lib/hbclient/libhbclient.la \ + $(GLIBLIB) \ + $(LIBRT) + +hb_vmmon_client_SOURCES = hb_vmmon_client.c +hb_vmmon_client_LDADD = \ + $(top_builddir)/lib/clplumbing/libplumb.la \ + $(top_builddir)/lib/crm/common/libcrmcommon.la \ + $(top_builddir)/lib/hbclient/libhbclient.la \ + $(GLIBLIB) \ + $(LIBRT) + +pingd_SOURCES = pingd.c +pingd_LDADD = \ + $(top_builddir)/lib/clplumbing/libplumb.la \ + $(top_builddir)/lib/crm/common/libcrmcommon.la \ + $(top_builddir)/lib/hbclient/libhbclient.la \ + $(GLIBLIB) \ + $(LIBRT) + +diskd_SOURCES = diskd.c +diskd_LDADD = \ + $(top_builddir)/lib/clplumbing/libplumb.la \ + $(top_builddir)/lib/crm/common/libcrmcommon.la \ + $(top_builddir)/lib/hbclient/libhbclient.la \ + $(GLIBLIB) \ + $(LIBRT) + +uninstall-local: + rm -fr $(DESTDIR)$(hb_vmmonitordir) + diff -urN Heartbeat-2-1-STABLE-2.1.4/tools/hb_vmmonitor/diskd.c Heartbeat-2-1-STABLE-2.1.4.mod/tools/hb_vmmonitor/diskd.c --- Heartbeat-2-1-STABLE-2.1.4/tools/hb_vmmonitor/diskd.c 1970-01-01 09:00:00.000000000 +0900 +++ Heartbeat-2-1-STABLE-2.1.4.mod/tools/hb_vmmonitor/diskd.c 2010-05-14 10:29:56.000000000 +0900 @@ -0,0 +1,556 @@ +/* last modified 2009/11/04 OKADA */ + + +/* ------------------------------------------------------------------------- + * diskd --- monitors shared disk. + * This applied pingd mechanism to disk monitor. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This software is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2008 NIPPON TELEGRAPH AND TELEPHONE CORPORATION + * + * ------------------------------------------------------------------------- + */ + +/* for HB300 +#include +*/ + +/* for HB213 or HB214 */ +#include + +#include + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#ifdef HAVE_GETOPT_H +# include +#endif + +#define MIN_INTERVAL 1 +#define MAX_INTERVAL 3600 +#define MIN_TIMEOUT 1 +#define MAX_TIMEOUT 3600 +#define MIN_RETRY 0 +#define MAX_RETRY 10 +#define MIN_RETRY_INTERVAL 1 +#define MAX_RETRY_INTERVAL 3600 +#define ERROR -1 +#define normal 1 +#define BLKFLSBUF _IO(0x12,97) /* flush buffer. refer linux/hs.h */ +#define WRITE_DATA 64 + +/* GMainLoop *mainloop = NULL; */ +const char *crm_system_name = "diskd"; + +#define OPTARGS "N:wd:a:i:p:PDV?t:r:I:" + +IPC_Channel *attrd = NULL; +GMainLoop* mainloop = NULL; +const char *diskd_attr = "diskd"; + +const char *device = NULL; /* device name for disk check */ +char *wfile = NULL; /* directory name for disk check (write) 2008.10.24 */ +gboolean wflag = FALSE; +gboolean periodic_flag = FALSE; +int optflag = 0; /* flag for duplicate */ + +int retry = 1; /* disk check retry. default 1 times */ +int retry_interval = 5; /* disk check retry intarval time. default 5sec. */ +int interval = 30; /* disk check interval. default 30sec.*/ +int timeout = 30; /* disk check read func timeout. default 30sec. */ +int old_status = 0; +const char *diskcheck_value = NULL; +int pagesize = 0; +void *ptr = NULL; +void *buf; + +void send_update(void); + +static gboolean +diskd_shutdown(int nsig, gpointer unused) +{ + crm_info("Exiting"); + + if (mainloop != NULL && g_main_is_running(mainloop)) { + g_main_quit(mainloop); + } else { + exit(0); + } + return FALSE; +} + +static void +usage(const char *cmd, int exit_status) +{ + FILE *stream; + + stream = exit_status ? stderr : stdout; + + fprintf(stream, "usage: %s (-N|-w) [-daipPDV?trI]\n", cmd); + fprintf(stream, " Basic options\n"); + fprintf(stream, "\t--%s (-%c) \tDevice name to read\n" + "\t\t\t\t\t* Required option\n", "read-device-name", 'N'); + fprintf(stream, "\t--%s (-%c) \t\t\tWrite check for disk, in /tmp/diskcheck\n" + "\t\t\t\t\t* Required option\n", "write-check", 'w'); + fprintf(stream, "\t--%s (-%c) \tDirectory Name to write\n" + , "write-directory-name", 'd'); + fprintf(stream, "\t--%s (-%c) \tName of the node attribute to set\n" + "\t\t\t\t\t* Default=diskd\n", "attr-name", 'a'); + fprintf(stream, "\t--%s (-%c) \tDisk status check interval time\n" + "\t\t\t\t\t* Default=30 sec.\n", "interval", 'i'); + fprintf(stream, "\t--%s (-%c) \tFile in which to store the process' PID\n" + "\t\t\t\t\t* Default=/tmp/diskd.pid\n", "pid-file", 'p'); + fprintf(stream, "\t--%s (-%c) \t\tUpdate a value regularly\n", "periodic-update", 'P'); + fprintf(stream, "\t--%s (-%c) \t\tRun in daemon mode\n", "daemonize", 'D'); + fprintf(stream, "\t--%s (-%c) \t\t\tRun in verbose mode\n", "verbose", 'V'); + fprintf(stream, "\t--%s (-%c) \t\t\tThis text\n", "help", '?'); + fprintf(stream, " Note: -N, -w options cannot be specified at the same time.\n\n"); + fprintf(stream, " Advanced options\n"); + fprintf(stream, "\t--%s (-%c) \tDisk status check timeout for select function\n" + "\t\t\t\t\t* Default=30 sec.\n", "check-timeout", 't'); + fprintf(stream, "\t--%s (-%c) \t\tDisk status check retry\n" + "\t\t\t\t\t* Default=1 times\n", "retry", 'r'); + fprintf(stream, "\t--%s (-%c) \tDisk status check retry interval time\n" + "\t\t\t\t\t* Default=5 sec.\n", "retry-interval", 'I'); + + fflush(stream); + + exit(exit_status); +} + +static gboolean +check_old_status(int new_status) +{ + + if (new_status != ERROR && new_status != normal) { + crm_warn("non-defined status, new_status = %d", new_status); + return FALSE; + } + + if (old_status != new_status) { + if (new_status == ERROR) { + diskcheck_value = "ERROR"; + } else { + diskcheck_value = "normal"; + } + crm_warn("disk status is changed, new_status = %s", diskcheck_value); + send_update(); + old_status = new_status; + return TRUE; + } + + if (periodic_flag) { + crm_debug("disk status send update, status = %s", diskcheck_value); + send_update(); + } + + return TRUE; +} + +static int diskcheck_wt(gpointer data) +{ + int fd = -1; + int err, i; + int select_err; + struct timeval timeout_tv; + fd_set write_fd_set; + + crm_debug_2("diskcheck_wt start"); + for (i = 0; i <= retry; i++) { + if ( i !=0 ) { + sleep(retry_interval); + } + + /* file open */ + fd = open(wfile, O_WRONLY | O_CREAT | O_DSYNC | O_NONBLOCK, 0); + + if (fd == -1) { + crm_err("Could not open %s", wfile); + cl_perror("%s", wfile); + continue; /* failed to open file. try re-open */ + } + + while( 1 ) { + err = write(fd, buf, WRITE_DATA); /* data write */ + if (err == WRITE_DATA) { + crm_debug_2("data writing is OK"); + close(fd); + if (-1 == remove((const char *)wfile)) { + crm_warn("failed to remove file %s", wfile); + } + check_old_status(normal); + return normal; /* OK */ + } else if (err != WRITE_DATA && errno == EAGAIN) { + crm_warn("write function return errno:EAGAIN"); + FD_ZERO(&write_fd_set); + FD_SET(fd, &write_fd_set); + timeout_tv.tv_sec = timeout; + timeout_tv.tv_usec = 0; + select_err = select(fd+1, NULL, &write_fd_set, NULL, + &timeout_tv); + if (select_err == 1) { + crm_warn("select ok, write again"); + continue; /* retly write */ + } else if (select_err == -1) { + crm_err("select failed on file %s", wfile); + close(fd); + if (-1 == remove((const char *)wfile)) { + crm_warn("failed to remove file %s", wfile); + } + break; /* failed to select */ + } else { + crm_err("select time out on file %s", wfile); + close(fd); + if (-1 == remove((const char *)wfile)) { + crm_warn("failed to remove file %s", wfile); + } + break; /* failed to select */ + } + } else { + crm_err("Could not write to file %s", wfile); + cl_perror("%s", wfile); + close(fd); + if (-1 == remove((const char *)wfile)) { + crm_warn("failed to remove file %s", wfile); + } + break; /* failed to write */ + } + } + } + /* after for loop */ + + crm_warn("Error(s) occurred in diskcheck_wt function."); + check_old_status(ERROR); + + return ERROR; + + /* file close */ + +} +static int diskcheck(gpointer data) +{ + int i; + int fd = -1; + int err; + int select_err; + struct timeval timeout_tv; + fd_set read_fd_set; + + crm_debug_2("diskcheck start"); + + for (i = 0; i <= retry; i++) { + if ( i !=0 ) { + sleep(retry_interval); + } + + fd = open((const char *)device, O_RDONLY | O_NONBLOCK, 0); + if (fd == -1) { + crm_err("Could not open device %s", device); + continue; + } + + err = ioctl(fd, BLKFLSBUF, 0); + if (err != 0) { + crm_err("iotcl error, Could not flush baffer"); + close(fd); + continue; + } + + while( 1 ) { + err = read(fd, buf, pagesize); + if (err == pagesize) { + crm_debug_2("reading form data is OK"); + close(fd); + check_old_status(normal); + return normal; + } else if (err != pagesize && errno == EAGAIN) { + crm_warn("read function return errno:EAGAIN"); + FD_ZERO(&read_fd_set); + FD_SET(fd, &read_fd_set); + timeout_tv.tv_sec = timeout; + timeout_tv.tv_usec = 0; + select_err = select(fd+1, &read_fd_set, NULL, NULL, &timeout_tv); + if (select_err == 1) { + crm_warn("select ok, read again"); + continue; + } else if (select_err == -1) { + crm_err("select failed on device %s", device); + close(fd); + break; + } + } else { + crm_err("Could not read from device %s", device); + close(fd); + break; + } + } + } + + crm_warn("Error(s) occurred in diskcheck function."); + check_old_status(ERROR); + + return ERROR; +} + + +int +main(int argc, char **argv) +{ + int lpc; + int argerr = 0; + int flag; + char *pid_file = NULL; + gboolean daemonize = FALSE; + +#ifdef HAVE_GETOPT_H + int option_index = 0; + static struct option long_options[] = { + /* Top-level Options */ + {"verbose", 0, 0, 'V'}, + {"help", 0, 0, '?'}, + {"pid-file", 1, 0, 'p'}, + {"attr-name", 1, 0, 'a'}, + {"read-device-name", 1, 0, 'N'}, + {"daemonize", 0, 0, 'D'}, + {"interval", 1, 0, 'i'}, + {"retry", 1, 0, 'r'}, + {"retry-interval", 1, 0, 'I'}, + {"check-timeout", 1, 0, 't'}, + {"write-check", 0, 0, 'w'}, /* add option 2008.10.24 */ + {"write-directory-name", 1, 0, 'd'}, /* add option 2009.4.17 */ + {"periodic-update", 0, 0, 'P'}, /* add option 2009.4.23 */ + + {0, 0, 0, 0} + }; +#endif + pid_file = crm_strdup("/tmp/diskd.pid"); + crm_system_name = basename(argv[0]); + + G_main_add_SignalHandler( + G_PRIORITY_HIGH, SIGTERM, diskd_shutdown, NULL, NULL); + + crm_log_init(basename(argv[0]), LOG_INFO, TRUE, FALSE, argc, argv); + + /* check user. user shuld be root.*/ + if (strcmp("root", (const gchar *)g_get_user_name()) != 0) { + crm_err("permission denied. diskd should be executed by root.\n"); + printf ("permission denied. diskd should be executed by root.\n"); + exit(LSB_EXIT_GENERIC); + } + + while (1) { +#ifdef HAVE_GETOPT_H + flag = getopt_long(argc, argv, OPTARGS, + long_options, &option_index); +#else + flag = getopt(argc, argv, OPTARGS); +#endif + if (flag == -1) + break; + + switch(flag) { + case 'V': + cl_log_enable_stderr(TRUE); + alter_debug(DEBUG_INC); + break; + case 'p': + pid_file = crm_strdup(optarg); + break; + case 'a': + diskd_attr = crm_strdup(optarg); + break; + case 'r': + retry = crm_parse_int(optarg, "1"); + if ((retry == 0) && (strcmp(optarg, "0") != 0)) { + argerr++; + break; + } + if ((retry < MIN_RETRY) || (retry > MAX_RETRY)) + ++argerr; + break; + case 'I': + retry_interval = crm_parse_int(optarg, "1"); + if ((retry_interval < MIN_RETRY_INTERVAL) || (retry_interval > MAX_RETRY_INTERVAL)) + ++argerr; + break; + case 'i': + interval = crm_parse_int(optarg, "1"); + if ((interval < MIN_INTERVAL) || (interval > MAX_INTERVAL)) + ++argerr; + break; + case 't': + timeout = crm_parse_int(optarg, "1"); + if ((timeout < MIN_TIMEOUT) || (timeout > MAX_TIMEOUT)) + ++argerr; + break; + case 'N': + device = crm_strdup(optarg); + optflag++; /* add 2008.20.24 */ + break; + case 'D': + daemonize = TRUE; + break; + case 'w': /* add option 2008.10.24 */ + wflag = TRUE; + optflag++; + break; + case 'd': /* add option 2009.4.17 */ + crm_malloc0(wfile, PATH_MAX); + g_snprintf(wfile, PATH_MAX, "%s/diskcheck", optarg); + break; + case 'P': /* add option 2009.4.23 */ + periodic_flag = TRUE; + break; + case '?': + usage(crm_system_name, LSB_EXIT_GENERIC); + break; + default: + printf ("Argument code 0%o (%c) is not (?yet?) supported\n", flag, flag); + crm_err("Argument code 0%o (%c) is not (?yet?) supported\n", flag, flag); + ++argerr; + break; + } + } + + if (optind < argc) { + crm_err("non-option ARGV-elements: "); + printf ("non-option ARGV-elements: "); + while (optind < argc) { + crm_err("%s ", argv[optind]); + printf("%s ", argv[optind]); + optind++; + } + printf("\n"); + argerr ++; + } + if ((argerr) || (optflag >= 2) || (device == NULL && wflag == FALSE)) { /* add optflag 2008.10.24 */ + /* "-N" + "-w" pattern and not "-N" + not "-w"*/ + usage(crm_system_name, LSB_EXIT_GENERIC); + } + if ((device != NULL) && (wfile != NULL)) { + /* "-N" + "-d" pattern */ + crm_warn("\"d\" option was ignored, because N option was specified."); + } + + crm_make_daemon(crm_system_name, daemonize, pid_file); + + for(lpc = 0; attrd == NULL && lpc < 30; lpc++) { + crm_debug("hb_vmmonitor registration attempt: %d", lpc); + sleep(5); + attrd = init_client_ipc_comms_nodispatch("hb_vmmonitor"); + } + + if(attrd == NULL) { + printf ("hb_vmmonitor registration failed\n"); + crm_err("hb_vmmonitor registration failed"); + cl_flush_logs(); + exit(LSB_EXIT_GENERIC); + } + + + if ( wflag ) { /* writer */ + if (wfile == NULL) { + crm_malloc0(wfile, PATH_MAX); + g_snprintf(wfile, PATH_MAX, "/tmp/diskcheck"); + } + buf = (void *)malloc(WRITE_DATA); + if (buf == NULL) { + crm_err("Could not allocate memory"); + check_old_status(ERROR); + exit(LSB_EXIT_GENERIC); + } + diskcheck_wt(NULL); + Gmain_timeout_add(interval*1000, diskcheck_wt, NULL); + } else { /* reader */ + pagesize = getpagesize(); + ptr = (void *)malloc(2 * pagesize); + if (ptr == NULL) { + crm_err("Could not allocate memory"); + check_old_status(ERROR); + exit(LSB_EXIT_GENERIC); + } + buf = (void *)(((u_long)ptr + pagesize) & ~(pagesize-1)); + diskcheck(NULL); + Gmain_timeout_add(interval*1000, diskcheck, NULL); + } + + crm_info("Starting %s", crm_system_name); + mainloop = g_main_new(FALSE); + g_main_run(mainloop); + + free(ptr); + crm_free(pid_file); + if (wfile != NULL) { + crm_free(wfile); + } + crm_info("Exiting %s", crm_system_name); + return 0; +} + +void +send_update() +{ + HA_Message *update = ha_msg_new(4); + ha_msg_add(update, F_TYPE, "hb_vmmonitor"); + ha_msg_add(update, F_ORIG, crm_system_name); + ha_msg_add(update, F_ATTRD_TASK, "update"); + ha_msg_add(update, F_ATTRD_ATTRIBUTE, diskd_attr); + + ha_msg_add(update, F_ATTRD_VALUE, diskcheck_value); + + if(send_ipc_message(attrd, update) == FALSE) { + crm_err("Could not send update"); + exit(1); + } + crm_msg_del(update); + +/* for HB300 + xmlNode *update = create_xml_node(NULL, __FUCTION__); + crm_xml_add(update, F_TYPE, "hb_vmmonitor"); + crm_xml_add(update, F_ORIG, crm_system_name); + crm_xml_add(update, F_ATTRD_TASK, "update"); + crm_xml_add(update, F_ATTRD_ATTRIBUTE, diskd_attr); + + crm_xml_add(update, F_ATTRD_VALUE, diskcheck_value); + + if(send_ipc_message(attrd, update) == FALSE) { + crm_err("Could not send update"); + exit(1); + } + free_xml(update); +*/ +} diff -urN Heartbeat-2-1-STABLE-2.1.4/tools/hb_vmmonitor/hb_vmmon_client.c Heartbeat-2-1-STABLE-2.1.4.mod/tools/hb_vmmonitor/hb_vmmon_client.c --- Heartbeat-2-1-STABLE-2.1.4/tools/hb_vmmonitor/hb_vmmon_client.c 1970-01-01 09:00:00.000000000 +0900 +++ Heartbeat-2-1-STABLE-2.1.4.mod/tools/hb_vmmonitor/hb_vmmon_client.c 2010-05-14 10:29:56.000000000 +0900 @@ -0,0 +1,678 @@ +/* + * hb_vmmon_client: Heartbeat Device Status Monitor Client for Virtual Machine + * + * Copyright (C) 2009 NIPPON TELEGRAPH AND TELEPHONE CORPORATION + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This software is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include + +#include + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#ifdef HAVE_GETOPT_H +# include +#endif + +#define OPTARGS "s:p:a:i:r:t:P:DV?" +#define MIN_INTERVAL 1 +#define MAX_INTERVAL 86400 +#define DEFAULT_INTERVAL 10 +#define MIN_TIMEOUT 1 +#define MAX_TIMEOUT 30 +#define DEFAULT_TIMEOUT 5 +#define MIN_RETRY 0 +#define MAX_RETRY 10 +#define DEFAULT_RETRY 0 +#define MAX_WARN_CNT 10 +#define MIN_PORTNUM 1024 +#define MAX_PORTNUM 65535 +#define ATTRNAME_PREFIX "vmmon" +#define STATE_CONN_ERROR "connerror" +#define STATE_ATTR_ERROR "attrerror" +#define STATE_NORMAL "normal" +#define MAX_BUFFLEN 256 + +enum vmmon_host_state +{ + INIT, + NORMAL, + MONSTOP, + SHUTDOWN +}; + +typedef struct host_s +{ + int fd; /* socket */ + char *port; + char *host_name; +} host_t; + +typedef struct send_msg_s +{ + char attrName[MAX_BUFFLEN]; + +} send_msg_t; + +typedef struct recv_msg_s +{ + int vmmon_host_state; + int check_pid; + char attrName[MAX_BUFFLEN]; + char attrValue[MAX_BUFFLEN]; + +} recv_msg_t; + +typedef struct device_info_s +{ + char attrName[MAX_BUFFLEN]; + char attrValue[MAX_BUFFLEN]; + char lastAttrValue[MAX_BUFFLEN]; + char client_stateAttrName[MAX_BUFFLEN]; + char client_state[MAX_BUFFLEN]; + char last_client_state[MAX_BUFFLEN]; + +} device_info_t; + + +/* GMainLoop *mainloop = NULL; */ +const char *crm_system_name = "hb_vmmon_client"; + +IPC_Channel *attrd = NULL; +GMainLoop* mainloop = NULL; +device_info_t *devinfo = NULL; +host_t *dom0host = NULL; +int interval = DEFAULT_INTERVAL; /* hb_vmmon_client check interval. default 10sec */ +int timeout = DEFAULT_TIMEOUT; /* hb_vmmon_client check msg recive timeout. default 5sec */ +int retry = DEFAULT_RETRY; /* hb_vmmon_client check retry. default is do not retry */ +int retry_interval = 5; /* hb_vmmon_client check retry intarval time. anytime 5sec. */ +int warn_count = 0; + +static gboolean check_old_status(void); +static gboolean statuscheck(gpointer data); +static int connect_host(host_t *host); +static int communicate_server(gpointer data); +void send_update(char* attrName, char* attrValue); + +static gboolean +hb_vmmon_client_shutdown(int nsig, gpointer unused) +{ + crm_info("Exiting"); + + if (mainloop != NULL && g_main_loop_is_running(mainloop)) { + g_main_loop_quit(mainloop); + } else { + exit(0); + } + return FALSE; +} + +static void +usage(const char *cmd, int exit_status) +{ + FILE *stream; + + stream = exit_status ? stderr : stdout; + + fprintf(stream, "usage: %s -s -p -a [-irtPVD?]\n", cmd); + fprintf(stream, " Basic options\n"); + fprintf(stream, "\t--%s (-%c) \tIP address of hb_vmmonitor server\n" + "\t\t\t\t\t* required option\n", "server-address", 's'); + fprintf(stream, "\t--%s (-%c) \tPort number for hb_vmmonitor server (%d - %d)\n" + "\t\t\t\t\t* required option\n", "server-port", 'p', MIN_PORTNUM, MAX_PORTNUM); + fprintf(stream, "\t--%s (-%c) \tAttribute name which represents a device's status\n" + "\t\t\t\t\t* required option\n", "attr-name", 'a'); + fprintf(stream, "\t--%s (-%c) \tDevice status check interval time (%d - %d sec)\n" + "\t\t\t\t\t* Default=%d sec.\n", "interval", 'i', MIN_INTERVAL, MAX_INTERVAL, DEFAULT_INTERVAL); + fprintf(stream, "\t--%s (-%c) \tThe number of times to reconnect to a hb_vmmonitor server (%d - %d times)\n" + "\t\t\t\t\t* Default=%d times.\n", "retry", 'r', MIN_RETRY, MAX_RETRY, DEFAULT_RETRY); + fprintf(stream, "\t--%s (-%c) \tTimeout to wait a reply from hb_vmmonitor server (%d - %d sec)\n" + "\t\t\t\t\t* Default=%d sec.\n", "recv-timeout", 't', MIN_TIMEOUT, MAX_TIMEOUT, DEFAULT_TIMEOUT); + fprintf(stream, "\t--%s (-%c) \t\t\tRun in verbose mode\n", "verbose", 'V'); + fprintf(stream, "\t--%s (-%c) \tFile in which to store the process' PID\n" + "\t\t\t\t\t* Default=/tmp/hb_vmmon_client.pid\n", "pid-file", 'P'); + fprintf(stream, "\t--%s (-%c) \t\tRun in daemon mode\n", "daemonize", 'D'); + fprintf(stream, "\t--%s (-%c) \t\t\tThis text\n", "help", '?'); + + fflush(stream); + + exit(exit_status); +} + +static gboolean +check_old_status() +{ + /* check monitor process attribute value. */ + if (safe_str_neq(devinfo->lastAttrValue, devinfo->attrValue)) { + crm_info("attribute value of attribute name[%s] is updated: [%s] => [%s].", + devinfo->attrName, devinfo->lastAttrValue, devinfo->attrValue); + send_update(devinfo->attrName, devinfo->attrValue); + /* change to last attribute value. */ + g_strlcpy(devinfo->lastAttrValue, devinfo->attrValue, + sizeof(devinfo->lastAttrValue) + 1); + crm_debug_2("Set lastValue to [%s].", devinfo->attrValue); + } else { + crm_debug_2("The value of [%s] does not change from [%s].", + devinfo->attrName, devinfo->lastAttrValue); + } + + /* check client state attribute value. */ + if (safe_str_neq(devinfo->last_client_state, devinfo->client_state)) { + crm_info("attribute value of attribute name[%s] is updated: [%s] => [%s].", + devinfo->client_stateAttrName, + devinfo->last_client_state, devinfo->client_state); + send_update(devinfo->client_stateAttrName, devinfo->client_state); + g_strlcpy(devinfo->last_client_state, devinfo->client_state, + sizeof(devinfo->last_client_state) + 1); + crm_debug_2("Set lastValue to [%s].", devinfo->last_client_state); + } else { + crm_debug_2("The value of [%s] does not change from [%s].", + devinfo->client_stateAttrName, devinfo->last_client_state); + } + + return FALSE; +} + +static gboolean statuscheck(gpointer data) +{ + int i, rc; + + crm_debug("statuscheck start"); + + for (i = 0; i <= retry; i++) { + + if (i != 0) { + crm_debug("reconnect it to a host[%s] %dsec later. retry count=%d", + dom0host->host_name, retry_interval, i); + sleep(retry_interval); + } + + rc = connect_host(dom0host); + if (rc == 0) { + /* success server connect. */ + rc = communicate_server(data); + if (rc == 0) { + /* success server communicate. */ + break; + } + close(dom0host->fd); + } + + switch(rc) { + case -1: + g_strlcpy(devinfo->client_state, STATE_CONN_ERROR, + sizeof(devinfo->client_state) + 1); + break; + case -2: + g_strlcpy(devinfo->client_state, STATE_ATTR_ERROR, + sizeof(devinfo->client_state) + 1); + break; + } + } + + check_old_status(); + close(dom0host->fd); + + return TRUE; +} + +static int connect_host(host_t *host) +{ + int ret_ga; + int lock; + struct addrinfo hints; + struct addrinfo *res = NULL; + struct addrinfo *ai = NULL; + fd_set writefds; + struct timeval conn_timeout; + int optval; + socklen_t optlen; + int rc; + + /* for getaddrinfo() */ + bzero(&hints, sizeof(struct addrinfo)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_NUMERICSERV; + + ret_ga = getaddrinfo(host->host_name, host->port, &hints, &res); + + if (ret_ga) { + crm_warn("failed to get host's network address information.: %s", + gai_strerror(ret_ga)); + freeaddrinfo(res); /* No longer needed */ + return -1; + } + + /* select to can connect protcol */ + for (ai=res; ai; ai=ai->ai_next) { + + host->fd = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol); + + if(host->fd < 0) { + crm_err("failed to open socket to %s", host->host_name); + continue; + } + + /* set non-blocking mode */ + if ((lock = fcntl(host->fd, F_GETFL, NULL)) < 0) { + cl_perror("fcntl(F_GETFL) call failed"); + /* failed to connect. */ + goto cleanup_close; + } + if (fcntl(host->fd, F_SETFL, lock | O_NONBLOCK) < 0) { + cl_perror("fcntl(F_SETFL) call failed"); + /* failed to connect. */ + goto cleanup_close; + } + + /* trying to connect with timeout */ + if (connect(host->fd, res->ai_addr, res->ai_addrlen) < 0) { + if (errno != EINPROGRESS) { + cl_perror("connect(2) call failed"); + /* failed to connect. */ + goto cleanup_close; + } + crm_debug_2("EINPROGRESS in connect(). selecting..."); + + conn_timeout.tv_sec = timeout; + conn_timeout.tv_usec = 0; + FD_ZERO(&writefds); + FD_SET(host->fd, &writefds); + + rc = select(host->fd+1, NULL, &writefds, NULL, &conn_timeout); + if (rc < 0) { + if (errno == EINTR) { + crm_warn("select(2) call failed: %s", + g_strerror(errno)); + /* failed to connect. */ + goto cleanup_close; + } + cl_perror("select(2) call failed"); + goto cleanup_close; + } else if (rc == 0) { + crm_info("failed to connect (timed out)"); + /* failed to connect. */ + goto cleanup_close; + } + + /* socket selected for write */ + optlen = sizeof(int); + if (getsockopt(host->fd, SOL_SOCKET, + SO_ERROR, &optval, &optlen) < 0) { + cl_perror("getsockopt(2) call failed"); + /* failed to connect. */ + goto cleanup_close; + } + if (optval) { + crm_warn("failed to connect to %s (port:%s): %s", + host->host_name, host->port, g_strerror(optval)); + /* failed to connect. */ + goto cleanup_close; + } + /* connect success!! */ + break; + + } else { + /* connect success!! */ + break; + } + cleanup_close: + close(host->fd); + } + + if (ai == NULL) { + crm_err("Could not connect"); + freeaddrinfo(res); /* No longer needed */ + return -1; + } + + freeaddrinfo(res); /* No longer needed */ + crm_debug("Opened connection to %s", host->host_name); + + return 0; +} + +static int +communicate_server(gpointer data) +{ + int recvSize = 0; + send_msg_t *sendMsg = (send_msg_t *)data; + recv_msg_t *recvMsg = NULL; + fd_set rfds; + int select_ret; + struct timeval rcv_timeout; + + /* connection succeeded. next send to massege. */ + crm_debug_3("send msg [%s]", sendMsg->attrName); + + if (-1 == send(dom0host->fd, sendMsg, sizeof(send_msg_t), + MSG_DONTWAIT|MSG_NOSIGNAL)) { + cl_perror("failed to send message"); + return -1; + } + + rcv_timeout.tv_sec = timeout; + rcv_timeout.tv_usec = 0; + FD_ZERO(&rfds); + FD_SET(dom0host->fd, &rfds); + select_ret = select(FD_SETSIZE, &rfds, (fd_set *)NULL, + (fd_set *)NULL, &rcv_timeout); + + if (select_ret == -1) { + /* failure of failure of select that is the connection */ + cl_perror("failed to select()"); + return -1; + } else if (select_ret) { + /* ready to receive a message. */ + crm_malloc0(recvMsg, sizeof(recv_msg_t)); + + if ((recvSize = recv(dom0host->fd, recvMsg, sizeof(recv_msg_t), 0)) == -1) { + cl_perror("failed to recive message"); + crm_free(recvMsg); + return -1; + } + + if (recvSize == 0) { + crm_err("attribute name[%s], that is not registered with a server.", + sendMsg->attrName); + crm_free(recvMsg); + return -2; + } + + crm_debug_3("monitor process pid [%d] attrval = [%s] msgsize = [%d]", + recvMsg->check_pid, recvMsg->attrValue, recvSize); + + if (safe_str_neq(devinfo->attrName, recvMsg->attrName)) { + /* normal does not happen */ + crm_err("recv attribute name[%s] is not equal managemant" + " attribute name[%s].", + recvMsg->attrName, devinfo->attrName); + crm_free(recvMsg); + return -2; + } + + if (safe_str_neq(recvMsg->attrValue, "") && recvMsg->check_pid > 0) { + /* success pattern */ + g_strlcpy(devinfo->attrValue, recvMsg->attrValue, + sizeof(devinfo->attrValue) + 1); + g_strlcpy(devinfo->client_state, STATE_NORMAL, + sizeof(devinfo->client_state) + 1); + warn_count = 0; + + } else if (recvMsg->vmmon_host_state != NORMAL) { + /* + * hb_vmmonitor is not monitoring the device. + * During it re-reads config file or something. + */ + if (recvMsg->vmmon_host_state == MONSTOP) { + crm_debug("hb_vmmonitor reads a config file again."); + } else if (recvMsg->vmmon_host_state == SHUTDOWN) { + crm_debug("hb_vmmonitor is shutting it down now."); + } + warn_count = 0; + + } else { + + if (safe_str_eq(recvMsg->attrValue, "") && recvMsg->check_pid > 0) { + crm_debug("monitoring process is running. but" + " it does not get device status yet."); + + warn_count++; + if (warn_count >= MAX_WARN_CNT) { + crm_warn("The setting of the monitor process[%s], may be wrong.", + recvMsg->attrName); + warn_count = MAX_WARN_CNT; + } + } + + if (recvMsg->check_pid == 0) { + crm_debug("monitoring process is not running."); + } + + crm_info("The device of attribute name[%s] is not yet monitored.", + recvMsg->attrName); + } + crm_free(recvMsg); + + } else { + /* connection is timeout. */ + crm_err("failed to receive message from hb_vmmonitor (timed out)."); + return -1; + } + + return 0; +} + +int +main(int argc, char **argv) +{ + int lpc; + int argerr = 0; + int flag; + char *pid_file = NULL; + gboolean daemonize = FALSE; + send_msg_t *sendMsg = NULL; + +#ifdef HAVE_GETOPT_H + int option_index = 0; + static struct option long_options[] = { + /* Top-level Options */ + {"verbose", 0, 0, 'V'}, + {"pid-file", 1, 0, 'P'}, + {"attr-name", 1, 0, 'a'}, + {"interval", 1, 0, 'i'}, + {"retry", 1, 0, 'r'}, + {"recv-timeout", 1, 0, 't'}, + {"server-address", 1, 0, 's'}, + {"server-port", 1, 0, 'p'}, + {"daemonize", 0, 0, 'D'}, + {"help", 0, 0, '?'}, + + {0, 0, 0, 0} + }; +#endif + pid_file = g_strdup("/tmp/hb_vmmon_client.pid"); + crm_system_name = basename(argv[0]); + crm_malloc0(dom0host, sizeof(host_t)); + dom0host->port = NULL; + crm_malloc0(sendMsg, sizeof(send_msg_t)); + crm_malloc0(devinfo, sizeof(device_info_t)); + g_strlcpy(devinfo->client_state, STATE_NORMAL, + sizeof(devinfo->client_state) + 1); + + G_main_add_SignalHandler( + G_PRIORITY_HIGH, SIGTERM, hb_vmmon_client_shutdown, NULL, NULL); + + crm_log_init(basename(argv[0]), LOG_INFO, TRUE, FALSE, argc, argv); + + /* check user. user shuld be root.*/ + if (strcmp("root", (const gchar *)g_get_user_name()) != 0) { + crm_err("permission denied. hb_vmmon_client should be executed by root.\n"); + printf ("permission denied. hb_vmmon_client should be executed by root.\n"); + exit(LSB_EXIT_GENERIC); + } + + while (1) { +#ifdef HAVE_GETOPT_H + flag = getopt_long(argc, argv, OPTARGS, long_options, &option_index); +#else + flag = getopt(argc, argv, OPTARGS); +#endif + if (flag == -1) + break; + + switch(flag) { + case 'V': + cl_log_enable_stderr(TRUE); + alter_debug(DEBUG_INC); + break; + case 'P': + g_free(pid_file); + pid_file = g_strdup(optarg); + break; + case 'a': + g_strlcpy(sendMsg->attrName, optarg, + sizeof(sendMsg->attrName) + 1); + g_strlcpy(devinfo->attrName, optarg, + sizeof(devinfo->attrName) + 1); + g_snprintf(devinfo->client_stateAttrName, + sizeof(devinfo->client_stateAttrName), "%s_%s", + ATTRNAME_PREFIX, optarg); + break; + case 'i': + /* can set check interval for 1~3600 */ + interval = crm_parse_int(optarg, "1"); + if ((interval < MIN_INTERVAL) || (interval > MAX_INTERVAL)) + ++argerr; + break; + case 'r': + /* can set retry count for 0~10 */ + retry = crm_parse_int(optarg, "0"); + if ((retry == 0) && (strcmp(optarg, "0") != 0)) { + argerr++; + break; + } + if ((retry < MIN_RETRY) || (retry > MAX_RETRY)) + ++argerr; + break; + case 't': + /* can set recive timeout for 1~30sec */ + timeout = crm_parse_int(optarg, "1"); + if ((timeout < MIN_TIMEOUT) || (timeout > MAX_TIMEOUT)) + ++argerr; + break; + case 's': + dom0host->host_name = g_strdup(optarg); + break; + case 'p': + /* can set server port number for 1024~65535 */ + if (crm_atoi(optarg, "0") < MIN_PORTNUM || + crm_atoi(optarg, "0") > MAX_PORTNUM) { + usage(crm_system_name, 1); + } + dom0host->port = g_strdup(optarg); + break; + case 'D': + daemonize = TRUE; + break; + case '?': + usage(crm_system_name, LSB_EXIT_GENERIC); + break; + default: + printf ("Argument code 0%o (%c) is not (?yet?) supported\n", flag, flag); + crm_err("Argument code 0%o (%c) is not (?yet?) supported\n", flag, flag); + ++argerr; + break; + } + } + + if (optind < argc) { + crm_err("non-option ARGV-elements: "); + printf ("non-option ARGV-elements: "); + + while (optind < argc) { + crm_err("%s ", argv[optind]); + printf("%s ", argv[optind]); + optind++; + } + + printf("\n"); + argerr ++; + } + + if ((argerr) + || dom0host->host_name == NULL + || dom0host->port == NULL + || safe_str_eq(sendMsg->attrName, "")) { + usage(crm_system_name, LSB_EXIT_GENERIC); + } + + crm_make_daemon(crm_system_name, daemonize, pid_file); + + for(lpc = 0; attrd == NULL && lpc < 30; lpc++) { + crm_debug("attrd registration attempt: %d", lpc); + sleep(5); + attrd = init_client_ipc_comms_nodispatch(T_ATTRD); + } + + if(attrd == NULL) { + printf ("attrd registration failed\n"); + crm_err("attrd registration failed"); + cl_flush_logs(); + exit(LSB_EXIT_GENERIC); + } + + /* Get initial device status */ + statuscheck(sendMsg); + + /* Check device status every interval sec. */ + Gmain_timeout_add(interval*1000, statuscheck, sendMsg); + + crm_info("Starting %s", crm_system_name); + mainloop = g_main_loop_new(NULL, FALSE); + g_main_loop_run(mainloop); + + /* After destroy the mainloop. clean up dynamic memories. */ + g_free(pid_file); + g_free(dom0host->host_name); + g_free(dom0host->port); + crm_free(dom0host); + crm_free(sendMsg); + + crm_info("Exiting %s", crm_system_name); + + return 0; +} + +void +send_update(char* attrName, char* attrValue) +{ + HA_Message *update = ha_msg_new(4); + ha_msg_add(update, F_TYPE, T_ATTRD); + ha_msg_add(update, F_ORIG, crm_system_name); + ha_msg_add(update, F_ATTRD_TASK, "update"); + ha_msg_add(update, F_ATTRD_ATTRIBUTE, attrName); + + ha_msg_add(update, F_ATTRD_VALUE, attrValue); + + if(send_ipc_message(attrd, update) == FALSE) { + crm_err("failed to send update to attrd."); + exit(1); + } + + crm_msg_del(update); +} + diff -urN Heartbeat-2-1-STABLE-2.1.4/tools/hb_vmmonitor/hb_vmmonitor.c Heartbeat-2-1-STABLE-2.1.4.mod/tools/hb_vmmonitor/hb_vmmonitor.c --- Heartbeat-2-1-STABLE-2.1.4/tools/hb_vmmonitor/hb_vmmonitor.c 1970-01-01 09:00:00.000000000 +0900 +++ Heartbeat-2-1-STABLE-2.1.4.mod/tools/hb_vmmonitor/hb_vmmonitor.c 2010-05-14 10:29:56.000000000 +0900 @@ -0,0 +1,1880 @@ +/* + * hb_vmmonitor: Heartbeat Device Status Monitor for Virtual Machine + * + * Copyright (C) 2009 NIPPON TELEGRAPH AND TELEPHONE CORPORATION + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This software is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include + +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_GETOPT_H +# include +#endif +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define OPTARGS "p:c:VD?" +#define EXECCMDLINE_KEYWORD "exec_cmdline" +#define TIMEOUT_KEYWORD "timeout" +#define CHLD_PIDFILE_DIR "/var/run/heartbeat/hb_vmmonitor/" +#define PIDFILE_PATH "/tmp/hb_vmmonitor.pid" +#define CONFFILE_PATH "/etc/hb_vmmonitor.conf" +#define MIN_PORTNUM 1024 +#define MAX_PORTNUM 65535 +#define NORESTART_EXITCODE 100 +#define MAX_BUFFLEN 256 +#define RECV_TIMEOUT 300 +#define SIGKILL_TIMEOUT 10 +#define MAX_SHORTRCOUNT 10 +#define SIGZERO 0 +#define URWXGRXORX S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH + +/* + * prototype declaration. + */ +static gboolean vmmon_sig_shutdown(int nsig, gpointer unused); +static void vmmon_shutdown(void); +static void create_device_tables(GKeyFile *confObj); +static GKeyFile *create_config_object(const char *confFile); +static void vmmon_setting_t_new(void); +static struct device_t *device_t_new(void); +static void free_vmmon_setting(void); +static void free_monitor_device(gpointer data); +static void start_monproc_wrapper(gpointer key, + gpointer value, gpointer userdata); +static int start_monitor_process(gpointer data); +static gboolean timeout_freeze_process(gpointer userdata); +static gboolean create_pid_file(int pid, const char *prgName); +static gboolean check_chldproc_lives(void); +static void stop_monitor_process(gpointer key, + gpointer value, gpointer userdata); +static gboolean stop_check_monitor_process(gpointer userdata); +static gboolean check_live_process(int pid, const char *prgName); +/* Functions for handling the child quit/abort event */ +static void monitoredProcessDied(ProcTrack* p, int status, int signo, + int exitcode, int waslogged); +static void monitoredProcessRegistered(ProcTrack* p); +static const char * monitoredProcessName(ProcTrack* p); +static void setproctimeouts(ProcTrackKillInfo *killseq, pid_t pid); +static int create_socket_do_listen(void); +static gboolean on_listen(GIOChannel *source, + GIOCondition condition, gpointer data); +static gboolean vmmon_ipc_connect(IPC_Channel *channel, gpointer user_data); +static gboolean vmmon_ipc_callback(IPC_Channel *client, gpointer user_data); +void vmmon_local_callback(HA_Message * msg); +static void vmmon_connection_destroy(gpointer user_data); +static gboolean vmmon_refresh(int nsig, gpointer unused); +/* debug function +static gboolean print_debug(int nsig, gpointer unused); +static void debug_write_log( + gpointer key, gpointer value, gpointer userdata); +static void debug_write_file( + gpointer key, gpointer value, gpointer userdata); +*/ + +/* + * callbacks for ProcTrack. + */ +static ProcTrack_ops MonitoredProcessTrackOps = +{ + monitoredProcessDied, + monitoredProcessRegistered, + monitoredProcessName +}; + +/* + * Status of hb_vmmonitor daemon. + */ +enum vmmon_host_state +{ + INIT, + NORMAL, + MONSTOP, + SHUTDOWN +}; + +/* + * Basic settings and status table for hb_vmmonitor. + */ +typedef struct setting_s +{ + int pid; + int state; + char *port; + char *pidFile; + char *confFile; + GArray *sockarray; + +} setting_t; + +/* + * channel info for connect with monitoring process. + * "monitoring process" means pingd, diskd, etc... + */ +typedef struct lpc_s +{ + guint id; + IPC_Channel *channel; + GCHSource *source; + +} vmmon_client_t; + +/* + * monitoring target device's information table. + * attribute, monitoring process + */ +typedef struct device_s +{ + char *attrName; /* device's id */ + char *attrValue; /* device's status */ + char *last_attrValue; /* to check status changed or not */ + + char *prgName; /* program to check device status */ + char *execCmdline; /* cmdline in string format */ + char **splt_execCmdline; /* cmdline in array format */ + ProcTrackKillInfo killseq[2]; /* to kill with ProcTrack */ + pid_t pid; /* monitoring process's pid */ + gboolean respawn; + int respawncount; /* Last time we respawned */ + int shortrcount; /* Count of fast respawns */ + int freeze_check_timer; + int freeze_timer_id; + +} device_t; + +/* + * Message from hb_vmmon_client on DomU. + */ +typedef struct client_msg_s +{ + char attrName[MAX_BUFFLEN]; + +} client_msg_t; + +/* + * Reply Message for hb_vmmon_client. + */ +typedef struct reply_msg_s +{ + int vmmon_host_state; + int check_pid; /* monitoring process's pid */ + char attrName[MAX_BUFFLEN]; + char attrValue[MAX_BUFFLEN]; + +} reply_msg_t; + +setting_t *internal_info = NULL; +GHashTable *monitor_device_hash = NULL; +GMainLoop* mainloop = NULL; +int base_loglevel = LOG_INFO; + +/* + * shutdown with signal. + */ +static gboolean +vmmon_sig_shutdown(int nsig, gpointer unused) +{ + vmmon_shutdown(); + + return FALSE; +} + +/* + * Shutting down process. + */ +static void +vmmon_shutdown() +{ + internal_info->state = SHUTDOWN; + + crm_info("start shutting down."); + + /* stop all monitoring processes. */ + crm_debug("stop all monitoring processes."); + g_hash_table_foreach(monitor_device_hash, stop_monitor_process, NULL); + + /* confirm whether each monitoring process is dead every 1 second. */ + g_timeout_add(1000, stop_check_monitor_process, NULL); + + return; +} + +/* + * callback for when child process (monitoring process) is dead. + * restart the process. + */ +static void +monitoredProcessDied(ProcTrack* p, int status, int signo, + int exitcode, int waslogged) +{ + pid_t new_pid; + device_t *tmpDev = NULL; + char pidfile_path[PATH_MAX]; + + /* find managed chaild process for hash table. */ + tmpDev = g_hash_table_lookup(monitor_device_hash, p->privatedata); + + if (tmpDev == NULL) { + crm_err("failed to get device information for [%s].", + (char *)p->privatedata); + return; + } + + crm_info("monitoring process [%s] (pid=%d) for [%s]" + " exited with exitcode %d.", + tmpDev->prgName, p->pid, tmpDev->attrName, exitcode); + + if (tmpDev->freeze_timer_id != 0) { + /* remove freeze timeout. */ + crm_debug_3("remove freeze check timer event id[%d].", + tmpDev->freeze_timer_id); + + if (g_source_remove(tmpDev->freeze_timer_id) != TRUE) { + crm_warn("failed to remove freeze check timer event id[%d].", + tmpDev->freeze_timer_id); + } + + tmpDev->freeze_timer_id = 0; + } + + /* remove pid file. */ + g_snprintf(pidfile_path, sizeof(pidfile_path), "%s%s.%d", + CHLD_PIDFILE_DIR, tmpDev->prgName, tmpDev->pid); + + if (remove(pidfile_path) != 0) { + cl_perror("failed to remove pid file[%s]", pidfile_path); + } else { + crm_debug("succeeded in remove of pid file[%s].", pidfile_path); + } + + /* + * clear monitoring process's pid. + * pid == 0 means "the device is not monitored." + */ + tmpDev->pid = 0; + g_free(tmpDev->attrValue); + tmpDev->attrValue = NULL; + + if (exitcode == NORESTART_EXITCODE) { + crm_warn("avoid to restart monitoring process [%s] (pid=%d)" + " for [%s], since exit code is [%d].", + tmpDev->prgName, p->pid, tmpDev->attrName, NORESTART_EXITCODE); + g_free(p->killinfo); + g_free(p->privatedata); + reset_proctrack_data(p); + return; + } + + if (internal_info->state == SHUTDOWN || internal_info->state == MONSTOP) { + crm_info("avoid to restart monitoring process [%s] (pid=%d) for " + "[%s], since hb_vmmonitor is shutting down or refresh progress.", + tmpDev->prgName, p->pid, tmpDev->attrName); + g_free(p->killinfo); + g_free(p->privatedata); + reset_proctrack_data(p); + return; + } + + if (tmpDev->respawn) { + longclock_t now = time_longclock(); + longclock_t minticks = msto_longclock(30000); + longclock_t shorttime = add_longclock(p->startticks, minticks); + + tmpDev->respawncount += 1; + + if (cmp_longclock(now, shorttime) < 0) { + tmpDev->shortrcount += 1; + crm_debug_2("monitoring process [%s] for [%s] is count up to" + " shortrcount[%d]", + tmpDev->prgName, tmpDev->attrName, tmpDev->shortrcount); + }else{ + tmpDev->shortrcount = 0; + } + + if (tmpDev->shortrcount > MAX_SHORTRCOUNT) { + crm_err("monitoring process [%s] for [%s] %s over the max count [%d]." , + tmpDev->prgName, tmpDev->attrName, "respawning too fast,", MAX_SHORTRCOUNT); + tmpDev->shortrcount = 0; + }else{ + /* restart child process */ + crm_info("restart monitoring process [%s] for [%s].", + tmpDev->prgName, tmpDev->attrName); + + if (tmpDev->attrValue) { + g_free(tmpDev->attrValue); + tmpDev->attrValue = NULL; + } + + if ((new_pid = start_monitor_process(tmpDev)) > 0) { + tmpDev->pid = new_pid; + + if (tmpDev->freeze_check_timer > 0) { + crm_debug_2("set freeze check timer [%s] for [%s] (pid=%d) is %dsec.", + tmpDev->prgName, tmpDev->attrName, tmpDev->pid, tmpDev->freeze_check_timer); + tmpDev->freeze_timer_id = g_timeout_add( + tmpDev->freeze_check_timer * 1000, timeout_freeze_process, tmpDev); + crm_debug_2("freeze timer event id[%d]", tmpDev->freeze_timer_id); + } + + } else { + crm_err("failed to restart monitoring process [%s] for [%s].", + tmpDev->prgName, tmpDev->attrName); + } + } + } + + /* free dinamic memories. */ + g_free(p->privatedata); + g_free(p->killinfo); + reset_proctrack_data(p); + + return; +} + +/* + * callback for when child process is registered. + */ +static void +monitoredProcessRegistered(ProcTrack* p) +{ + const char *procname = p->ops->proctype(p); + + if (procname != NULL) { + crm_info("monitoring process [%s] for [%s] (pid=%d) started.", + p->ops->proctype(p), (char *)p->privatedata, p->pid); + } + + return; +} + +/* + * ProcTrack->ops->proctype(). + */ +static const char * +monitoredProcessName(ProcTrack* p) +{ + device_t *dev = NULL; + dev = g_hash_table_lookup(monitor_device_hash, p->privatedata); + + if (dev == NULL) { + crm_warn("failed to get device information for [%s].", + (char *)p->privatedata); + return NULL; + } + + return dev->prgName; +} + +/* + * parse config file to get device information. + */ +static void +create_device_tables(GKeyFile *confObj) +{ + int i; + gchar **sections = NULL; + gchar *value = NULL; + gchar *check_value = NULL; + gsize grpsize; + device_t *monitor_device = NULL; + + monitor_device_hash = g_hash_table_new_full( + g_str_hash, g_str_equal, g_free, free_monitor_device); + + if (confObj == NULL) { + return; + } + + /* get all sections. */ + sections = g_key_file_get_groups(confObj, &grpsize); + + if (sections == NULL) { + crm_err("config file has no section."); + g_key_file_free(confObj); + } + + /* for each section. */ + for (i = 0; i < grpsize; i++) { + crm_debug("read section [%s].", sections[i]); + /* get exec commandline for monitoring process. */ + if (g_key_file_has_key(confObj, sections[i], EXECCMDLINE_KEYWORD, NULL)) { + + value = g_key_file_get_string( + confObj, sections[i], EXECCMDLINE_KEYWORD, NULL); + + if (crm_str_eq(g_strstrip(value), "", 0)) { + crm_warn("the value of [%s] in section [%s] is empty" + " Ignore this section.", sections[i], EXECCMDLINE_KEYWORD); + g_free(value); + continue; + } + + monitor_device = (device_t *)device_t_new(); + monitor_device->attrName = g_strdup(sections[i]); + /* get command and arguments string from exec_cmdline string. */ + monitor_device->execCmdline = g_strdup(g_strstrip(value)); + crm_debug_2("set execute command line[%s].", + monitor_device->execCmdline); + monitor_device->splt_execCmdline = g_strsplit_set(monitor_device->execCmdline, " \t", 2); + monitor_device->prgName = \ + g_strdup(basename(monitor_device->splt_execCmdline[0])); + g_free(value); + + /* get timeout flag. */ + if (g_key_file_has_key(confObj, sections[i], + TIMEOUT_KEYWORD, NULL)) { + + value = g_key_file_get_string(confObj, sections[i], + TIMEOUT_KEYWORD, NULL); + + if (crm_str_eq(g_strstrip(value), "", FALSE)) { + crm_warn("the value of [%s] in section [%s] is empty.", + sections[i], TIMEOUT_KEYWORD); + monitor_device->freeze_check_timer = 0; + + } else if (crm_atoi(g_strstrip(value), NULL) < 0) { + crm_warn("Value[%s] of the [%s] is invalid.", + g_strstrip(value), TIMEOUT_KEYWORD); + monitor_device->freeze_check_timer = 0; + + } else { + monitor_device->freeze_check_timer = crm_atoi(g_strstrip(value), NULL); + check_value = crm_itoa(monitor_device->freeze_check_timer); + if (safe_str_neq(g_strstrip(value), check_value)) { + crm_warn("failed in the conversion of the value[%s] of [%s].", + g_strstrip(value), TIMEOUT_KEYWORD); + monitor_device->freeze_check_timer = 0; + } + crm_free(check_value); + } + crm_debug_2("set freeze check time-out in %d seconds.", + monitor_device->freeze_check_timer); + + g_free(value); + } + + crm_debug("insert monitor process[%s] in a hash table.", sections[i]); + g_hash_table_insert(monitor_device_hash, + g_strdup(sections[i]), monitor_device); + + } else { + crm_warn("section [%s] has no [%s]" + " Ignore this section.", sections[i], EXECCMDLINE_KEYWORD); + continue; + } + } + + g_strfreev(sections); + g_key_file_free(confObj); + + if (g_hash_table_size(monitor_device_hash) <= 0) { + crm_warn("config file has no valiable setting for" + " monitoring target."); + } + + return; +} + +/* + * read config file and create config object. + */ +static GKeyFile * +create_config_object(const char *confFile) +{ + GKeyFile *confObj = NULL; + + confObj = g_key_file_new(); /* create new object */ + + /* read config file */ + if (g_key_file_load_from_file(confObj, confFile, G_KEY_FILE_NONE, NULL) == FALSE) { + crm_err("failed to read config file [%s].", confFile); + return NULL; + } + + crm_info("succeeded in the reading of the config file[%s].", confFile); + + return confObj; +} + +/* + * Initiarize hb_vmmonitor's basic setting information. + */ +static void +vmmon_setting_t_new() +{ + internal_info = g_malloc0(sizeof(setting_t)); + + internal_info->pid = (int)getpid(); + internal_info->state = INIT; + internal_info->port = NULL; + internal_info->pidFile = g_strdup(PIDFILE_PATH); + internal_info->confFile = g_strdup(CONFFILE_PATH); + internal_info->sockarray = g_array_new(FALSE, TRUE, sizeof(int)); + + return; +} + +/* + * Initiarize monitoring target device information. + */ +static struct device_t * +device_t_new() +{ + device_t *dev = NULL; + dev = g_malloc(sizeof(device_t)); + + dev->attrName = NULL; + dev->attrValue = NULL; + dev->last_attrValue = NULL; + dev->prgName = NULL; + dev->execCmdline = NULL; + dev->splt_execCmdline = NULL; + dev->pid = 0; + dev->respawn = TRUE; + dev->shortrcount = 0; + dev->freeze_check_timer = 0; + dev->freeze_timer_id = 0; + + return (void *)dev; +} + +/* + * free hb_vmmonitor's basic setting. + */ +static void +free_vmmon_setting() +{ + int i; + + crm_debug("free internal_info."); + + g_free(internal_info->pidFile); + g_free(internal_info->confFile); + g_free(internal_info->port); + + for (i = 0;i < internal_info->sockarray->len; i++) { + close(g_array_index(internal_info->sockarray, int, i)); + } + + g_array_free(internal_info->sockarray, TRUE); + g_free(internal_info); + + return; +} + +/* + * free monitoring target device information. + */ +static void +free_monitor_device(gpointer data) +{ + device_t *dev; + + if (data == NULL) { + return; + } + + dev = (device_t *)data; + + crm_debug("free monitoring target device[%s].", dev->attrName); + + g_free(dev->attrName); + g_free(dev->attrValue); + g_free(dev->last_attrValue); + g_free(dev->prgName); + g_free(dev->execCmdline); + g_strfreev(dev->splt_execCmdline); + g_free(dev); + + return; +} + +/* + * kill all child processes and re-read config file. + * for dynamic configure modification. + */ +static gboolean +vmmon_refresh(int nsig, gpointer unused) +{ + + if (internal_info->state == MONSTOP || + internal_info->state == SHUTDOWN) { + crm_debug("refreshment of the monitor process was ignored."); + return TRUE; + } + + /* don't restart child processes. */ + internal_info->state = MONSTOP; + + crm_info("refresh monitoring process."); + + /* stop all child processes. */ + crm_debug("stop all monitoring processes."); + g_hash_table_foreach(monitor_device_hash, stop_monitor_process, NULL); + + /* + * confirm all child processes are dead every 1 second. + * after all are dead, re-read config file. + */ + g_timeout_add_full(G_PRIORITY_HIGH, 1000, + stop_check_monitor_process, NULL, NULL); + + return TRUE; +} + +/* + * set timeout sequence. + * see also the comment on stop_monitor_process(). + */ +static void +setproctimeouts(ProcTrackKillInfo *killseq, pid_t pid) +{ + int t_id = 0; + + killseq[0].mstimeout = SIGKILL_TIMEOUT * 1000; /* after 10 secs remove it */ + killseq[0].signalno = SIGKILL; + killseq[1].mstimeout = 5000; /* if it's still there after 5, complain */ + killseq[1].signalno = SIGZERO; + + if ((t_id = SetTrackedProcTimeouts(pid, killseq)) > 0) { + crm_debug_3("set timeout seq PID[%d]/Timer ID[%d].", pid, t_id); + } + + return; +} + +/* + * kill all child monitoring process. + * when it sends SIGTERM to a child process (without ProcTrack), + * send SIGKILL 10 second after (with ProcTrack). + */ +static void +stop_monitor_process(gpointer key, gpointer value, gpointer userdata) +{ + device_t *dev; + + if (value == NULL) { + return; + } + + dev = (device_t *)value; + + if (dev->pid > 0 && check_live_process(dev->pid, dev->prgName)) { + crm_debug_2("try to send SIGTERM to monitoring process[%s] (pid=%d)" + " for [%s].", dev->prgName, dev->pid, dev->attrName); + + /* do not respawn it */ + dev->respawn = FALSE; + + if (kill(dev->pid, SIGTERM) < 0) { + + if (errno == ESRCH) { + /* Mission accomplished! */ + crm_info("(PID %d) died before killing (try %d).", + dev->pid, SIGTERM); + dev->pid = 0; + return; + } else { + crm_err("%s: kill(%d,%d) failed." + ,__FUNCTION__, dev->pid, SIGTERM); + } + + } + + crm_debug("send SIGTERM to monitoring process[%s] (pid=%d)" + " for [%s].", dev->prgName, dev->pid, dev->attrName); + + /* for escalation to SIGKILL, use ProcTrack. */ + setproctimeouts(dev->killseq, dev->pid); + return; + } + + crm_debug("monitoring process[%s] (pid=%d) for [%s] already stopped.", + dev->prgName, dev->pid, dev->attrName); + + return; +} + +/* + * wrapper for starting a monitoring process function. + * for using g_hash. + */ +static void +start_monproc_wrapper(gpointer key, gpointer value, gpointer userdata) +{ + pid_t new_pid; + device_t *dev; + + if (value == NULL) { + return; + } + + dev = (device_t *)value; + + if (dev->pid > 0) { + crm_debug("monitoring process[%s] for [%s] is already started.", + dev->prgName, dev->attrName); + return; + } + + if (dev->attrValue != NULL) { + g_free(dev->attrValue); + dev->attrValue = NULL; + } + + crm_info("start monitoring process [%s] for [%s].", dev->prgName, dev->attrName); + if ((new_pid = start_monitor_process(dev)) > 0) { + + dev->pid = new_pid; + + if (dev->freeze_check_timer > 0) { + crm_info("set freeze check timer [%s] for [%s] (pid=%d) is %dsec.", + dev->prgName, dev->attrName, dev->pid, dev->freeze_check_timer); + dev->freeze_timer_id = g_timeout_add( + dev->freeze_check_timer * 1000, timeout_freeze_process, dev); + crm_debug_2("freeze check timer event id[%d]", dev->freeze_timer_id); + } + + } else { + crm_err("failed to start monitoring process [%s] for [%s].", + dev->prgName, dev->attrName); + return; + } + + return; +} + +/* + * start one child monitoring process. + */ +static int +start_monitor_process(gpointer data) +{ + int i; + pid_t pid; + device_t *dev; + char *cmdexec = NULL; + + if (data == NULL) { + return -1; + } + + dev = (device_t *)data; + + pid = fork(); + + if (pid < 0) { + crm_err("failed to start child process. can't start [%s] for [%s].", + dev->splt_execCmdline[0], dev->attrName); + + return -1; + } else if (pid > 0) { /* in the parent process */ + create_pid_file(pid, dev->prgName); + NewTrackedProc( pid, 0, PT_LOGVERBOSE, + g_strdup(dev->attrName), &MonitoredProcessTrackOps); + return pid; + } + + /* independent from parents' process group. */ + setpgid(0,0); + + for (i = 0;i < internal_info->sockarray->len; i++) { + close(g_array_index(internal_info->sockarray, int, i)); + } + + crm_debug("execute [%s] for [%s].", + dev->attrName, dev->execCmdline); + cmdexec = g_strjoin(" ", "exec", dev->splt_execCmdline[0], + dev->splt_execCmdline[1], (const char*)NULL); + + /* execute command to start the monitoring process. */ + if (-1 == execl("/bin/sh", "sh", "-c", cmdexec, (const char*)NULL)) { + cl_perror("failed to execute [%s]", dev->splt_execCmdline[0]); + } + g_free(cmdexec); + + /* the following is only for when it failed to execute execvp(). */ + /* if go here, there must be something wrong */ + + /* Since parameter error, donnot need to be respawned */ + exit(NORESTART_EXITCODE); +} + +/* + * check freeze for child process. + */ +static gboolean +timeout_freeze_process(gpointer userdata) +{ + device_t *dev; + + crm_warn("start killing process freeze."); + + if (userdata == NULL) { + return FALSE; + } + + dev = (device_t *)userdata; + + if (dev->pid == 0) { + crm_info("process[%s] died before checking.", dev->prgName); + return FALSE; + } + + if (kill(dev->pid, SIGTERM) < 0) { + + if (errno == ESRCH) { + /* Mission accomplished! */ + crm_info("(PID %d) died before checking (try %d).", + dev->pid, SIGTERM); + dev->pid = 0; + return FALSE; + } else { + crm_err("kill(%d, %d) failed.", + dev->pid, SIGTERM); + } + + } else { + crm_debug_2("send kill(%d, %d) succeseed.", dev->pid, SIGTERM); + } + + setproctimeouts(dev->killseq, dev->pid); + dev->freeze_timer_id = 0; + + return FALSE; +} + +/* + * create child monitoring process pid-file. + */ +static gboolean +create_pid_file(int pid, const char *prgName) +{ + int fd; + char *pidstr = NULL; + char pidfile_name[PATH_MAX]; + char *writeBuffer = NULL; + + pidstr = g_malloc0(10); + g_snprintf(pidstr, 10, "%d", pid); + g_snprintf(pidfile_name, sizeof(pidfile_name), "%s%s.%s", + CHLD_PIDFILE_DIR, prgName, pidstr); + + if (g_file_test(CHLD_PIDFILE_DIR, G_FILE_TEST_IS_DIR) != TRUE) { + + crm_info("create directory[%s].", CHLD_PIDFILE_DIR); + + if (g_mkdir_with_parents(CHLD_PIDFILE_DIR, URWXGRXORX) == -1){ + crm_err("failed to create directory[%s].",CHLD_PIDFILE_DIR); + return FALSE; + } + + } + + /* permission:644 + * write only + * create file + */ + if ((fd = open(pidfile_name, O_WRONLY|O_CREAT|O_TRUNC, + S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH)) == -1) { + /* failed to open */ + cl_perror("failed to open pid file[%s]", pidfile_name); + g_free(pidstr); + return FALSE; + } + + crm_debug_3("opened pid file [%s].", pidfile_name); + + writeBuffer = g_strjoin(" ", pidstr, prgName, NULL); + + if ((write(fd, writeBuffer, strlen(writeBuffer))) == -1) { + /* failed to write */ + cl_perror("failed to write for pid file[%s]", pidfile_name); + g_free(pidstr); + g_free(writeBuffer); + close(fd); + return FALSE; + } + + crm_debug_3("wrote pid number [%s] in pid file.", pidstr); + + g_free(pidstr); + g_free(writeBuffer); + close(fd); + + return TRUE; +} + +/* + * check process lives. + */ +static gboolean +check_chldproc_lives() +{ + GDir *pidDir; + const char *pidfile = NULL; + GIOChannel *pidfile_ch = NULL; + char *line = NULL; + char *pidfile_path = NULL; + char **tmp_split = NULL; + char *prgName = NULL; + int pid, timer; + + /* Existence confirmation to pid directory */ + if (g_file_test(CHLD_PIDFILE_DIR, G_FILE_TEST_IS_DIR) != TRUE) { + crm_info("not exist directory[%s], do not check child process.", + CHLD_PIDFILE_DIR); + return TRUE; + } + + if ((pidDir = g_dir_open(CHLD_PIDFILE_DIR, 0, NULL)) == NULL) { + crm_err("failed to open pid directory[%s].", CHLD_PIDFILE_DIR); + return FALSE; + } + + crm_debug_2("open pid directory[%s].", CHLD_PIDFILE_DIR); + + /* read pid directory to get pid-file from sequentially. + * need after g_dir_close(). + */ + while ((pidfile = g_dir_read_name(pidDir)) != NULL) { + crm_debug_3("read pid file[%s] from pid directory.", pidfile); + pidfile_path = g_strconcat(CHLD_PIDFILE_DIR, pidfile, NULL); + + if ((pidfile_ch = g_io_channel_new_file( + pidfile_path, "r", NULL)) == NULL) { + /* failed to pid-file open */ + crm_err("failed to open pid file[%s].", pidfile); + goto pidfile_remove; + } + + if (g_io_channel_read_line(pidfile_ch, &line, + NULL, NULL, NULL) != G_IO_STATUS_NORMAL) { + crm_err("failed to read line from file[%s].", pidfile); + goto ioch_shutdown; + } + + tmp_split = g_strsplit(line, " ", 0); + g_free(line); + prgName = g_strdup(tmp_split[1]); + + if ((pid = crm_atoi(tmp_split[0], "0")) == 0) { + crm_err("failed to get process id for pid-file."); + g_strfreev(tmp_split); + goto ioch_shutdown; + } + g_strfreev(tmp_split); + + if (check_live_process(pid, prgName) != TRUE) { + g_free(prgName); + goto ioch_shutdown; + } + + crm_warn("process name[%s] pid[%d] is still alive.", + prgName, pid); + g_free(prgName); + crm_info("send SIGTERM for pid[%d].", pid); + + kill(pid, SIGTERM); + timer = SIGKILL_TIMEOUT; + + while (kill(pid, SIGZERO) == 0) { + + crm_debug_2("set a timer at wait time for process end to [%d]sec.", timer); + + if (timer < 1) { + crm_warn("Wait time for process end did time-out."); + crm_info("send SIGKILL for pid[%d].", pid); + kill(pid, SIGKILL); + break; + } + + timer--; + sleep(1); + } + +ioch_shutdown: + if (g_io_channel_shutdown(pidfile_ch, FALSE, NULL) != \ + G_IO_STATUS_NORMAL) { + crm_err("failed to close file[%s].", pidfile); + } + g_io_channel_unref(pidfile_ch); + +pidfile_remove: + if (remove(pidfile_path) == -1) { + cl_perror("failed to remove pid file[%s]", pidfile_path); + } + g_free(pidfile_path); + + } + + g_dir_close(pidDir); + + return TRUE; +} + +/* + * print Usage. + */ +static void +usage(const char *cmd, int exit_status) +{ + FILE *stream; + + stream = exit_status ? stderr : stdout; + + fprintf(stream, "usage: %s -p [-cVD?]\n", cmd); + fprintf(stream, "\t--%s (-%c) \t\tPort number for waiting messages from hb_vmmon_client. (%d - %d)\n" + "\t\t\t\t\t* required option\n", "port", 'p', MIN_PORTNUM, MAX_PORTNUM); + fprintf(stream, "\t--%s (-%c) \tFull path of config file.\n" + "\t\t\t\t\t* Default=%s\n", "config-file", 'c', CONFFILE_PATH); + fprintf(stream, "\t--%s (-%c) \t\t\tRun in verbose mode\n", "verbose", 'V'); + fprintf(stream, "\t--%s (-%c) \t\tRun in daemon mode\n", "daemonize", 'D'); + fprintf(stream, "\t--%s (-%c) \t\t\tThis text\n", "help", '?'); + fflush(stream); + + exit(exit_status); +} + +/* + * main function. + */ +int +main(int argc, char **argv) +{ + int argerr = 0; + int flag; + char *channel_name = g_strdup("hb_vmmonitor"); + GKeyFile *confObj = NULL; + gboolean daemonize = FALSE; + +#ifdef HAVE_GETOPT_H + int option_index = 0; + static struct option long_options[] = { + /* Top-level Options */ + {"verbose", 0, 0, 'V'}, + {"help", 0, 0, '?'}, + {"daemonize", 0, 0, 'D'}, + {"port", 1, 0, 'p'}, + {"config-file", 1, 0, 'c'}, + {0, 0, 0, 0} + }; +#endif + + setenv("HA_use_logd", "on", 1); + crm_system_name = basename(argv[0]); + crm_log_init(basename(argv[0]), LOG_INFO, TRUE, FALSE, argc, argv); + + /* set signal handler. */ + G_main_add_SignalHandler(G_PRIORITY_HIGH, + SIGHUP, vmmon_refresh, NULL, NULL); + G_main_add_SignalHandler(G_PRIORITY_HIGH, + SIGTERM, vmmon_sig_shutdown, NULL, NULL); + G_main_add_SignalHandler(G_PRIORITY_HIGH, + SIGINT, vmmon_sig_shutdown, NULL, NULL); + /* debug code + G_main_add_SignalHandler( + G_PRIORITY_HIGH, SIGQUIT, print_debug, NULL, NULL); + */ + /* handle child processes' death. */ + set_sigchld_proctrack(G_PRIORITY_HIGH, DEFAULT_MAXDISPATCHTIME); + + /* create new basic setting table and set default value. */ + vmmon_setting_t_new(); + + while (1) { +#ifdef HAVE_GETOPT_H + flag = getopt_long(argc, argv, OPTARGS, + long_options, &option_index); +#else + flag = getopt(argc, argv, OPTARGS); +#endif + if (flag == -1) + break; + + switch(flag) { + case 'c': + g_free(internal_info->confFile); + internal_info->confFile = g_strdup(optarg); + break; + case 'D': + daemonize = TRUE; + break; + case 'p': + if (crm_atoi(optarg, "0") < MIN_PORTNUM || + crm_atoi(optarg, "0") > MAX_PORTNUM) { + usage(crm_system_name, 1); + } + + internal_info->port = g_strdup(optarg); + break; + case 'V': + cl_log_enable_stderr(TRUE); + alter_debug(DEBUG_INC); + break; + case '?': + usage(crm_system_name, 1); + break; + default: + printf("Argument code 0%o (%c) is not (?yet?) supported.\n", + flag, flag); + crm_err("Argument code 0%o (%c) is not (?yet?) supported.\n", + flag, flag); + ++argerr; + break; + } + } + + if (optind < argc) { + + crm_err("non-option ARGV-elements: "); + printf("non-option ARGV-elements: "); + + while (optind < argc) { + crm_err("%s ", argv[optind]); + printf("%s ", argv[optind]); + optind++; + } + + printf("\n"); + } + + if (argerr || internal_info->port == NULL) { + usage(crm_system_name, 1); + } + + crm_make_daemon(crm_system_name, daemonize, internal_info->pidFile); + + crm_notice("Starting %s.", crm_system_name); + + /* create socket for waiting messages from hb_vmmon_client on DomU. */ + if (0 != create_socket_do_listen()) { + crm_err("failed to create socket."); + exit(1); + } + + /* the check of the process that operated before start. + * terminate it if a process lives. + */ + if (check_chldproc_lives() != TRUE) { + crm_err("failed to stop pid process."); + exit(1); + } + + /* create a channel for connection with monitoring processes. */ + if(0 != init_server_ipc_comms(channel_name, vmmon_ipc_connect, + default_ipc_connection_destroy)) { + crm_err("Could not start IPC server."); + exit(1); + } + + /* read config to get monitoring target devices' info. */ + confObj = create_config_object(internal_info->confFile); + create_device_tables(confObj); + + mainloop = g_main_loop_new(NULL, FALSE); + g_main_loop_run(mainloop); + + g_hash_table_destroy(monitor_device_hash); + g_free(channel_name); + free_vmmon_setting(); + + crm_notice("Exiting %s.", crm_system_name); + + return 0; +} + +/* + * create socket and listen. + * for waiting messages from hb_vmmon_client on DomU. + */ +static int +create_socket_do_listen() +{ + int err; + int on = 1; + int sockfd = -1; + struct addrinfo hints; + struct addrinfo* res = NULL; + struct addrinfo* addri; + GIOChannel *sch; + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_PASSIVE; + + /* arg1 == NULL means INADDR_ANY or IN6ADDR_ANY_INIT. */ + err = getaddrinfo(NULL, internal_info->port, &hints, &res); + + if (err != 0) { + crm_err("failed to get network address information: %s.", + gai_strerror(err)); + freeaddrinfo(res); + return -1; + } + + for (addri = res; addri; addri = addri->ai_next) { + sockfd = socket(addri->ai_family, addri->ai_socktype, addri->ai_protocol); + + if (sockfd < 0) { + cl_perror("failed to socket()"); + return -1; + } + + g_array_append_val(internal_info->sockarray, sockfd); + + /* IPv6 socket is accept to only ipv6 protocol */ + if (addri->ai_family == AF_INET6) { + if (setsockopt(sockfd, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)) < 0) { + cl_perror("failed to setsockopt()"); + return -1; + } else { + crm_debug_3("set IPV6_V6ONLY\n"); + } + } + + if (addri->ai_family == AF_INET || addri->ai_family == AF_INET6) { + if (setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) < 0) { + cl_perror("failed to setsockopt()"); + return -1; + } else { + crm_debug_3("set SO_REUSEADDR\n"); + } + } + + if (bind(sockfd, addri->ai_addr, addri->ai_addrlen) < 0) { + cl_perror("failed to bind()"); + return -1; + } + + if (listen(sockfd, SOMAXCONN) < 0) { + cl_perror("failed to listen()"); + return -1; + } + + sch = g_io_channel_unix_new(sockfd); + g_io_add_watch_full(sch, G_PRIORITY_HIGH, + G_IO_IN, on_listen, NULL, NULL); + } + + freeaddrinfo(res); + + return 0; + +} + +/* + * callback for when it gets a message from hb_vmmon_client. + * reply the status of specified device. + * If the monitoring process for specified device is not running yet, + * start it, then, reply empty string as device status and pid of monitoring + * process. it means "monitoring started, but don't get device's status yet." + * so, client knows the device's status after the second request. + */ +static gboolean +on_listen(GIOChannel *source, GIOCondition condition, gpointer data) +{ + int ssock, csock; + unsigned laddr; + struct sockaddr_in addr; + client_msg_t *clientMsg = NULL; + reply_msg_t *replyMsg = NULL; + int pid = 0; + device_t *dev = NULL; + fd_set rfds; + struct timeval timeout; + int select_ret; + + crm_debug_3("client connection event."); + + if (condition & G_IO_IN) { + ssock = g_io_channel_unix_get_fd(source); + laddr = sizeof(addr); + csock = accept(ssock, (struct sockaddr*)&addr, &laddr); + + if (csock == -1) { + cl_perror("failed to accept socket"); + return TRUE; + } + + crm_debug("client[%s] was connected.", inet_ntoa(addr.sin_addr)); + + if (internal_info->state == INIT) { + /* start devices' status monitoring. */ + crm_info("initial start monitoring process."); + g_hash_table_foreach(monitor_device_hash, + start_monproc_wrapper, NULL); + /* here, then all monitoring process started. */ + internal_info->state = NORMAL; + } + + if ((pid = fork()) < 0) { + crm_err("failed to create child process" + " can't receive message from hb_vmmon_client."); + close(csock); + return TRUE; + } else if (pid > 0) { + /* parent process. */ + close(csock); /* close client's socket */ + return TRUE; + } + + /* child process. */ + + close(ssock); + crm_malloc0(clientMsg, sizeof(client_msg_t)); + + timeout.tv_sec = RECV_TIMEOUT; + timeout.tv_usec = 0; + FD_ZERO(&rfds); + FD_SET(csock, &rfds); + select_ret = select(FD_SETSIZE, &rfds, (fd_set *)NULL, + (fd_set *)NULL, &timeout); + + if (select_ret == -1) { + crm_err("select failure occurred."); + close(csock); + exit(1); + } else if (select_ret) { + + /* receive message from client. */ + if (-1 == recv(csock, clientMsg, sizeof(client_msg_t), 0)) { + cl_perror("failed to recive msg"); + close(csock); + exit(1); + } + + crm_debug_2("recv attr name [%s].", clientMsg->attrName); + + dev = g_hash_table_lookup(monitor_device_hash, clientMsg->attrName); + + if (dev == NULL) { + crm_warn("received a unidentified attribute name[%s] from client[%s].", + clientMsg->attrName, inet_ntoa(addr.sin_addr)); + close(csock); + exit(1); + } + + /* create reply message. */ + crm_malloc0(replyMsg, sizeof(reply_msg_t)); + replyMsg->vmmon_host_state = internal_info->state; + replyMsg->check_pid = dev->pid; + strlcpy(replyMsg->attrName, dev->attrName, + strlen(dev->attrName) + 1); + + if (dev->attrValue != NULL) { + strlcpy(replyMsg->attrValue, dev->attrValue, + strlen(dev->attrValue) + 1); + } + + crm_debug_2("send attr name [%s] value [%s].", + replyMsg->attrName, replyMsg->attrValue); + + /* send it. */ + if (-1 == send(csock, replyMsg, sizeof(reply_msg_t), + MSG_DONTWAIT|MSG_NOSIGNAL)) { + cl_perror("failed to send msg for [%s]", dev->attrName); + close(csock); + exit(1); + } + + } else { + crm_err("failed to receive message from hb_vmmonitor" + " (timed out)."); + } + + close(csock); + exit(0); + } + + return TRUE; +} + +/* + * connect to a monitoring process. + * ref. attrd_connect() in tools/attrd.c + */ +static gboolean +vmmon_ipc_connect(IPC_Channel *wchannel, gpointer user_data) +{ + vmmon_client_t *new_client = NULL; + + crm_debug_3("Connection came in ipc channel."); + + if(wchannel == NULL) { + crm_err("ipc channel was NULL."); + return FALSE; + } else if(wchannel->ch_status != IPC_CONNECT) { + crm_err("ipc channel was disconnected."); + return FALSE; + } else if(internal_info->state == SHUTDOWN) { + crm_info("Ignoring connection request during shutdown."); + return FALSE; + } + + crm_malloc0(new_client, sizeof(vmmon_client_t)); + new_client->channel = wchannel; + + /* set callback to get message from monitoring process. */ + new_client->source = G_main_add_IPC_Channel( + G_PRIORITY_HIGH, wchannel, FALSE, vmmon_ipc_callback, + new_client, vmmon_connection_destroy); + + new_client->id = g_source_get_id((GSource *)new_client->source); + + crm_debug_3("Client id[%u] connected.", new_client->id); + + return TRUE; +} + +/* + * callback for when it gets message from monitoring process. + * ref. attrd_ipc_callback() in tools/attrd.c + */ +static gboolean +vmmon_ipc_callback(IPC_Channel *client, gpointer user_data) +{ + int lpc = 0; + HA_Message *msg = NULL; + vmmon_client_t *curr_client = (vmmon_client_t *)user_data; + gboolean stay_connected = TRUE; + + crm_debug_3("Invoked: monitoring process id[%u].", curr_client->id); + + while(IPC_ISRCONN(client)) { + + if(client->ops->is_message_pending(client) == 0) { + break; + } + + msg = msgfromIPC_noauth(client); + + if (msg == NULL) { + crm_debug("monitoring process id[%u]: no message this time.", curr_client->id); + continue; + } + + lpc++; + + crm_debug_2("Processing msg from monitoring process id[%u].", curr_client->id); + + vmmon_local_callback(msg); + + crm_msg_del(msg); + msg = NULL; + + if(client->ch_status != IPC_CONNECT) { + break; + } + } + + crm_debug_2("Processed %d messages.", lpc); + + if (client->ch_status != IPC_CONNECT) { + stay_connected = FALSE; + } + + return stay_connected; +} + +/* + * parse message from monitoring process. + */ +void +vmmon_local_callback(HA_Message * msg) +{ + device_t *dev = NULL; + const char *from = ha_msg_value(msg, F_ORIG); + const char *op = ha_msg_value(msg, F_ATTRD_TASK); + const char *attr = ha_msg_value(msg, F_ATTRD_ATTRIBUTE); + const char *value = ha_msg_value(msg, F_ATTRD_VALUE); + + crm_debug("%s message from %s: %s=%s.", op, from, attr, crm_str(value)); + + /* get specified device's information. */ + dev = (device_t *)g_hash_table_lookup(monitor_device_hash, attr); + + if(dev == NULL) { + crm_err("[%s] is not a monitoring target.", attr); + return; + } + + if (dev->freeze_timer_id != 0) { + /* reset freeze timeout. */ + crm_debug_2("remove freeze check timer event id[%d].", + dev->freeze_timer_id); + + if (g_source_remove(dev->freeze_timer_id) != TRUE) { + crm_warn("failed to remove freeze check timer event id[%d].", + dev->freeze_timer_id); + } + + dev->freeze_timer_id = g_timeout_add( + dev->freeze_check_timer * 1000, timeout_freeze_process, dev); + } + + if(safe_str_eq(dev->attrValue, value)) { + crm_debug("Ignore since status doesn't change."); + return; + } + + /* + * hold last status value to last_attrValue, and + * set current one to attrValue. + */ + + crm_info("attribute value of attribute name[%s] is updated: [%s] => [%s].", + dev->attrName, dev->attrValue, value); + + g_free(dev->last_attrValue); + dev->last_attrValue = dev->attrValue; + + if(value != NULL) { + dev->attrValue = g_strdup(value); + } else { + dev->attrValue = NULL; + } + + return; +} + +/* + * destroy the connection to a monitoring process. + * it is called when the connection is dead + * (maybe it means the monitoring process is dead). + */ +static void +vmmon_connection_destroy(gpointer user_data) +{ + vmmon_client_t *client = user_data; + + /* monitoring process disconnect */ + if(client == NULL) { + return; + } + + if(client->source != NULL) { + crm_debug_3("Deleting ipc channel for monitoring process id[%u] (%p) from mainloop.", + client->id, client->source); + G_main_del_IPC_Channel(client->source); + client->source = NULL; + } + + crm_free(client); + crm_debug("freed the session information for monitoring process."); + + return; +} + +/* + * check live for process. + */ +static gboolean +check_live_process(int pid, const char *prgName) +{ + GIOChannel *status_ch; + char statfile[PATH_MAX]; + char *line = NULL; + char **tmp_split = NULL; + char *stat_prgName = NULL; + + if ((kill(pid, SIGZERO) < 0) && ESRCH == errno) { + crm_info("process name[%s] pid[%d] is already dead.", + prgName, pid); + + /* process is dead */ + return FALSE; + } + + /* check executed program name for /proc/[pid]/stat */ + g_snprintf(statfile, sizeof(statfile), "/proc/%d/status", pid); + + if ((status_ch = g_io_channel_new_file(statfile, "r", NULL)) == NULL) { + /* failed to open */ + crm_err("failed to open file[%s].", statfile); + /* process is dead */ + return FALSE; + } + + crm_debug_3("opened file [%s].", statfile); + + if (g_io_channel_read_line(status_ch, &line, NULL, NULL, NULL) != \ + G_IO_STATUS_NORMAL) { + crm_err("failed to read line from file[%s].", statfile); + + if (g_io_channel_shutdown(status_ch, FALSE, NULL) != \ + G_IO_STATUS_NORMAL) { + crm_err("failed to close file[%s].", statfile); + } + + g_io_channel_unref(status_ch); + /* process is dead */ + return FALSE; + } + + tmp_split = g_strsplit(line, ":", 0); + stat_prgName = g_strdup(g_strstrip(tmp_split[1])); + crm_debug_3("get program name[%s] for status file[%s].", + stat_prgName, statfile); + + if (safe_str_eq(prgName, stat_prgName) != TRUE) { + crm_info("pid is the same, but processes are different."); + g_strfreev(tmp_split); + g_free(stat_prgName); + g_free(line); + + if (g_io_channel_shutdown(status_ch, FALSE, NULL) != \ + G_IO_STATUS_NORMAL) { + crm_err("failed to close file[%s].", statfile); + } + + g_io_channel_unref(status_ch); + /* process is dead */ + return FALSE; + } + + if (g_io_channel_shutdown(status_ch, FALSE, NULL) != \ + G_IO_STATUS_NORMAL) { + crm_err("failed to close file[%s].", statfile); + } + + g_io_channel_unref(status_ch); + g_strfreev(tmp_split); + g_free(stat_prgName); + g_free(line); + + /* process is alive */ + return TRUE; +} + +/* + * check whether a monitoring process is dead or alive. + */ +static void +check_process_dead(gpointer key, gpointer value, gpointer userdata) +{ + int *liveCnt = (int *)userdata; + device_t *dev; + + if (value == NULL) { + return; + } + + dev = (device_t *)value; + + if (dev->pid > 0) { + + if (check_live_process(dev->pid, dev->prgName)) { + crm_debug_2("attrName[%s] (%s[%d]) is active.", + dev->attrName, dev->prgName, dev->pid); + (*liveCnt)++; + } + + } else { + crm_debug_2("attrName[%s] (%s) is died.", + dev->attrName, dev->prgName); + } + + return; +} + +/* + * check all monitoring process are dead. + * it is called when hb_vmmonitor is shutting down or refresh. + * when hb_vmmonitor's status is "MONSTOP", it means "do refresh", + * so re-read config file and re-construct device information table. + * and when its state is "SHUTDOWN", destroy mainloop, then + * hb_vmmonitor is going to be dead. + */ +static gboolean +stop_check_monitor_process(gpointer userdata) +{ + int act_cnt = 0; + GKeyFile *confObj = NULL; + + if (g_hash_table_size(monitor_device_hash) > 0) { + crm_debug_2("check whether all monitoring process are dead."); + /* confirm each process's death. */ + g_hash_table_foreach(monitor_device_hash, + check_process_dead, &act_cnt); + + if (act_cnt > 0) { + crm_debug_2("some monitoring processes are still alive."); + return TRUE; + } + + crm_info("all monitoring process are dead."); + } + + if (internal_info->state == MONSTOP) { + /* re-read config file. */ + g_hash_table_destroy(monitor_device_hash); + crm_info("re-read config file."); + confObj = create_config_object(internal_info->confFile); + /* re-construct device information table. */ + create_device_tables(confObj); + internal_info->state = INIT; + } else if (internal_info->state == SHUTDOWN) { + if (mainloop != NULL && g_main_loop_is_running(mainloop)) { + g_main_loop_quit(mainloop); + } else { + g_hash_table_destroy(monitor_device_hash); + exit(0); + } + } + + /* remove timeout for check children's dead */ + return FALSE; +} + +/********** for debug **********/ +/* +static gboolean +print_debug(int nsig, gpointer unused) +{ + if (g_hash_table_size(monitor_device_hash) > 0) { + crm_debug_2("output debug print."); + g_hash_table_foreach(monitor_device_hash, + debug_write_log, NULL); + g_hash_table_foreach(monitor_device_hash, + debug_write_file, NULL); + } else { + crm_debug("monitor_device_hash is empty."); + } + return TRUE; +} + +static void +debug_write_log(gpointer key, gpointer value, gpointer userdata) +{ + device_t *dev = NULL; + if (value == NULL) { + return; + } + dev = (device_t *)value; + crm_mon_debug_3("========== debug_print =========="); + crm_mon_debug_3("key[%s] ", (char *)key); + crm_mon_debug_3("attrName[%s] ", dev->attrName); + crm_mon_debug_3("attrValue[%s] ", dev->attrValue); + crm_mon_debug_3("last_attrValue[%s] ", dev->last_attrValue); + crm_mon_debug_3("program_name[%s] ", dev->prgName); + crm_mon_debug_3("execpath[%s] ", dev->execCmdline); + crm_mon_debug_3("pid[%d] ", dev->pid); + crm_mon_debug_3("shortrcount[%d] ", dev->shortrcount); + + return; +} + +static void +debug_write_file(gpointer key, gpointer value, gpointer userdata) +{ + char xmlBuffer[128]; + char *tmpBuffer = NULL; + char *xmlp = NULL; + int fd; + device_t *dev = NULL; + if (value == NULL) { + return; + } + dev = (device_t *)value; + + memset(xmlBuffer, '\0', sizeof(xmlBuffer)); + g_snprintf(xmlBuffer, sizeof(xmlBuffer), + "", (char *)key); + xmlp = g_strjoin("\n", xmlBuffer, NULL); + memset(xmlBuffer, '\0', sizeof(xmlBuffer)); + g_snprintf(xmlBuffer, sizeof(xmlBuffer), + "", dev->attrName); + tmpBuffer = xmlp; + xmlp = g_strjoin("\n", tmpBuffer, xmlBuffer, NULL); + g_free(tmpBuffer); + memset(xmlBuffer, '\0', sizeof(xmlBuffer)); + g_snprintf(xmlBuffer, sizeof(xmlBuffer), + "", dev->attrValue); + tmpBuffer = xmlp; + xmlp = g_strjoin("\n", tmpBuffer, xmlBuffer, NULL); + g_free(tmpBuffer); + memset(xmlBuffer, '\0', sizeof(xmlBuffer)); + g_snprintf(xmlBuffer, sizeof(xmlBuffer), + "", dev->last_attrValue); + tmpBuffer = xmlp; + xmlp = g_strjoin("\n", tmpBuffer, xmlBuffer, NULL); + g_free(tmpBuffer); + memset(xmlBuffer, '\0', sizeof(xmlBuffer)); + g_snprintf(xmlBuffer, sizeof(xmlBuffer), + "", dev->prgName); + tmpBuffer = xmlp; + xmlp = g_strjoin("\n", tmpBuffer, xmlBuffer, NULL); + g_free(tmpBuffer); + memset(xmlBuffer, '\0', sizeof(xmlBuffer)); + g_snprintf(xmlBuffer, sizeof(xmlBuffer), + "", dev->execCmdline); + tmpBuffer = xmlp; + xmlp = g_strjoin("\n", tmpBuffer, xmlBuffer, NULL); + g_free(tmpBuffer); + memset(xmlBuffer, '\0', sizeof(xmlBuffer)); + g_snprintf(xmlBuffer, sizeof(xmlBuffer), + "", dev->pid); + tmpBuffer = xmlp; + xmlp = g_strjoin("\n", tmpBuffer, xmlBuffer, NULL); + g_free(tmpBuffer); + memset(xmlBuffer, '\0', sizeof(xmlBuffer)); + g_snprintf(xmlBuffer, sizeof(xmlBuffer), + "", dev->shortrcount); + tmpBuffer = xmlp; + xmlp = g_strjoin("\n", tmpBuffer, xmlBuffer, NULL); + g_free(tmpBuffer); + memset(xmlBuffer, '\0', sizeof(xmlBuffer)); + g_snprintf(xmlBuffer, sizeof(xmlBuffer), + ""); + tmpBuffer = xmlp; + xmlp = g_strjoin("\n", tmpBuffer, xmlBuffer, NULL); + g_free(tmpBuffer); + + if ((fd = open("/tmp/hb_vmmonitor.mon", O_WRONLY|O_CREAT|O_APPEND, + S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH)) == -1) { + g_free(xmlp); + return; + } + + if ((write(fd, xmlp, strlen(xmlp))) == -1) { + g_free(xmlp); + close(fd); + return; + } + + g_free(xmlp); + close(fd); + + return; +} +*/ + diff -urN Heartbeat-2-1-STABLE-2.1.4/tools/hb_vmmonitor/hb_vmmonitor.conf.sample Heartbeat-2-1-STABLE-2.1.4.mod/tools/hb_vmmonitor/hb_vmmonitor.conf.sample --- Heartbeat-2-1-STABLE-2.1.4/tools/hb_vmmonitor/hb_vmmonitor.conf.sample 1970-01-01 09:00:00.000000000 +0900 +++ Heartbeat-2-1-STABLE-2.1.4.mod/tools/hb_vmmonitor/hb_vmmonitor.conf.sample 2010-05-14 10:29:56.000000000 +0900 @@ -0,0 +1,15 @@ +# basic setting for pingd. +[default_ping_set] +timeout = 30 +exec_cmdline = /usr/lib64/heartbeat/hb-vmmonitor/pingd -N 127.0.0.1 -m 100 -a default_ping_set + +# basic setting for diskd.(read only check) +[diskcheck_status] +timeout = 90 +exec_cmdline = /usr/lib64/heartbeat/hb-vmmonitor/diskd -N /dev/sdb -P -a diskcheck_status + +# basic setting for diskd.(check is write for internal disk) +[internal_diskcheck_status] +timeout = 90 +exec_cmdline = /usr/lib64/heartbeat/hb-vmmonitor/diskd -w -P -a internal_diskcheck_status + diff -urN Heartbeat-2-1-STABLE-2.1.4/tools/hb_vmmonitor/hb_vmmonitor.spec.in Heartbeat-2-1-STABLE-2.1.4.mod/tools/hb_vmmonitor/hb_vmmonitor.spec.in --- Heartbeat-2-1-STABLE-2.1.4/tools/hb_vmmonitor/hb_vmmonitor.spec.in 1970-01-01 09:00:00.000000000 +0900 +++ Heartbeat-2-1-STABLE-2.1.4.mod/tools/hb_vmmonitor/hb_vmmonitor.spec.in 2010-05-14 12:14:33.000000000 +0900 @@ -0,0 +1,98 @@ +######################################## +# Derived definitions +######################################## +%define name hb-vmmonitor +%define version 1.00 +%define release 1.el5 +%define prefix @libdir@/@PACKAGE@/ +%define CMDNAME hb_vmmonitor +%define CONFSAMPLE %{CMDNAME}.conf.sample +%define ORGVERSION STABLE-2.1.4 +%define ORGARCH Heartbeat-2-1-%{ORGVERSION} +# +# +# +Summary: Heartbeat Virtual Machine Monitor +Name: %{name} +Version: %{version} +Release: %{release} +Group: Applications +Source: %{ORGVERSION}.tar.gz +Patch: %{name}-%{version}-%{release}.patch +License: GPL/LGPL +Vendor: NIPPON TELEGRAPH AND TELEPHONE CORPORATION +BuildRoot: %{_tmppath}/%{name}-%{version} +BuildRequires: autoconf, automake libtool +Requires: heartbeat = 2.1.4-1 + +######################################## +%description +######################################## +Virtual Machine Monitor for Heartbeat. + +######################################## +%prep +######################################## +rm -rf $RPM_BUILD_ROOT +%setup -q -n %{ORGARCH} +%patch -p1 +pushd $RPM_BUILD_DIR/%{ORGARCH} +./ConfigureMe bootstrap +popd + +######################################## +%build +######################################## +pushd $RPM_BUILD_DIR/%{ORGARCH}/replace +make DESTDIR=$RPM_BUILD_ROOT +popd +pushd $RPM_BUILD_DIR/%{ORGARCH}/libltdl +make DESTDIR=$RPM_BUILD_ROOT +popd +pushd $RPM_BUILD_DIR/%{ORGARCH}/lib +make DESTDIR=$RPM_BUILD_ROOT +popd +pushd $RPM_BUILD_DIR/%{ORGARCH}/tools/%{CMDNAME} +make DESTDIR=$RPM_BUILD_ROOT +popd + +######################################## +%install +######################################## +pushd $RPM_BUILD_DIR/%{ORGARCH}/tools/%{CMDNAME} +make DESTDIR=$RPM_BUILD_ROOT install +popd + +######################################## +%clean +######################################## +if + [ -n "${RPM_BUILD_ROOT}" -a "${RPM_BUILD_ROOT}" != "/" ] +then + rm -rf $RPM_BUILD_ROOT +fi +rm -rf $RPM_BUILD_DIR/%{ORGARCH} + +######################################## +%post +######################################## +true +######################################## +%preun +######################################## +true +######################################## +%postun +######################################## +true +######################################## +%files +######################################## +%defattr(-,root,root) +%dir %{prefix}/%{name} +%{prefix}/%{name}/%{CMDNAME} +%{prefix}/%{name}/%{CONFSAMPLE} +%{prefix}/%{name}/hb_vmmon_client +%{prefix}/%{name}/pingd +%{prefix}/%{name}/diskd + diff -urN Heartbeat-2-1-STABLE-2.1.4/tools/hb_vmmonitor/hb_vmmonitor.spec.in.orig Heartbeat-2-1-STABLE-2.1.4.mod/tools/hb_vmmonitor/hb_vmmonitor.spec.in.orig --- Heartbeat-2-1-STABLE-2.1.4/tools/hb_vmmonitor/hb_vmmonitor.spec.in.orig 1970-01-01 09:00:00.000000000 +0900 +++ Heartbeat-2-1-STABLE-2.1.4.mod/tools/hb_vmmonitor/hb_vmmonitor.spec.in.orig 2010-05-14 10:55:54.000000000 +0900 @@ -0,0 +1,97 @@ +######################################## +# Derived definitions +######################################## +%define name hb-vmmonitor +%define version 1.00 +%define release 1.el5 +%define prefix @libdir@/@PACKAGE@/ +%define CMDNAME hb_vmmonitor +%define CONFSAMPLE %{CMDNAME}.conf.sample +%define ORGARCH heartbeat-2.1.4-1 +# +# +# +Summary: Heartbeat Virtual Machine Monitor +Name: %{name} +Version: %{version} +Release: %{release} +Group: Applications +Source: %{ORGARCH}.tar.gz +Patch: %{name}-%{version}-%{release}.patch +License: GPL/LGPL +Vendor: NIPPON TELEGRAPH AND TELEPHONE CORPORATION +BuildRoot: %{_tmppath}/%{name}-%{version} +BuildRequires: autoconf, automake libtool +Requires: heartbeat = 2.1.4-1 + +######################################## +%description +######################################## +Virtual Machine Monitor for Heartbeat. + +######################################## +%prep +######################################## +rm -rf $RPM_BUILD_ROOT +%setup -q -n %{ORGARCH} +%patch -p1 +pushd $RPM_BUILD_DIR/%{ORGARCH} +./ConfigureMe bootstrap +popd + +######################################## +%build +######################################## +pushd $RPM_BUILD_DIR/%{ORGARCH}/replace +make DESTDIR=$RPM_BUILD_ROOT +popd +pushd $RPM_BUILD_DIR/%{ORGARCH}/libltdl +make DESTDIR=$RPM_BUILD_ROOT +popd +pushd $RPM_BUILD_DIR/%{ORGARCH}/lib +make DESTDIR=$RPM_BUILD_ROOT +popd +pushd $RPM_BUILD_DIR/%{ORGARCH}/tools/%{CMDNAME} +make DESTDIR=$RPM_BUILD_ROOT +popd + +######################################## +%install +######################################## +pushd $RPM_BUILD_DIR/%{ORGARCH}/tools/%{CMDNAME} +make DESTDIR=$RPM_BUILD_ROOT install +popd + +######################################## +%clean +######################################## +if + [ -n "${RPM_BUILD_ROOT}" -a "${RPM_BUILD_ROOT}" != "/" ] +then + rm -rf $RPM_BUILD_ROOT +fi +rm -rf $RPM_BUILD_DIR/%{ORGARCH} + +######################################## +%post +######################################## +true +######################################## +%preun +######################################## +true +######################################## +%postun +######################################## +true +######################################## +%files +######################################## +%defattr(-,root,root) +%dir %{prefix}/%{name} +%{prefix}/%{name}/%{CMDNAME} +%{prefix}/%{name}/%{CONFSAMPLE} +%{prefix}/%{name}/hb_vmmon_client +%{prefix}/%{name}/pingd +%{prefix}/%{name}/diskd + diff -urN Heartbeat-2-1-STABLE-2.1.4/tools/hb_vmmonitor/pingd.c Heartbeat-2-1-STABLE-2.1.4.mod/tools/hb_vmmonitor/pingd.c --- Heartbeat-2-1-STABLE-2.1.4/tools/hb_vmmonitor/pingd.c 1970-01-01 09:00:00.000000000 +0900 +++ Heartbeat-2-1-STABLE-2.1.4.mod/tools/hb_vmmonitor/pingd.c 2010-05-14 10:29:56.000000000 +0900 @@ -0,0 +1,1505 @@ + +/* + * Copyright (C) 2004 Andrew Beekhof + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This software is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include + +#include + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + + +#ifdef HAVE_SYS_SOCKET_H +# include +#endif + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +#ifdef ON_LINUX +#include +#include +# ifndef ICMP_FILTER +# define ICMP_FILTER 1 +struct icmp_filter { + uint32_t data; +}; +# endif +#endif + +#ifdef HAVE_GETOPT_H +# include +#endif + + + + +#ifdef HAVE_GETOPT_H +# include +#endif + +#if SUPPORT_HEARTBEAT +# include +ll_cluster_t *pingd_cluster = NULL; +void do_node_walk(ll_cluster_t *hb_cluster); +#endif + +/* GMainLoop *mainloop = NULL; */ +/* +#define OPTARGS "V?p:a:d:s:S:h:Dm:N:Ui:t:n:" +*/ +#define OPTARGS "V?p:a:h:Dm:N:Ui:t:n:" +#define do_crm_log_always(level, fmt, args...) cl_log(level, "%s: " fmt, __PRETTY_FUNCTION__ , ##args) + +#define crm_perror(level, fmt, args...) do { \ + const char *err = strerror(errno); \ + do_crm_log_always(level, fmt ": %s (%d)", ##args, err, errno); \ + } while(0) + +GListPtr ping_list = NULL; +GMainLoop* mainloop = NULL; +GHashTable *ping_nodes = NULL; +const char *pingd_attr = "pingd"; +gboolean do_filter = FALSE; +gboolean need_shutdown = FALSE; +gboolean stand_alone = FALSE; +gboolean do_updates = TRUE; + +const char *attr_set = NULL; +const char *attr_section = NULL; +int attr_dampen = 5000; /* 5s */ +int attr_multiplier = 1; +int pings_per_host = 2; +int ping_timeout = 2; +int re_ping_interval = 1000; /* 1s */ + +int ident; /* our pid */ + +unsigned char cmsgbuf[4096]; +int cmsglen = 0; + +typedef struct ping_node_s { + int fd; /* ping socket */ + uint16_t iseq; /* sequence number */ + gboolean type; + gboolean extra_filters; + union { + struct sockaddr raw; + struct sockaddr_in v4; /* ipv4 ping addr */ + struct sockaddr_in6 v6; /* ipv6 ping addr */ + } addr; + char dest[256]; + char *host; +} ping_node; + +void pingd_nstatus_callback( + const char *node, const char *status, void *private_data); +void pingd_lstatus_callback( + const char *node, const char *link, const char *status, + void *private_data); +void send_update(int active); +int process_icmp6_error(ping_node *node, struct sockaddr_in6 *whereto); +int process_icmp4_error(ping_node *node, struct sockaddr_in *whereto); + +/* + * in_cksum -- + * Checksum routine for Internet Protocol family headers (C Version) + * This function taken from Mike Muuss' ping program. + */ +static int +in_cksum (u_short *addr, size_t len) +{ + size_t nleft = len; + u_short * w = addr; + int sum = 0; + u_short answer = 0; + + /* + * The IP checksum algorithm is simple: using a 32 bit accumulator (sum) + * add sequential 16 bit words to it, and at the end, folding back all + * the carry bits from the top 16 bits into the lower 16 bits. + */ + while (nleft > 1) { + sum += *w++; + nleft -= 2; + } + + /* Mop up an odd byte, if necessary */ + if (nleft == 1) { + sum += *(u_char*)w; + } + + /* Add back carry bits from top 16 bits to low 16 bits */ + + sum = (sum >> 16) + (sum & 0xffff); /* add hi 16 to low 16 */ + sum += (sum >> 16); /* add carry */ + answer = ~sum; /* truncate to 16 bits */ + + return answer; +} + +static const char *ping_desc(gboolean family, uint8_t type, uint8_t code) +{ + if(family == AF_INET6) { + switch(type) { + case ICMP6_DST_UNREACH: + switch(code) { + case ICMP6_DST_UNREACH_NOROUTE: + return "No Route to Destination"; + case ICMP6_DST_UNREACH_ADMIN: + return "Destination Administratively Unreachable"; + case ICMP6_DST_UNREACH_BEYONDSCOPE: + return "Destination Unreachable Beyond Scope"; + case ICMP6_DST_UNREACH_ADDR: + return "Destination Address Unreachable"; + case ICMP6_DST_UNREACH_NOPORT: + return "Destination Port Unreachable"; + default: + crm_err("Unreachable: Unkown subtype: %d", code); + return "Unreachable: Unkown Subtype"; + } + case ICMP6_PACKET_TOO_BIG: + return "Packet too big"; + case ICMP6_TIME_EXCEEDED: + switch(code) { + case ICMP6_TIME_EXCEED_TRANSIT: + return "Time to live exceeded"; + case ICMP6_TIME_EXCEED_REASSEMBLY: + return "Frag reassembly time exceeded"; + default: + crm_err("Timeout: Unkown subtype: %d", code); + return "Timeout: Unkown Subtype"; + } + case ICMP6_PARAM_PROB: + switch(code) { + case ICMP6_PARAMPROB_HEADER: + return "Parameter problem: Erroneous Header"; + case ICMP6_PARAMPROB_NEXTHEADER: + return "Parameter problem: Unknown Nextheader"; + case ICMP6_PARAMPROB_OPTION: + return "Parameter problem: Unrecognized Option"; + default: + crm_err("Invalid header: Unkown subtype: %d", code); + return "Invalid header: Unkown Subtype"; + } + case ICMP6_ECHO_REQUEST: + return "Echo Request"; + case ICMP6_ECHO_REPLY: + return "Echo Reply"; + case MLD_LISTENER_QUERY: + return "Multicast Listener Query"; + case MLD_LISTENER_REPORT: + return "Multicast Listener Report"; + case MLD_LISTENER_REDUCTION: + return "Multicast Listener Done"; + case ND_ROUTER_SOLICIT: + return "Router Solicitation"; + case ND_ROUTER_ADVERT: + return "Router Advertisement"; + case ND_NEIGHBOR_SOLICIT: + return "Neighbor Solicitation"; + case ND_NEIGHBOR_ADVERT: + return "Neighbor Advertisement"; + case ND_REDIRECT: + return "Redirect"; + case ICMP6_ROUTER_RENUMBERING: + return "Router renumbering"; + default: + crm_err("Unknown type: %d", type); + return "Unknown type"; + } + } else { + switch(type) { + case ICMP_ECHOREPLY: + return "Echo Reply"; + case ICMP_ECHO: + return "Echo Request"; + case ICMP_PARAMPROB: + return "Bad Parameter"; + case ICMP_SOURCEQUENCH: + return "Packet lost, slow down"; + case ICMP_TSTAMP: + return "Timestamp Request"; + case ICMP_TSTAMPREPLY: + return "Timestamp Reply"; + case ICMP_IREQ: + return "Information Request"; + case ICMP_IREQREPLY: + return "Information Reply"; + + case ICMP_UNREACH: + switch(code) { + case ICMP_UNREACH_NET: + return "Unreachable Network"; + case ICMP_UNREACH_HOST: + return "Unreachable Host"; + case ICMP_UNREACH_PROTOCOL: + return "Unreachable Protocol"; + case ICMP_UNREACH_PORT: + return "Unreachable Port"; + case ICMP_UNREACH_NEEDFRAG: + return "Unreachable: Fragmentation needed"; + case ICMP_UNREACH_SRCFAIL: + return "Unreachable Source Route"; + case ICMP_UNREACH_NET_UNKNOWN: + return "Unknown Network"; + case ICMP_UNREACH_HOST_UNKNOWN: + return "Unknown Host"; + case ICMP_UNREACH_ISOLATED: + return "Unreachable: Isolated"; + case ICMP_UNREACH_NET_PROHIB: + return "Prohibited network"; + case ICMP_UNREACH_HOST_PROHIB: + return "Prohibited host"; + case ICMP_UNREACH_FILTER_PROHIB: + return "Unreachable: Prohibited filter"; + case ICMP_UNREACH_TOSNET: + return "Unreachable: Type of Service and Network"; + case ICMP_UNREACH_TOSHOST: + return "Unreachable: Type of Service and Host"; + case ICMP_UNREACH_HOST_PRECEDENCE: + return "Unreachable: Prec vio"; + case ICMP_UNREACH_PRECEDENCE_CUTOFF: + return "Unreachable: Prec cutoff"; + default: + crm_err("Unreachable: Unknown subtype: %d", code); + return "Unreachable: Unknown Subtype"; + } + break; + + case ICMP_REDIRECT: + switch(code) { + case ICMP_REDIRECT_NET: + return "Redirect: Network"; + case ICMP_REDIRECT_HOST: + return "Redirect: Host"; + case ICMP_REDIRECT_TOSNET: + return "Redirect: Type of Service and Network"; + case ICMP_REDIRECT_TOSHOST: + return "Redirect: Type of Service and Host"; + default: + crm_err("Redirect: Unknown subtype: %d", code); + return "Redirect: Unknown Subtype"; + } + + case ICMP_TIMXCEED: + switch(code) { + case ICMP_TIMXCEED_INTRANS: + return "Timeout: TTL"; + case ICMP_TIMXCEED_REASS: + return "Timeout: Fragmentation reassembly"; + default: + crm_err("Timeout: Unkown subtype: %d", code); + return "Timeout: Unkown Subtype"; + } + break; + + default: + crm_err("Unknown type: %d", type); + return "Unknown type"; + } + } +} + +#ifdef ON_LINUX +# define MAX_HOST 1024 +int process_icmp6_error(ping_node *node, struct sockaddr_in6 *whereto) +{ + int rc = 0; + char buf[512]; + struct iovec iov; + struct msghdr msg; + struct icmp6_hdr icmph; + struct sockaddr_in6 target; + struct cmsghdr *cmsg = NULL; + struct sock_extended_err *s_err = NULL; + + iov.iov_base = &icmph; + iov.iov_len = sizeof(icmph); + msg.msg_name = (void*)⌖ + msg.msg_namelen = sizeof(target); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_flags = 0; + msg.msg_control = buf; + msg.msg_controllen = sizeof(buf); + + rc = recvmsg(node->fd, &msg, MSG_ERRQUEUE|MSG_DONTWAIT); + if (rc < 0 || rc < sizeof(icmph)) { + crm_perror(LOG_DEBUG, "No error message: %d", rc); + return 0; + } + + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level == SOL_IPV6 && cmsg->cmsg_type == IPV6_RECVERR) { + s_err = (struct sock_extended_err *)CMSG_DATA(cmsg); + } + } + + CRM_ASSERT(s_err != NULL); + + if (s_err->ee_origin == SO_EE_ORIGIN_LOCAL) { + if (s_err->ee_errno == EMSGSIZE) { + crm_info("local error: Message too long, mtu=%u", s_err->ee_info); + } else { + crm_info("local error: %s", strerror(s_err->ee_errno)); + } + return 0; + + } else if (s_err->ee_origin == SO_EE_ORIGIN_ICMP6) { + struct sockaddr_in6 *sin = (struct sockaddr_in6*)(s_err+1); + const char *ping_result = ping_desc(node->type, s_err->ee_type, s_err->ee_code); + static char target_s[64], whereto_s[64], ping_host_s[64]; + inet_ntop(AF_INET6, (struct in6_addr *)&(target.sin6_addr), target_s, sizeof(target_s)); + inet_ntop(AF_INET6, (struct in6_addr *)&(whereto->sin6_addr), whereto_s, sizeof(whereto_s)); + + if (ntohs(icmph.icmp6_id) != ident) { + /* Result was not for us */ + crm_debug("Not our error (ident): %d %d", ntohs(icmph.icmp6_id), ident); + return -1; + + } else if (memcmp(&target.sin6_addr, &whereto->sin6_addr, 16)) { + /* Result was not for us */ + crm_debug("Not our error (addr): %s %s", target_s, whereto_s); + return -1; + + } else if (icmph.icmp6_type != ICMP6_ECHO_REQUEST) { + /* Not an error */ + crm_info("Not an error: %d", icmph.icmp6_type); + return -1; + } + + inet_ntop(AF_INET6, (struct in6_addr *)&(sin->sin6_addr), ping_host_s, sizeof(ping_host_s)); + crm_debug("From %s icmp_seq=%u %s", ping_host_s, ntohs(icmph.icmp6_seq), ping_result); + + } else { + crm_debug("else: %d", s_err->ee_origin); + } + + return 0; +} + +int process_icmp4_error(ping_node *node, struct sockaddr_in *whereto) +{ + int rc = 0; + char buf[512]; + struct iovec iov; + struct msghdr msg; + struct icmphdr icmph; + struct sockaddr_in target; + struct cmsghdr *cmsg = NULL; + struct sock_extended_err *s_err = NULL; + + iov.iov_base = &icmph; + iov.iov_len = sizeof(icmph); + msg.msg_name = (void*)⌖ + msg.msg_namelen = sizeof(target); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_flags = 0; + msg.msg_control = buf; + msg.msg_controllen = sizeof(buf); + + rc = recvmsg(node->fd, &msg, MSG_ERRQUEUE|MSG_DONTWAIT); + if (rc < 0 || rc < sizeof(icmph)) { + crm_perror(LOG_DEBUG, "No error message: %d", rc); + return 0; + } + + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level == SOL_IP && cmsg->cmsg_type == IP_RECVERR) { + s_err = (struct sock_extended_err *)CMSG_DATA(cmsg); + } + } + + CRM_ASSERT(s_err != NULL); + + if (s_err->ee_origin == SO_EE_ORIGIN_LOCAL) { + if (s_err->ee_errno == EMSGSIZE) { + crm_info("local error: Message too long, mtu=%u", s_err->ee_info); + } else { + crm_info("local error: %s", strerror(s_err->ee_errno)); + } + return 0; + + } else if (s_err->ee_origin == SO_EE_ORIGIN_ICMP) { + char ping_host[MAX_HOST]; + struct sockaddr_in *sin = (struct sockaddr_in*)(s_err+1); + const char *ping_result = ping_desc(node->type, s_err->ee_type, s_err->ee_code); + char *target_s = inet_ntoa(*(struct in_addr *)&(target.sin_addr.s_addr)); + char *whereto_s = inet_ntoa(*(struct in_addr *)&(whereto->sin_addr.s_addr)); + + if (ntohs(icmph.un.echo.id) != ident) { + /* Result was not for us */ + crm_debug("Not our error (ident): %d %d", ntohs(icmph.un.echo.id), ident); + return -1; + + } else if (safe_str_neq(target_s, whereto_s)) { + /* Result was not for us */ + crm_debug("Not our error (addr): %s %s", target_s, whereto_s); + return -1; + + } else if (icmph.type != ICMP_ECHO) { + /* Not an error */ + crm_info("Not an error: %d", icmph.type); + return -1; + } + + /* snprintf(ping_host, MAX_HOST, "%s", inet_ntoa(*(struct in_addr *)&(sin->sin_addr.s_addr))); */ + snprintf(ping_host, MAX_HOST, "%s", inet_ntoa(sin->sin_addr)); + + if (node->extra_filters == FALSE) { + /* Now that we got some sort of reply, add extra filters to + * ensure we keep getting the _right_ replies for dead hosts + */ + struct icmp_filter filt; + crm_debug("Installing additional ICMP filters"); + node->extra_filters = TRUE; /* only try once */ + + filt.data = ~((1<fd, SOL_RAW, ICMP_FILTER, (char*)&filt, sizeof(filt)) == -1) { + crm_perror(LOG_WARNING, "setsockopt failed: Cannot install ICMP filters for %s", ping_host); + } + } + + crm_debug("From %s icmp_seq=%u %s", ping_host, ntohs(icmph.un.echo.sequence), ping_result); + + } else { + crm_debug("else: %d", s_err->ee_origin); + } + + return 0; +} +#else +int process_icmp6_error(ping_node *node, struct sockaddr_in6 *whereto) +{ + /* dummy function */ + return 0; +} + +int process_icmp4_error(ping_node *node, struct sockaddr_in *whereto) +{ + /* dummy function */ + return 0; +} +#endif + +static ping_node *ping_new(const char *host) +{ + ping_node *node; + + crm_malloc0(node, sizeof(ping_node)); + + if(strstr(host, ":")) { + node->type = AF_INET6; + } else { + node->type = AF_INET; + } + + node->host = crm_strdup(host); + + return node; +} + +static gboolean ping_open(ping_node *node) +{ + int ret_ga = 0; + char *hostname = NULL; + struct addrinfo *res = NULL; + struct addrinfo hints; + char *addr = NULL; + char *cp = NULL; + + /* getaddrinfo */ + bzero(&hints, sizeof(struct addrinfo)); + hints.ai_flags = AI_CANONNAME; + hints.ai_family = node->type; + hints.ai_socktype = SOCK_RAW; + + if(node->type == AF_INET6) { + hints.ai_protocol = IPPROTO_ICMPV6; + } else { + hints.ai_protocol = IPPROTO_ICMP; + } + + addr = crm_strdup(node->host); + if ((cp = strchr(addr, '%'))) { + *cp = 0; + } + crm_debug("node->host[%s], addr[%s]", node->host, addr); + ret_ga = getaddrinfo(addr, NULL, &hints, &res); + crm_free(addr); + if (ret_ga) { + crm_warn("getaddrinfo: %s", gai_strerror(ret_ga)); + goto bail; + } + + if (res->ai_canonname) { + hostname = res->ai_canonname; + } else { + hostname = node->host; + } + + crm_debug_2("Got address %s for %s", hostname, node->host); + + if(!res->ai_addr) { + crm_warn("getaddrinfo failed: no address"); + goto bail; + } + + memcpy(&(node->addr.raw), res->ai_addr, res->ai_addrlen); + node->fd = socket(hints.ai_family, hints.ai_socktype, hints.ai_protocol); + /* node->fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol); */ + + if(node->fd < 0) { + crm_perror(LOG_WARNING, "Can't open socket to %s", hostname); + goto bail; + } + + if(node->type == AF_INET6) { + int sockopt; + + inet_ntop(node->type, &node->addr.v6.sin6_addr, node->dest, sizeof(node->dest)); + + /* set recv buf for broadcast pings */ + sockopt = 48 * 1024; + setsockopt(node->fd, SOL_SOCKET, SO_RCVBUF, (char *) &sockopt, sizeof(sockopt)); + + } else { + inet_ntop(node->type, &node->addr.v4.sin_addr, node->dest, sizeof(node->dest)); + } + + if(ping_timeout > 0) { + struct timeval timeout_opt; + + timeout_opt.tv_sec = ping_timeout; + timeout_opt.tv_usec = 0; + + setsockopt(node->fd, SOL_SOCKET, SO_RCVTIMEO, (char *) &timeout_opt, sizeof(timeout_opt)); + } + +#ifdef ON_LINUX + { + int dummy = 1; + + memset(&cmsgbuf, 0, sizeof(cmsgbuf)); + cmsglen = 0; + + if(node->type == AF_INET6) { + struct icmp6_filter filt; + + ICMP6_FILTER_SETBLOCKALL(&filt); + ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filt); + + if (setsockopt(node->fd, IPPROTO_ICMPV6, ICMP6_FILTER, (char*)&filt, sizeof(filt)) == -1) { + crm_perror(LOG_WARNING, "setsockopt failed: Cannot install ICMP6 filters for %s", node->dest); + } + setsockopt(node->fd, SOL_IPV6, IPV6_RECVERR, (char *)&dummy, sizeof(dummy)); + + if ((cp = strchr(node->host, '%'))) { + struct ifreq ifr; + struct cmsghdr *cmsg; + struct in6_pktinfo *ipi; + + memset(&ifr, 0, sizeof(ifr)); + cp++; + crm_debug("set interface: [%s]", cp); + strncpy(ifr.ifr_name, cp, IFNAMSIZ-1); + + if (ioctl(node->fd, SIOCGIFINDEX, &ifr) >= 0) { + cmsg = (struct cmsghdr*)cmsgbuf; + cmsglen = CMSG_SPACE(sizeof(*ipi)); + cmsg->cmsg_len = CMSG_LEN(sizeof(*ipi)); + cmsg->cmsg_level = SOL_IPV6; + cmsg->cmsg_type = IPV6_PKTINFO; + + ipi = (struct in6_pktinfo*)CMSG_DATA(cmsg); + memset(ipi, 0, sizeof(*ipi)); + ipi->ipi6_ifindex = ifr.ifr_ifindex; + } else { + crm_warn("unknown interface %s specified", cp); + } + } + } else { + struct icmp_filter filt; + filt.data = ~((1<fd, SOL_RAW, ICMP_FILTER, (char*)&filt, sizeof(filt)) == -1) { + crm_perror(LOG_WARNING, "setsockopt failed: Cannot install ICMP filters for %s", node->dest); + } + setsockopt(node->fd, SOL_IP, IP_RECVERR, (char *)&dummy, sizeof(dummy)); + + if ((cp = strchr(node->host, '%'))) { + struct ifreq ifr; + struct cmsghdr *cmsg; + struct in_pktinfo *ipi; + + memset(&ifr, 0, sizeof(ifr)); + cp++; + crm_debug("set interface: [%s]", cp); + strncpy(ifr.ifr_name, cp, IFNAMSIZ-1); + + if (ioctl(node->fd, SIOCGIFINDEX, &ifr) >= 0) { + cmsg = (struct cmsghdr*)cmsgbuf; + cmsglen = CMSG_SPACE(sizeof(*ipi)); + cmsg->cmsg_len = CMSG_LEN(sizeof(*ipi)); + cmsg->cmsg_level = SOL_IP; + cmsg->cmsg_type = IP_PKTINFO; + + ipi = (struct in_pktinfo*)CMSG_DATA(cmsg); + memset(ipi, 0, sizeof(*ipi)); + ipi->ipi_ifindex = ifr.ifr_ifindex; + } else { + crm_warn("unknown interface %s specified", cp); + } + } + } + } +#endif + + crm_debug_2("Opened connection to %s", node->dest); + freeaddrinfo(res); + return TRUE; + + bail: + if(res) { + freeaddrinfo(res); + } + return FALSE; +} + +static gboolean ping_close(ping_node *node) +{ + int tmp_fd = node->fd; + node->fd = -1; + + if (tmp_fd >= 0) { + if(close(tmp_fd) < 0) { + crm_perror(LOG_ERR,"Could not close ping socket"); + } else { + tmp_fd = -1; + crm_debug_2("Closed connection to %s", node->dest); + } + } + return (tmp_fd == -1); +} + +#define MAXPACKETLEN 131072 +#define ICMP6ECHOLEN 8 /* icmp echo header len excluding time */ +#define ICMP6ECHOTMLEN 20 +#define DEFDATALEN ICMP6ECHOTMLEN +#define EXTRA 256 /* for AH and various other headers. weird. */ +#define IP6LEN 40 + +static int +dump_v6_echo(ping_node *node, u_char *buf, int bytes, struct msghdr *hdr) +{ + int rc = -1; /* Try again */ + int fromlen; + char from_host[1024]; + + struct icmp6_hdr *icp; + struct sockaddr *from; + + if (!hdr || !hdr->msg_name || hdr->msg_namelen != sizeof(struct sockaddr_in6) + || ((struct sockaddr *)hdr->msg_name)->sa_family != AF_INET6) { + crm_warn("Invalid echo peer"); + return rc; + } + + fromlen = hdr->msg_namelen; + from = (struct sockaddr *)hdr->msg_name; + getnameinfo(from, fromlen, from_host, sizeof(from_host), NULL, 0, NI_NUMERICHOST | NI_NUMERICSERV); + + if (bytes < (int)sizeof(struct icmp6_hdr)) { + crm_warn("Invalid echo packet (too short: %d bytes) from %s", bytes, from_host); + return rc; + } + icp = (struct icmp6_hdr *)buf; + + if (icp->icmp6_type == ICMP6_ECHO_REPLY) { + if (ident == ntohs(icp->icmp6_id) + && node->iseq == ntohs(icp->icmp6_seq)) { + rc = 1; /* Alive */ + } + + } else if(icp->icmp6_type != ICMP6_ECHO_REQUEST) { + rc = process_icmp6_error(node, (struct sockaddr_in6*)&(node->addr)); + } + + do_crm_log(LOG_DEBUG_2, + "Echo from %s (exp=%d, seq=%d, id=%d, dest=%s, data=%s): %s", + from_host, node->iseq, ntohs(icp->icmp6_seq), + ntohs(icp->icmp6_id), node->dest, (char*)(buf + ICMP6ECHOLEN), + ping_desc(node->type, icp->icmp6_type, icp->icmp6_code)); + + return rc; +} + +static int +dump_v4_echo(ping_node *node, u_char *buf, int bytes, struct msghdr *hdr) +{ + int rc = -1; /* Try again */ + int iplen, fromlen; + char from_host[1024]; + + struct ip *ip; + struct icmp *icp; + struct sockaddr *from; + + if (hdr == NULL + || !hdr->msg_name + || hdr->msg_namelen != sizeof(struct sockaddr_in) + || ((struct sockaddr *)hdr->msg_name)->sa_family != AF_INET) { + crm_warn("Invalid echo peer"); + return rc; + } + + fromlen = hdr->msg_namelen; + from = (struct sockaddr *)hdr->msg_name; + getnameinfo(from, fromlen, from_host, sizeof(from_host), NULL, 0, NI_NUMERICHOST | NI_NUMERICSERV); + + ip = (struct ip*)buf; + iplen = ip->ip_hl * 4; + + if (bytes < (iplen + sizeof(struct icmp))) { + crm_warn("Invalid echo packet (too short: %d bytes) from %s", bytes, from_host); + return rc; + } + + /* Check the IP header */ + icp = (struct icmp*)(buf + iplen); + + if (icp->icmp_type == ICMP_ECHOREPLY) { + if (ident == ntohs(icp->icmp_id) + && node->iseq == ntohs(icp->icmp_seq)) { + rc = 1; /* Alive */ + } + + } else if(icp->icmp_type != ICMP_ECHO) { + rc = process_icmp4_error(node, (struct sockaddr_in*)from); + } + + /* TODO: Stop logging icmp_id once we're sure everything works */ + do_crm_log(LOG_DEBUG_2, + "Echo from %s (exp=%d, seq=%d, id=%d, dest=%s, data=%s): %s", + from_host, node->iseq, ntohs(icp->icmp_seq), + ntohs(icp->icmp_id), node->dest, icp->icmp_data, + ping_desc(node->type, icp->icmp_type, icp->icmp_code)); + + return rc; +} + +static int +ping_read(ping_node *node, int *lenp) +{ + int bytes; + char fromaddr[128]; + struct msghdr m; + struct cmsghdr *cm; + u_char buf[1024]; + struct iovec iov[2]; + int saved_errno = 0; + + struct timeval recv_start_time; + struct timeval recv_time; + int packlen; + u_char *packet; + packlen = DEFDATALEN + IP6LEN + ICMP6ECHOLEN + EXTRA; + gettimeofday(&recv_start_time, NULL); + + crm_malloc0(packet, packlen); + + retry: + m.msg_name = &fromaddr; + m.msg_namelen = sizeof(fromaddr); + memset(&iov, 0, sizeof(iov)); + iov[0].iov_base = (caddr_t)packet; + iov[0].iov_len = packlen; + m.msg_iov = iov; + m.msg_iovlen = 1; + cm = (struct cmsghdr *)buf; + m.msg_control = (caddr_t)buf; + m.msg_controllen = sizeof(buf); + + + bytes = recvmsg(node->fd, &m, 0); + saved_errno = errno; + crm_debug_2("Got %d bytes", bytes); + + if(bytes < 0) { + crm_perror(LOG_DEBUG, "Read failed"); + if (saved_errno != EAGAIN && saved_errno != EINTR) { + int rc = 0; + if(node->type == AF_INET6) { + rc = process_icmp6_error(node, (struct sockaddr_in6*)&(node->addr)); + } else { + rc = process_icmp4_error(node, (struct sockaddr_in*)&fromaddr); + } + + if(rc < 0) { + crm_info("Retrying..."); + goto retry; + } + } + + } else if (bytes > 0) { + int rc = 0; + if(node->type == AF_INET6) { + rc = dump_v6_echo(node, packet, bytes, &m); + } else { + rc = dump_v4_echo(node, packet, bytes, &m); + } + + gettimeofday(&recv_time, NULL); + if ((recv_start_time.tv_sec + ping_timeout) < recv_time.tv_sec) { + crm_warn("failed to receive for timeout."); + crm_free(packet); + return FALSE; + } + + if(rc < 0) { + crm_debug_2("Retrying..."); + goto retry; + + } else if(rc > 0) { + crm_free(packet); + return TRUE; + } + + } else { + crm_err("Unexpected reply"); + } + + crm_free(packet); + return FALSE; +} + +static int +ping_write(ping_node *node, const char *data, size_t size) +{ + struct iovec iov; + int rc, bytes, namelen; + /* static int ntransmitted = 9; */ + struct msghdr smsghdr; + u_char outpack[MAXPACKETLEN]; + memset(outpack, 0, MAXPACKETLEN); + + node->iseq++; + + if(node->type == AF_INET6) { + struct icmp6_hdr *icp; + namelen = sizeof(struct sockaddr_in6); + bytes = ICMP6ECHOLEN + DEFDATALEN; + + icp = (struct icmp6_hdr *)outpack; + + icp->icmp6_code = 0; + icp->icmp6_cksum = 0; + icp->icmp6_type = ICMP6_ECHO_REQUEST; + icp->icmp6_id = htons(ident); + icp->icmp6_seq = htons(node->iseq); + + /* Sanity check */ + if(ntohs(icp->icmp6_seq) != node->iseq) { + crm_debug("Wrapping at %u", node->iseq); + node->iseq = ntohs(icp->icmp6_seq); + } + + memcpy(&outpack[ICMP6ECHOLEN], "pingd-v6", 8); + + } else { + struct icmp *icp; + namelen = sizeof(struct sockaddr_in); + bytes = sizeof(struct icmp) + 11; + + icp = (struct icmp *)outpack; + + icp->icmp_code = 0; + icp->icmp_cksum = 0; + icp->icmp_type = ICMP_ECHO; + icp->icmp_id = htons(ident); + icp->icmp_seq = htons(node->iseq); + + /* Sanity check */ + if(ntohs(icp->icmp_seq) != node->iseq) { + crm_debug("Wrapping at %u", node->iseq); + node->iseq = ntohs(icp->icmp_seq); + } + + memcpy(icp->icmp_data, "pingd-v4", 8); + icp->icmp_cksum = in_cksum((u_short *)icp, bytes); + } + + memset(&iov, 0, sizeof(struct iovec)); + memset(&smsghdr, 0, sizeof(struct msghdr)); + + smsghdr.msg_name = (caddr_t)&(node->addr); + smsghdr.msg_namelen = namelen; + iov.iov_base = (caddr_t)outpack; + iov.iov_len = bytes; + smsghdr.msg_iov = &iov; + smsghdr.msg_iovlen = 1; + smsghdr.msg_control = cmsgbuf; + smsghdr.msg_controllen = cmsglen; + + rc = sendmsg(node->fd, &smsghdr, 0); + + if (rc < 0 || rc != bytes) { + crm_perror(LOG_WARNING, "Wrote %d of %d chars", rc, bytes); + return FALSE; + } + + crm_debug_2("Sent %d bytes to %s", rc, node->dest); + return TRUE; +} + +static gboolean +pingd_shutdown(int nsig, gpointer unused) +{ + need_shutdown = TRUE; + send_update(0); + + if (mainloop != NULL && g_main_is_running(mainloop)) { + g_main_quit(mainloop); + } else { + exit(0); + } + return FALSE; +} + +static void +usage(const char *cmd, int exit_status) +{ + FILE *stream; + + stream = exit_status ? stderr : stdout; + + fprintf(stream, "usage: %s [-%s]\n", cmd, OPTARGS); + fprintf(stream, "\t--%s (-%c) \t\t\tThis text\n", "help", '?'); + fprintf(stream, "\t--%s (-%c) \t\tRun in daemon mode\n", "daemonize", 'D'); + fprintf(stream, "\t--%s (-%c) \tFile in which to store the process' PID\n" + "\t\t\t\t\t* Default=/tmp/pingd.pid\n", "pid-file", 'p'); + fprintf(stream, "\t--%s (-%c) \tName of the node attribute to set\n" + "\t\t\t\t\t* Default=pingd\n", "attr-name", 'a'); +/* invalid option + fprintf(stream, "\t--%s (-%c) \tName of the set in which to set the attribute\n" + "\t\t\t\t\t* Default=cib-bootstrap-options\n", "attr-set", 's'); + fprintf(stream, "\t--%s (-%c) \tWhich part of the CIB to put the attribute in\n" + "\t\t\t\t\t* Default=status\n", "attr-section", 'S'); + fprintf(stream, "\t--%s (-%c) \t\tHow long to wait for no further changes to occur before updating the CIB with a changed attribute\n", "attr-dampen", 'd'); +*/ + fprintf(stream, "\t--%s (-%c) \tMonitor a subset of the ping nodes listed in ha.cf (can be specified multiple times)\n", "node", 'N'); + fprintf(stream, "\t--%s (-%c) \tFor every connected node, add to the value set in the CIB\n" + "\t\t\t\t\t\t* Default=1\n", "ping-multiplier", 'm'); + + fprintf(stream, "\t--%s (-%c) \t\tHow often, in seconds, to check for node liveliness (default=1)\n", "ping-interval", 'i'); + fprintf(stream, "\t--%s (-%c) \t\tNumber of ping attempts, per host, before declaring it dead (default=2)\n", "ping-attempts", 'n'); + fprintf(stream, "\t--%s (-%c) \t\tHow long, in seconds, to wait before declaring a ping lost (default=2)\n", "ping-timeout ", 't'); + fflush(stream); + + exit(exit_status); +} + +#if SUPPORT_HEARTBEAT +static gboolean +pingd_ha_dispatch(IPC_Channel *channel, gpointer user_data) +{ + gboolean stay_connected = TRUE; + + crm_debug_2("Invoked"); + + while(pingd_cluster != NULL && IPC_ISRCONN(channel)) { + if(pingd_cluster->llc_ops->msgready(pingd_cluster) == 0) { + crm_debug_2("no message ready yet"); + break; + } + /* invoke the callbacks but dont block */ + pingd_cluster->llc_ops->rcvmsg(pingd_cluster, 0); + } + + if (pingd_cluster == NULL || channel->ch_status != IPC_CONNECT) { + if(need_shutdown == FALSE) { + crm_crit("Lost connection to heartbeat service."); + } else { + crm_info("Lost connection to heartbeat service."); + } + stay_connected = FALSE; + } + + return stay_connected; +} + + +static void +pingd_ha_connection_destroy(gpointer user_data) +{ + crm_debug_3("Invoked"); + if(need_shutdown) { + /* we signed out, so this is expected */ + crm_info("Heartbeat disconnection complete"); + return; + } + + crm_crit("Lost connection to heartbeat service!"); +} + +static gboolean +register_with_ha(void) +{ + if(pingd_cluster == NULL) { + pingd_cluster = ll_cluster_new("heartbeat"); + } + if(pingd_cluster == NULL) { + crm_err("Cannot create heartbeat object"); + return FALSE; + } + + crm_debug("Signing in with Heartbeat"); + if (pingd_cluster->llc_ops->signon( + pingd_cluster, crm_system_name) != HA_OK) { + + crm_err("Cannot sign on with heartbeat: %s", + pingd_cluster->llc_ops->errmsg(pingd_cluster)); + crm_err("REASON: %s", pingd_cluster->llc_ops->errmsg(pingd_cluster)); + return FALSE; + } + + do_node_walk(pingd_cluster); + + crm_debug_3("Be informed of Node Status changes"); + if (HA_OK != pingd_cluster->llc_ops->set_nstatus_callback( + pingd_cluster, pingd_nstatus_callback, NULL)) { + + crm_err("Cannot set nstatus callback: %s", + pingd_cluster->llc_ops->errmsg(pingd_cluster)); + crm_err("REASON: %s", pingd_cluster->llc_ops->errmsg(pingd_cluster)); + return FALSE; + } + + if (pingd_cluster->llc_ops->set_ifstatus_callback( + pingd_cluster, pingd_lstatus_callback, NULL) != HA_OK) { + crm_err("Cannot set if status callback: %s", pingd_cluster->llc_ops->errmsg(pingd_cluster)); + return FALSE; + } + + crm_debug_3("Adding channel to mainloop"); + G_main_add_IPC_Channel( + G_PRIORITY_HIGH, pingd_cluster->llc_ops->ipcchan( + pingd_cluster), + FALSE, pingd_ha_dispatch, pingd_cluster, + pingd_ha_connection_destroy); + + return TRUE; +} + +void +do_node_walk(ll_cluster_t *hb_cluster) +{ + const char *ha_node = NULL; + + /* Async get client status information in the cluster */ + crm_debug_2("Invoked"); + crm_debug_3("Requesting an initial dump of CRMD client_status"); + hb_cluster->llc_ops->client_status( + hb_cluster, NULL, CRM_SYSTEM_CRMD, -1); + + crm_info("Requesting the list of configured nodes"); + hb_cluster->llc_ops->init_nodewalk(hb_cluster); + + do { + const char *ha_node_type = NULL; + const char *ha_node_status = NULL; + + ha_node = hb_cluster->llc_ops->nextnode(hb_cluster); + if(ha_node == NULL) { + continue; + } + + ha_node_type = hb_cluster->llc_ops->node_type( + hb_cluster, ha_node); + if(safe_str_neq("ping", ha_node_type)) { + crm_debug("Node %s: skipping '%s'", + ha_node, ha_node_type); + continue; + } + + if(do_filter + && g_hash_table_lookup(ping_nodes, ha_node) == NULL) { + crm_debug("Filtering: %s", ha_node); + continue; + } + + ha_node_status = hb_cluster->llc_ops->node_status( + hb_cluster, ha_node); + + crm_debug("Adding: %s=%s", ha_node, ha_node_status); + g_hash_table_replace(ping_nodes, crm_strdup(ha_node), + crm_strdup(ha_node_status)); + + } while(ha_node != NULL); + + hb_cluster->llc_ops->end_nodewalk(hb_cluster); + crm_debug_2("Complete"); + send_update(-1); +} +#endif + +static gboolean stand_alone_ping(gpointer data) +{ + int num_active = 0; + + crm_debug_2("Checking connectivity"); + slist_iter( + ping, ping_node, ping_list, num, + + if(ping_open(ping)) { + int lpc = 0; + for(;lpc < pings_per_host; lpc++) { + int len = 0; + if(ping_write(ping, "test", 4) == FALSE) { + crm_info("Node %s is unreachable (write)", ping->host); + + } else if(ping_read(ping, &len)) { + crm_debug("Node %s is alive", ping->host); + num_active++; + break; + } else { + crm_info("Node %s is unreachable (read)", ping->host); + } + sleep(1); + } + } + + ping_close(ping); + ); + + send_update(num_active); + + return TRUE; +} + +int +main(int argc, char **argv) +{ + int argerr = 0; + int flag; + const char *pid_file = NULL; + gboolean daemonize = FALSE; + ping_node *p = NULL; + +#ifdef HAVE_GETOPT_H + int option_index = 0; + static struct option long_options[] = { + /* Top-level Options */ + {"verbose", 0, 0, 'V'}, + {"help", 0, 0, '?'}, + {"pid-file", 1, 0, 'p'}, + {"node", 1, 0, 'N'}, + {"ping-host", 1, 0, 'h'}, /* legacy */ + {"attr-name", 1, 0, 'a'}, + {"attr-set", 1, 0, 's'}, + {"daemonize", 0, 0, 'D'}, + {"attr-section", 1, 0, 'S'}, + {"attr-dampen", 1, 0, 'd'}, + {"no-updates", 0, 0, 'U'}, + {"ping-interval", 1, 0, 'i'}, + {"ping-attempts", 1, 0, 'n'}, + {"ping-timeout", 1, 0, 't'}, + {"ping-multiplier", 1, 0, 'm'}, + + /* Legacy */ + {"value-multiplier", 1, 0, 'm'}, + {"interval", 1, 0, 'i'}, + + {0, 0, 0, 0} + }; +#endif + pid_file = "/tmp/pingd.pid"; + + G_main_add_SignalHandler( + G_PRIORITY_HIGH, SIGTERM, pingd_shutdown, NULL, NULL); + + ping_nodes = g_hash_table_new_full( + g_str_hash, g_str_equal, + g_hash_destroy_str, g_hash_destroy_str); + + crm_log_init(basename(argv[0]), LOG_INFO, TRUE, FALSE, argc, argv); + + while (1) { +#ifdef HAVE_GETOPT_H + flag = getopt_long(argc, argv, OPTARGS, + long_options, &option_index); +#else + flag = getopt(argc, argv, OPTARGS); +#endif + if (flag == -1) + break; + + switch(flag) { + case 'V': + cl_log_enable_stderr(TRUE); + alter_debug(DEBUG_INC); + break; + case 'p': + pid_file = optarg; + break; + case 'a': + pingd_attr = optarg; + break; + case 'N': + case 'h': + stand_alone = TRUE; + crm_debug("Adding ping host %s", optarg); + p = ping_new(optarg); + ping_list = g_list_append(ping_list, p); + break; + case 's': + attr_set = crm_strdup(optarg); + break; + case 'm': + attr_multiplier = crm_parse_int(optarg, "1"); + break; + case 'S': + attr_section = crm_strdup(optarg); + break; + case 'd': + attr_dampen = crm_get_msec(optarg); + break; + case 'i': + re_ping_interval = crm_get_msec(optarg); + break; + case 'n': + pings_per_host = crm_atoi(optarg, NULL); + break; + case 't': + ping_timeout = crm_atoi(optarg, NULL); + break; + case 'D': + daemonize = TRUE; + break; + case 'U': + cl_log_enable_stderr(TRUE); + do_updates = FALSE; + break; + case '?': + usage(crm_system_name, LSB_EXIT_GENERIC); + break; + default: + printf("Argument code 0%o (%c) is not (?yet?) supported\n", flag, flag); + crm_err("Argument code 0%o (%c) is not (?yet?) supported\n", flag, flag); + ++argerr; + break; + } + } + + if (optind < argc) { + crm_err("non-option ARGV-elements: "); + printf("non-option ARGV-elements: "); + while (optind < argc) { + crm_err("%s ", argv[optind]); + printf("%s ", argv[optind]); + optind++; + } + printf("\n"); + } + if (argerr) { + usage(crm_system_name, LSB_EXIT_GENERIC); + } + + crm_make_daemon(crm_system_name, daemonize, pid_file); + ident = getpid(); + + if(do_updates == FALSE) { + goto start_ping; + } + +#if SUPPORT_AIS + if(is_openais_cluster()) { + stand_alone = TRUE; + } +#endif + +#if SUPPORT_HEARTBEAT + if(stand_alone == FALSE && register_with_ha() == FALSE) { + crm_err("HA registration failed"); + cl_flush_logs(); + exit(LSB_EXIT_GENERIC); + } +#endif + start_ping: + if(stand_alone && ping_list == NULL) { + crm_err("You must specify a list of hosts to monitor"); + exit(LSB_EXIT_GENERIC); + } + + crm_info("Starting %s", crm_system_name); + mainloop = g_main_new(FALSE); + + if(stand_alone) { + stand_alone_ping(NULL); + g_timeout_add(re_ping_interval, stand_alone_ping, NULL); + } + + g_main_run(mainloop); + + g_hash_table_destroy(ping_nodes); + slist_destroy(ping_node, p, ping_list, + crm_free(p->host); + crm_free(p); + ); + crm_info("Exiting %s", crm_system_name); + return 0; +} + + +static void count_ping_nodes(gpointer key, gpointer value, gpointer user_data) +{ + int *num_active = user_data; + CRM_CHECK(num_active != NULL, return); + + if(need_shutdown) { + return; + } + + if(safe_str_eq(value, "ping")) { + (*num_active)++; + } else if(safe_str_eq(value, "up")) { + (*num_active)++; + } +} + +void +send_update(int num_active) +{ + int lpc = 0; + static IPC_Channel *attrd = NULL; + HA_Message *update = ha_msg_new(4); + ha_msg_add(update, F_TYPE, "hb_vmmonitor"); + ha_msg_add(update, F_ORIG, crm_system_name); + ha_msg_add(update, F_ATTRD_TASK, "update"); + ha_msg_add(update, F_ATTRD_ATTRIBUTE, pingd_attr); + + if(num_active < 0) { + num_active = 0; + g_hash_table_foreach(ping_nodes, count_ping_nodes, &num_active); + } + + ha_msg_add_int(update, F_ATTRD_VALUE, attr_multiplier*num_active); + + if(attr_set != NULL) { + ha_msg_add(update, F_ATTRD_SET, attr_set); + } + if(attr_section != NULL) { + ha_msg_add(update, F_ATTRD_SECTION, attr_section); + } + if(attr_dampen > 0) { + ha_msg_add_int(update, F_ATTRD_DAMPEN, attr_dampen/1000); + } + + if(do_updates && attrd == NULL) { + crm_info("Attempting hb_vmmonitor (re-)registration"); + attrd = init_client_ipc_comms_nodispatch("hb_vmmonitor"); + for(lpc = 0; attrd == NULL && lpc < 10; lpc++) { + sleep(1); + crm_debug("hb_vmmonitor registration attempt: %d", lpc); + attrd = init_client_ipc_comms_nodispatch("hb_vmmonitor"); + } + } + + if(do_updates == FALSE) { + char *buf = NULL; + if ((buf = dump_xml_formatted(update))) { + if ((strlen(buf)>0) && (buf[strlen(buf)-1]=='\n')) { + buf[strlen(buf)-1] = 0; + } + crm_info("%s", buf); + crm_free(buf); + } + + } else if(attrd == NULL) { + crm_err("Could not send update: %s=%d (Not connected)", pingd_attr, attr_multiplier*num_active); + + } else if(send_ipc_message(attrd, update) == FALSE) { + crm_err("Could not send update: %s=%d (Connection failed)", pingd_attr, attr_multiplier*num_active); + attrd = NULL; + } else { + crm_debug("Sent update: %s=%d (%d active ping nodes)", pingd_attr, attr_multiplier*num_active, num_active); + } + crm_msg_del(update); +} + +void +pingd_nstatus_callback( + const char *node, const char * status, void* private_data) +{ + crm_notice("Status update: Ping node %s now has status [%s]", + node, status); + + if(g_hash_table_lookup(ping_nodes, node) != NULL) { + g_hash_table_replace( + ping_nodes, crm_strdup(node), crm_strdup(status)); + send_update(-1); + } +} + +void +pingd_lstatus_callback(const char *node, const char *lnk, const char *status, + void *private) +{ + crm_notice("Status update: Ping node %s now has status [%s]", + node, status); + pingd_nstatus_callback(node, status, private); +} +