[Nagios-checkins] SF.net SVN: nagios:[2044] nagioscore/trunk

ageric at users.sourceforge.net ageric at users.sourceforge.net
Thu Aug 2 00:47:44 UTC 2012


Revision: 2044
          http://nagios.svn.sourceforge.net/nagios/?rev=2044&view=rev
Author:   ageric
Date:     2012-08-02 00:47:44 +0000 (Thu, 02 Aug 2012)
Log Message:
-----------
core: Retain workers between reloads

Respawning them for reloads makes no sense, since it just means we'll
lose a lot of half-completed checks for no good reason.

While we're at it, we fix worker process memory management so they
can safely release each other's memory.

Signed-off-by: Andreas Ericsson <ae at op5.se>

Modified Paths:
--------------
    nagioscore/trunk/base/nagios.c
    nagioscore/trunk/base/workers.c
    nagioscore/trunk/include/workers.h
    nagioscore/trunk/lib/worker.h

Modified: nagioscore/trunk/base/nagios.c
===================================================================
--- nagioscore/trunk/base/nagios.c	2012-08-02 00:47:22 UTC (rev 2043)
+++ nagioscore/trunk/base/nagios.c	2012-08-02 00:47:44 UTC (rev 2044)
@@ -268,6 +268,7 @@
 unsigned long   max_debug_file_size = DEFAULT_MAX_DEBUG_FILE_SIZE;
 
 
+extern iobroker_set *nagios_iobs;
 
 int main(int argc, char **argv, char **env) {
 	int result;
@@ -626,6 +627,8 @@
 	/* else start to monitor things... */
 	else {
 
+		nagios_iobs = iobroker_create();
+
 		/* keep monitoring things until we get a shutdown command */
 		do {
 
@@ -872,6 +875,8 @@
 
 			/* shutdown stuff... */
 			if(sigshutdown == TRUE) {
+				free_worker_memory(WPROC_FORCE);
+				iobroker_destroy(nagios_iobs, IOBROKER_CLOSE_SOCKETS);
 
 				/* make sure lock file has been removed - it may not have been if we received a shutdown command */
 				if(daemon_mode == TRUE)

Modified: nagioscore/trunk/base/workers.c
===================================================================
--- nagioscore/trunk/base/workers.c	2012-08-02 00:47:22 UTC (rev 2043)
+++ nagioscore/trunk/base/workers.c	2012-08-02 00:47:44 UTC (rev 2044)
@@ -27,7 +27,6 @@
 	char *contact_name;
 	char *host_name;
 	char *service_description;
-	struct squeue_event *sq_evt;
 } wproc_object_job;
 
 typedef struct wproc_result {
@@ -50,6 +49,8 @@
 	struct kvvec *response;
 } wproc_result;
 
+extern int nagios_pid;
+
 #define tv2float(tv) ((float)((tv)->tv_sec) + ((float)(tv)->tv_usec) / 1000000.0)
 
 static worker_job *create_job(int type, void *arg, time_t timeout, const char *command)
@@ -139,51 +140,93 @@
 	my_free(job->command);
 
 	wp->jobs[job->id % wp->max_jobs] = NULL;
+	wp->jobs_running--;
+
 	free(job);
 }
 
-static void free_wproc_memory(worker_process *wp)
+static int wproc_is_alive(worker_process *wp)
 {
-	int i = 0, destroyed = 0;
+	if (!wp || !wp->pid)
+		return 0;
+	if (kill(wp->pid, 0) == 0 && iobroker_is_registered(nagios_iobs, wp->sd))
+		return 1;
+	return 0;
+}
 
+int wproc_destroy(worker_process *wp, int flags)
+{
+	int i = 0, destroyed = 0, force = 0, sd, self;
+
 	if (!wp)
-		return;
+		return 0;
 
+	force = !!(flags & WPROC_FORCE);
+
+	self = getpid();
+
+	/* master retains workers through restarts */
+	if (self == nagios_pid && !force)
+		return 0;
+
+	/* free all memory when either forcing or a worker called us */
 	iocache_destroy(wp->ioc);
 	wp->ioc = NULL;
+	if (wp->jobs) {
+		for (i = 0; i < wp->max_jobs; i++) {
+			if (!wp->jobs[i])
+				continue;
 
-	for (i = 0; i < wp->max_jobs; i++) {
-		if (!wp->jobs[i])
-			continue;
+			destroy_job(wp, wp->jobs[i]);
+			/* we can (often) break out early */
+			if (++destroyed >= wp->jobs_running)
+				break;
+		}
 
-		destroy_job(wp, wp->jobs[i]);
-		/* we can (often) break out early */
-		if (++destroyed >= wp->jobs_running)
-			break;
+		/* this triggers a double-free() for some reason */
+		/* free(wp->jobs); */
+		wp->jobs = NULL;
 	}
+	sd = wp->sd;
+	free(wp);
 
-	free(wp->jobs);
+	/* workers must never control other workers, so they return early */
+	if (self != nagios_pid)
+		return 0;
+
+	/* kill(0, SIGKILL) equals suicide, so we avoid it */
+	if (wp->pid) {
+		kill(wp->pid, SIGKILL);
+	}
+
+	iobroker_close(nagios_iobs, sd);
+
+	/* reap our possibly lost children */
+	while (waitpid(-1, &i, WNOHANG) > 0)
+		; /* do nothing */
+
+	return 0;
 }
 
 /*
  * This gets called from both parent and worker process, so
  * we must take care not to blindly shut down everything here
  */
-void free_worker_memory(void)
+void free_worker_memory(int flags)
 {
-	unsigned int i;
+	if (workers) {
+		unsigned int i;
 
-	for (i = 0; i < num_workers; i++) {
-		if (!workers[i])
-			continue;
+		for (i = 0; i < num_workers; i++) {
+			if (!workers[i])
+				continue;
 
-		/* workers die when master socket close()s */
-		iobroker_close(nagios_iobs, workers[i]->sd);
-		free_wproc_memory(workers[i]);
-		my_free(workers[i]);
+			wproc_destroy(workers[i], flags);
+			workers[i] = NULL;
+		}
+
+		free(workers);
 	}
-	iobroker_destroy(nagios_iobs, 0);
-	nagios_iobs = NULL;
 	workers = NULL;
 	num_workers = 0;
 	worker_index = 0;
@@ -222,8 +265,6 @@
 	int result = ERROR;
 	check_result *cr = (check_result *)job->arg;
 
-	cr->output_file = NULL;
-	cr->output_file_fp = NULL;
 	memcpy(&cr->rusage, &wpres->rusage, sizeof(wpres->rusage));
 	cr->start_time.tv_sec = wpres->start.tv_sec;
 	cr->start_time.tv_usec = wpres->start.tv_usec;
@@ -494,20 +535,24 @@
 			break;
 		}
 		destroy_job(wp, job);
-		wp->jobs_running--;
 	}
 
 	return 0;
 }
 
-static int init_iobroker(void)
+int workers_alive(void)
 {
-	if (!nagios_iobs)
-		nagios_iobs = iobroker_create();
+	int i, alive = 0;
 
-	if (nagios_iobs)
+	if (!workers)
 		return 0;
-	return -1;
+
+	for (i = 0; i < num_workers; i++) {
+		if (wproc_is_alive(workers[i]))
+			alive++;
+	}
+
+	return alive;
 }
 
 int init_workers(int desired_workers)
@@ -519,7 +564,8 @@
 		desired_workers = 4;
 	}
 
-	init_iobroker();
+	if (workers_alive() == desired_workers)
+		return 0;
 
 	/* can't shrink the number of workers (yet) */
 	if (desired_workers < num_workers)
@@ -540,20 +586,31 @@
 	}
 
 	workers = wps;
-	for (; num_workers < desired_workers; num_workers++) {
+	for (i = 0; i < desired_workers; i++) {
+		int ret;
 		worker_process *wp;
 
+		if (wps[i])
+			continue;
+
 		wp = spawn_worker(worker_init_func, (void *)get_global_macros());
 		if (!wp) {
 			logit(NSLOG_RUNTIME_WARNING, TRUE, "Failed to spawn worker: %s\n", strerror(errno));
-			free_worker_memory();
+			free_worker_memory(0);
 			return ERROR;
 		}
+		set_socket_options(wp->sd, 256 * 1024);
 
-		wps[num_workers] = wp;
-		iobroker_register(nagios_iobs, wp->sd, wp, handle_worker_result);
+		wps[i] = wp;
+		ret = iobroker_register(nagios_iobs, wp->sd, wp, handle_worker_result);
+		if (ret < 0) {
+			printf("Failed to register worker socket with iobroker %p\n", nagios_iobs);
+			exit(1);
+		}
 	}
+	num_workers = desired_workers;
 
+	logit(NSLOG_INFO_MESSAGE, TRUE, "Workers spawned: %d\n", num_workers);
 	return 0;
 }
 
@@ -573,6 +630,7 @@
 		/* XXX FIXME Fiddle with finding a new, less busy, worker here */
 	}
 	wp->jobs[job->id % wp->max_jobs] = job;
+	job->wp = wp;
 	return wp;
 
 	/* dead code below. for now */

Modified: nagioscore/trunk/include/workers.h
===================================================================
--- nagioscore/trunk/include/workers.h	2012-08-02 00:47:22 UTC (rev 2043)
+++ nagioscore/trunk/include/workers.h	2012-08-02 00:47:44 UTC (rev 2044)
@@ -14,11 +14,14 @@
 #define WPJOB_GLOBAL_HOST_EVTHANDLER 6
 #define WPJOB_HOST_EVTHANDLER 7
 
-extern void free_worker_memory(void);
+#define WPROC_FORCE  (1 << 0)
+
+extern void free_worker_memory(int flags);
 extern int init_workers(int desired_workers);
 extern int wproc_run_check(check_result *cr, char *cmd, nagios_macros *mac);
 extern int wproc_notify(char *cname, char *hname, char *sdesc, char *cmd, nagios_macros *mac);
 extern int wproc_run(int job_type, char *cmd, int timeout, nagios_macros *mac);
 extern int wproc_run_service_job(int jtype, int timeout, service *svc, char *cmd, nagios_macros *mac);
 extern int wproc_run_host_job(int jtype, int timeout, host *hst, char *cmd, nagios_macros *mac);
+extern int wproc_destroy(worker_process *wp, int flags);
 #endif

Modified: nagioscore/trunk/lib/worker.h
===================================================================
--- nagioscore/trunk/lib/worker.h	2012-08-02 00:47:22 UTC (rev 2043)
+++ nagioscore/trunk/lib/worker.h	2012-08-02 00:47:44 UTC (rev 2044)
@@ -30,17 +30,21 @@
 #define PAIR_SEP 0 /**< pair separator for buf2kvvec() and kvvec2buf() */
 #define KV_SEP '=' /**< key/value separator for buf2kvvec() and kvvec2buf() */
 
+struct worker_process;
+
 /** Worker job data */
 typedef struct worker_job {
 	int id;         /**< job id */
 	int type;       /**< internal only */
 	time_t timeout; /**< timeout, in absolute time */
 	char *command;  /**< command string for this job */
+	struct worker_process *wp; /**< worker process running this job */
 	void *arg;      /**< any random argument */
 } worker_job;
 
 /** A worker process as seen from its controller */
 typedef struct worker_process {
+	const char *type; /**< identifying typename of this worker */
 	int sd;    /**< communication socket */
 	pid_t pid; /**< pid */
 	int max_jobs; /**< Max number of jobs we can handle */

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





More information about the Nagios-commits mailing list