• R/O
  • SSH

pm_logconv-hb: Commit

※リポジトリは、https://github.com/linux-ha-japan/pm_logconv-hb-1.0 へ移行しました。

Pacemaker 対応ログメッセージ変換機能。

Heartbeat-2.1.4 用 hb-logconv(*) のPacemaker1.0 + Heartbeat スタック対応版。
(*) http://sourceforge.jp/projects/linux-ha/releases/?package_id=10282


Commit MetaInfo

Revision: 2d98f677a5a2ca5b17b934d3a60418dff44808b8 (tree)
Time: 2010-10-06 10:47:39
Author: Yoshihiko SATO
Committer: Yoshihiko SATO

Log Message

Initial commit for pm_logconv - Pacemaker and Heartbeat log convert tool

Change Summary

Incremental Difference

diff -r 000000000000 -r 2d98f677a5a2 Makefile.am
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Makefile.am Wed Oct 06 10:47:39 2010 +0900
@@ -0,0 +1,34 @@
1+logconv_NAME = pm_logconv
2+logconv_SCRIPTS = $(logconv_NAME).py
3+logconv_CONFIG = $(logconv_NAME).conf
4+
5+MAINTAINERCLEANFILES = Makefile.in
6+logconvdir = @HA_NOARCHDATAHBDIR@/$(logconv_NAME)
7+logconvcfdir = @CONFIG_DIR@
8+
9+SHAREDIR = $(logconvdir)
10+SPEC = $(logconv_NAME).spec
11+TARFILE = $(PACKAGE_NAME)-$(VERSION).tar.gz
12+EXTRA_DIST = $(logconv_SCRIPTS) $(logconv_CONFIG) $(SPEC)
13+
14+install-data-hook:
15+ @$(NORMAL_INSTALL)
16+ test -z "$(logconvcfdir)" || $(mkdir_p) "$(DESTDIR)$(logconvcfdir)"
17+ $(INSTALL_DATA) "$(logconv_CONFIG)" "$(DESTDIR)$(logconvcfdir)"
18+
19+$(TARFILE):
20+ $(MAKE) dist
21+
22+RPM_ROOT = $(shell pwd)
23+RPMBUILDOPTS = --define "_sourcedir $(RPM_ROOT)" --define "_specdir $(RPM_ROOT)"
24+
25+srpm: clean
26+ rm -f $(TARFILE)
27+ $(MAKE) $(SPEC) $(TARFILE)
28+ rpmbuild $(RPMBUILDOPTS) --nodeps -bs --rmsource $(SPEC)
29+
30+rpm: clean
31+ rm -f $(TARFILE)
32+ $(MAKE) $(SPEC) $(TARFILE)
33+ rpmbuild $(RPMBUILDOPTS) -ba --rmsource $(SPEC)
34+
diff -r 000000000000 -r 2d98f677a5a2 autogen.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/autogen.sh Wed Oct 06 10:47:39 2010 +0900
@@ -0,0 +1,11 @@
1+#!/bin/sh
2+# Run this to generate all the initial makefiles, etc.
3+
4+echo Building configuration system...
5+autoreconf -i
6+if [ $? -ne 0 ]; then
7+ exit 1
8+fi
9+rm -rf autom4te.cache
10+echo Now run ./configure
11+
diff -r 000000000000 -r 2d98f677a5a2 configure.ac
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/configure.ac Wed Oct 06 10:47:39 2010 +0900
@@ -0,0 +1,56 @@
1+# -*- Autoconf -*-
2+# Process this file with autoconf to produce a configure script.
3+
4+AC_PREREQ([2.65])
5+AC_INIT([pm_logconv-hb], [1.0])
6+AM_INIT_AUTOMAKE
7+AC_PREFIX_DEFAULT(/usr)
8+PM_PKG="pacemaker"
9+
10+#
11+# check for python
12+#
13+AM_PATH_PYTHON(2.4,,:)
14+AC_PATH_PROGS(PYTHON, python)
15+AC_MSG_CHECKING(where is python installed)
16+if test "x${PYTHON}" = x; then
17+ PYTHON="/usr/bin/env python";
18+fi
19+AC_MSG_RESULT(using $PYTHON);
20+
21+prefix_orig="$prefix"
22+prefix=`eval echo "$prefix"`
23+case $prefix in
24+ NONE) prefix=/usr;;
25+esac
26+var(){
27+ case $1 in
28+ *'${'*) res=`eval echo "$1"`;;
29+ *) res="$1";;
30+ esac
31+ case "$res" in
32+ ""|NONE) echo "$2";;
33+ *) echo "$res";;
34+ esac
35+}
36+
37+#
38+# Keep copy of original (default) localstatedir
39+#
40+localstatedir_orig="$localstatedir"
41+
42+exec_prefix=`var "$exec_prefix" "$prefix"`
43+datadir=`var "$datadir" "$prefix/share"`
44+config_dir=`var "$config_dir" "/etc/"`
45+
46+CONFIG_DIR="$config_dir"
47+AC_SUBST(CONFIG_DIR)
48+HA_DATADIR="$datadir"
49+AC_SUBST(HA_DATADIR)
50+HA_NOARCHDATAHBDIR="$HA_DATADIR/$PM_PKG"
51+AC_SUBST(HA_NOARCHDATAHBDIR)
52+AC_PROG_LN_S
53+
54+AC_CONFIG_FILES([Makefile])
55+AC_OUTPUT
56+
diff -r 000000000000 -r 2d98f677a5a2 pm_logconv.conf
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pm_logconv.conf Wed Oct 06 10:47:39 2010 +0900
@@ -0,0 +1,416 @@
1+#
2+# pm_logconv.conf : Config file of pm_logconv for Pacemaker and Heartbeat
3+#
4+# support version
5+# Pacemaker : stable-1.0 (1.0.9 or more)
6+# Heartbeat : 3.0.3
7+#
8+
9+[Settings]
10+#ha_log_path = /var/log/ha-log
11+#output_path = /var/log/pm_logconv.out
12+#hostcache_path = /var/lib/heartbeat/hostcache
13+#syslogformat = True
14+#reset_interval = 60
15+#attribute_pingd = default_ping_set, lt, 100
16+#attribute_diskd = diskcheck_status, eq, ERROR
17+#attribute_diskd_inner = diskcheck_status_internal, eq, ERROR
18+#logconv_logfacility = daemon
19+#act_rsc = prmExPostgreSQLDB, prmApPostgreSQLDB
20+
21+
22+###
23+# For Resource event.
24+###
25+#MsgNo.1-1, 2-1, 4-1, 5-1, 17-1, 18-1
26+[Resource tries to operation]
27+pattern_start=crmd,info:,do_lrm_rsc_op: Performing key,op,start
28+pattern_stop=crmd,info:,do_lrm_rsc_op: Performing key,op,stop
29+pattern_promote=crmd,info:,do_lrm_rsc_op:,Performing key, op,promote
30+pattern_demote=crmd,info:,do_lrm_rsc_op:,Performing key,op,demote
31+func=try_to_operate
32+
33+#MsgNo.1-2, 2-2, 4-2, 5-2, 17-2, 18-2
34+[Resource operation succeeded]
35+pattern_start=crmd,info:,process_lrm_event:,LRM operation,start,rc=0,!status=,ok
36+pattern_stop=crmd,info:,process_lrm_event:,LRM operation,stop,rc=0,!status=,ok
37+pattern_promote=crmd,info:,process_lrm_event,LRM operation,promote,rc=0,!status=,ok
38+pattern_demote=crmd,info:,process_lrm_event,LRM operation,demote,rc=0,!status=,ok
39+func=operation_succeeded
40+
41+#MsgNo.1-3, 2-3, 3-1, 4-3, 5-3, 17-3, 19-1
42+[Resource operation failed]
43+pattern_start=crmd,info:,process_lrm_event:,LRM operation,start,!rc=0,!status=
44+pattern_stop=crmd,info:,process_lrm_event:,LRM operation,stop,!rc=0,!status=
45+pattern_monitor=crmd,info:,process_lrm_event:,LRM operation,monitor,!monitor_0,!rc=0,!rc=8,!rc=7,!status=
46+pattern_promote=crmd,info:,process_lrm_event:,LRM operation,promote,!rc=0,!status=
47+pattern_demote=crmd,info:,process_lrm_event:,LRM operation,demote,!rc=0,!status=
48+func=operation_failed
49+loglevel=ERROR
50+
51+#MsgNo.1-4, 2-4, 3-3, 4-4, 5-4
52+[OCF resource operation timedout]
53+pattern_start=crmd,ERROR:,process_lrm_event:,LRM operation,start,!status=,Timed Out
54+pattern_stop=crmd,ERROR:,process_lrm_event:,LRM operation,stop,!status=,Timed Out
55+pattern_monitor=crmd,ERROR:,process_lrm_event:,LRM operation,monitor,!monitor_0,!status=,Timed Out
56+pattern_promote=crmd,ERROR:,process_lrm_event,LRM operation,promote,!status=,Timed Out
57+pattern_demote=crmd,ERROR:,process_lrm_event,LRM operation,demote,!status=,Timed Out
58+func=operation_timedout_ocf
59+
60+#MsgNo.3-2, 19-2
61+[Resource failure]
62+pattern_monitor_rcs=crmd,info:,process_lrm_event:,LRM operation,monitor,!monitor_0,rc=7,!status=
63+pattern_monitor_stonith=crmd,info:,process_lrm_event:,LRM operation,monitor,!monitor_0,rc=7,!status=
64+func=detect_rsc_failure
65+loglevel=ERROR
66+
67+###
68+# For Node status event.
69+##
70+#MsgNo.6-1, 6-2
71+[Node status updated]
72+pattern_dead=crmd,notice:,crmd_ha_status_callback:,Status update:,Node,now has status,dead
73+pattern_active=crmd,notice:,crmd_ha_status_callback:,Status update:,Node,now has status,active
74+func=node_status_updated
75+
76+###
77+# For Interconnect-LAN status event
78+# and Network status event (detected by pingd).
79+###
80+#MsgNo.7-1
81+[Interconnect-LAN status dead]
82+pattern=heartbeat,info:,Link,dead
83+func=detect_iconnlan_dead
84+loglevel=WARN
85+
86+#Msg No.7-2
87+[Interconnect-LAN or Network status up]
88+pattern=heartbeat,info:,Link,up
89+func=detect_network_up
90+
91+#MsgNo.8-1
92+[Network status dead]
93+pattern=pingd,info:,stand_alone_ping:,is unreachable
94+func=detect_node_dead
95+loglevel=ERROR
96+
97+###
98+# For Disk status event (detected by diskd).
99+###
100+#MsgNo.9-1
101+[Detect disk error]
102+pattern=diskd,WARN:,check_status:,disk status is changed,new_status,ERROR
103+func=detect_disk_error
104+loglevel=ERROR
105+
106+###
107+# For respawn process event.
108+###
109+#MsgNo.10-1
110+[Respawn process starts]
111+pattern=heartbeat,info:,Starting,as,uid,gid
112+func=respawn_start
113+
114+#MsgNo.10-2
115+[Respawn process exited abnormally]
116+pattern=heartbeat,Managed,process,exited with return code
117+func=respawn_exited_abnormally
118+loglevel=WARN
119+
120+#MsgNo.10-3
121+[Respawn process killed]
122+pattern=heartbeat,WARN:,Managed,process,killed by signal
123+func=respawn_killed
124+
125+#MsgNo.10-4
126+[Respawn process dumped core]
127+pattern=heartbeat,ERROR:,Managed,process,dumped core
128+func=respawn_dumped_core
129+loglevel=WARN
130+
131+#MsgNo.10-5
132+[Respawn process went away]
133+pattern=heartbeat,ERROR:,Managed,process,went away strangely
134+func=respawn_went_away
135+loglevel=WARN
136+
137+#MsgNo.10-6
138+[Respawn process exited normally]
139+pattern=heartbeat,info:,killing,process group,with signal
140+func=respawn_exited_normally
141+
142+#MsgNo.10-7
143+[Respawning too fast in a short term]
144+pattern=heartbeat,ERROR:,Client,respawning too fast
145+func=respawn_too_fast
146+
147+###
148+# For Fail Over. These are only for DC node.
149+##
150+#MsgNo.F0-1, F9-1, F10-1
151+[Detect calculation starts]
152+pattern=crmd,info:,do_state_transition:,State transition,-> S_POLICY_ENGINE,!I_SHUTDOWN
153+func=detect_pe_calc
154+loglevel=WARN
155+
156+#MsgNo.F0-2, F12-1, F12-2
157+[FailOver complete]
158+pattern=crmd,info:,do_state_transition:,State transition,-> S_IDLE
159+func=detect_fo_complete
160+loglevel=WARN
161+
162+#MsgNo.F1-1, F1-2, F2-1, F2-2, F3-1, F3-2, F4-1, F4-2, F6-1, F6-2
163+[Action failure]
164+pattern=crmd,WARN:,update_failcount:,Updating failcount for
165+func=dc_detect_failure
166+loglevel=WARN
167+fotrigger=1
168+
169+#MsgNo.F7-1, F7-2, F7-3, F7-4, F8-1
170+[Node failure]
171+pattern_shut=crmd,WARN:,match_down_event:,No match for shutdown action on
172+func=dc_detect_node_failure
173+fotrigger=3
174+
175+#MsgNo.F11-1
176+#The message is not output immediately, output when F/O is complete.
177+[Add Resource start action]
178+pattern=pengine,notice:,LogActions: Start
179+func=add_rsc_start
180+
181+#MsgNo.F11-2
182+#The message is not output immediately, output when F/O is complete.
183+[Add Resource stop action]
184+pattern=pengine,notice:,LogActions: Stop resource
185+func=add_rsc_stop
186+
187+#MsgNo.F11-3, F11-8, F11-9
188+#The message is not output immediately, output when F/O is complete.
189+[Add no action]
190+pattern_leave_start=pengine,notice:,LogActions: Leave resource
191+pattern_restart=pengine,notice:,LogActions: Restart resource
192+func=add_no_action
193+
194+#MsgNo.F11-4
195+#The message is not output immediately, output when F/O is complete.
196+[Resource cannot run anywhere]
197+pattern=pengine,WARN:,native_color:,Resource,cannot run anywhere
198+func=detect_cannot_run_anywhere
199+
200+#MsgNo.F11-5
201+#The message is not output immediately, output when F/O is complete.
202+[Detect resource unmanaged]
203+pattern=pengine,info:,native_color:,Unmanaged resource,allocated to
204+func=detect_rsc_unmanaged
205+
206+#MsgNo.F11-6
207+#The message is not output immediately, output when F/O is complete.
208+[Add Resource move action]
209+pattern=pengine,notice:,LogActions: Move resource
210+func=add_rsc_move
211+
212+###
213+# For DC election.
214+###
215+#Msg No.13-2
216+[DC election is complete]
217+pattern=crmd,info:,update_dc:,Set DC to
218+func=dc_election_complete
219+
220+#Msg No.13-5
221+[Detect unset DC]
222+pattern=crmd,info:,update_dc:,Unset DC
223+func=detect_unset_dc
224+
225+###
226+# For Corosync service shutdown.
227+###
228+#Msg No.14-1 (only for DC)
229+[Corosync on the node in the cluster want to shutdown]
230+pattern=crmd,info:,handle_shutdown_request:,Creating shutdown request for
231+func=detect_shutdown_request
232+
233+#Msg No.14-2
234+[Heartbeat shutdown complete.]
235+pattern=heartbeat,info:,Heartbeat shutdown complete
236+func=detect_hb_shutdown
237+
238+#Msg No.14-3
239+[Pacemaker is shutting down.]
240+pattern=crmd,info:,crm_shutdown: Requesting shutdown
241+func=detect_pcmk_shutting_down
242+
243+#Msg No.14-4 (for DC node shutdown)
244+[DC node want to shutdown]
245+pattern=cib,info:,cib_process_shutdown_req:,Shutdown REQ from
246+func=detect_dc_shutdown_request
247+
248+#Msg No.14-5
249+[Send shutdown request to DC.]
250+pattern=crmd,info:,do_shutdown_req: Sending shutdown request to DC:
251+func=detect_send_shutdown
252+
253+###
254+# For logging daemon event.
255+###
256+#Msg No.15-1
257+[Detect logd started]
258+pattern=logd,info:,logd started with
259+func=output_original_log
260+
261+#Msg No.16-1
262+[logd is shutting down.]
263+pattern=logd,info:,logd_term_write_action:,received
264+func=output_static_msg
265+
266+#Msg No.16-2
267+[logd stopped.]
268+pattern=logd,info:,Exiting write process
269+func=output_static_msg
270+
271+###
272+# For STONITH resource operation timed out.
273+###
274+#For Msg No.17-4, 19-3
275+#The message is not output immediately, output when operation complete.
276+[Resource operation timed out for stonith]
277+pattern=stonithd,WARN:,process,timed out,try,Killing with signal
278+func=detect_rscop_timedout_stonithd
279+
280+###
281+# For fence operation.
282+###
283+#Msg No.20-1, No21-1
284+[fence operation start]
285+pattern=stonithd,info:,stonith_operate_locally,sending fencing op,for,to
286+func=fence_op_started
287+
288+#Msg No.20-2
289+[fence operation succeeded]
290+pattern=stonithd,info:,Succeeded to STONITH the node
291+func=fence_op_succeeded
292+
293+#Msg No.20-3, 21-3
294+[fence operation failed]
295+pattern=stonithd,info:,failed to STONITH node,with local device
296+func=fence_op_failed
297+loglevel=ERROR
298+
299+#Msg No.20-4, 21-4
300+[fence operation timedout]
301+pattern=stonithd,ERROR:,Failed to STONITH the node,optype,op_result,TIMEOUT
302+func=fence_op_timedout
303+
304+###
305+# For attribute event.
306+###
307+#Msg No.22-1
308+[Detect attribute updated]
309+pattern=attrd,info:,attrd_perform_update:,Sent update,!fail-count-,!last-failure-,!probe_complete,!shutdown,!master-
310+func=detect_attr_updated
311+
312+#Msg No.22-2
313+[Detect attribute deleted]
314+pattern=attrd,info:,attrd_perform_update:,Sent delete,!delete -,!fail-count-,!last-failure-,!probe_complete,!shutdown,!master-
315+func=detect_attr_deleted
316+
317+###
318+# For Heartbeat service starts.
319+###
320+#Msg No.23-1
321+[Detect heartbeat is starting]
322+pattern=heartbeat,info:,Configuration validated,Starting heartbeat
323+func=detect_hb_start
324+
325+#Msg No.23-3
326+#It's just for clear ConvertStatus. Output nothing.
327+[Detect localhost status is set to up]
328+pattern=heartbeat,info:,Local status now set to,up
329+func=detect_localstat_up
330+ignoremsg=True
331+
332+###
333+# For log message dropping.
334+###
335+#Msg No.25-1
336+[Detect log dropped]
337+pattern=ERROR:,cl_log:,messages were dropped
338+func=output_original_log
339+
340+###
341+# For Core process event.
342+###
343+#Msg No.28-1
344+[FIFO process start to restart]
345+pattern=heartbeat,WARN:,Restarting,process
346+func=output_original_log
347+
348+#Msg No.28-2
349+[FIFO process restart failed]
350+pattern=heartbeat,ERROR:,restart failed,Restarting heartbeat
351+func=output_original_log
352+
353+#Msg No.28-3
354+[I/O processes failed]
355+pattern=heartbeat,ERROR:,process died,Beginning communications restart process for comm channel
356+func=output_original_log
357+loglevel=WARN
358+
359+#Msg No.28-4
360+[I/O processes start to restart]
361+pattern=heartbeat,ERROR:,Both comm processes for channel,have died,Restarting
362+func=output_original_log
363+loglevel=WARN
364+
365+#Msg No.28-5
366+[I/O processes restart succeeded]
367+pattern=heartbeat,info:,Communications restart succeeded
368+func=output_original_log
369+
370+#Msg No.28-6
371+[I/O processes failed to restart]
372+pattern=heartbeat,ERROR:,Communications restart failed,Will try again later
373+func=output_original_log
374+
375+###
376+# For pengine event.
377+###
378+#Msg No.29-1
379+[pengine start]
380+pattern=crmd,info:,start_subsystem:,Starting sub-system
381+func=crmd_subsystem_start
382+
383+#Msg No.29-2
384+[pengine exit]
385+pattern=crmd,info:,crmdManagedChildDied:,Process,exited \(signal=0,exitcode=
386+func=crmd_subsystem_exit
387+
388+#Msg No.29-3
389+[pengine kill]
390+pattern=crmd,info:,crmdManagedChildDied:,Process,exited \(signal=,exitcode=,!\(signal=0
391+func=crmd_subsystem_kill
392+loglevel=ERROR
393+
394+###
395+# Other process's failure
396+###
397+#Msg No.30-1
398+[master control process failure]
399+pattern=heartbeat,CRIT:,Emergency Shutdown:,Master Control process died
400+func=output_original_log
401+loglevel=ERROR
402+
403+#Msg No.30-2
404+[OS reboot because of process's failure]
405+pattern=heartbeat,EMERG:,Rebooting system,Reason:
406+func=output_original_log
407+loglevel=ERROR
408+
409+###
410+# Others.
411+###
412+#Msg No.27-1
413+[Detect a request for getting DC node state]
414+pattern=crmd,info:,handle_request:,Current ping state:
415+func=detect_dcstat_req
416+ignoremsg=True
diff -r 000000000000 -r 2d98f677a5a2 pm_logconv.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pm_logconv.py Wed Oct 06 10:47:39 2010 +0900
@@ -0,0 +1,3423 @@
1+#!/usr/bin/python
2+# -*- coding: utf-8 -*-
3+
4+# pm_logconv : Pacemaker and Heartbeat log converter
5+#
6+# support version
7+# Pacemaker : stable-1.0 (1.0.9 or more)
8+# Heartbeat : 3.0.3
9+#
10+# Copyright (C) 2010 NIPPON TELEGRAPH AND TELEPHONE CORPORATION
11+#
12+# This program is free software; you can redistribute it and/or modify
13+# it under the terms of the GNU General Public License as published by
14+# the Free Software Foundation; either version 2 of the License, or
15+# (at your option) any later version.
16+#
17+# This program is distributed in the hope that it will be useful,
18+# but WITHOUT ANY WARRANTY; without even the implied warranty of
19+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20+# GNU General Public License for more details.
21+#
22+# You should have received a copy of the GNU General Public License
23+# along with this program; if not, write to the Free Software
24+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
25+
26+import os, sys, signal, time, datetime, syslog, types, glob, pickle
27+import ConfigParser, re, commands, operator, string
28+from optparse import OptionParser
29+from stat import ST_INO, ST_NLINK, ST_SIZE, S_IRUSR, S_IWUSR
30+from socket import gethostname
31+from errno import ESRCH
32+
33+#
34+# version number of pm_logconv.
35+#
36+VERSION = "1.0"
37+
38+#
39+# system's host name.
40+#
41+try:
42+ HOSTNAME = gethostname()
43+except Exception, strerror:
44+ print >> sys.stderr, "Error: gethostname() error occurred.", strerror
45+ sys.exit(1)
46+
47+#
48+# default settings.
49+# (when not specified with configuration file or command line option.)
50+#
51+CONFIGFILE = "/etc/pm_logconv.conf"
52+HA_LOGFILE = "/var/log/ha-log"
53+OUTPUTFILE = "/var/log/pm_logconv.out"
54+SYSLOGFORMAT = True
55+HOSTCACHE = "/var/lib/heartbeat/hostcache"
56+HACFFILE = "/etc/ha.d/ha.cf"
57+
58+#
59+# Timeout(ms) for reset log convert status.
60+#
61+RESET_INTERVAL = 60
62+
63+# A flag of failer status
64+# resource failer 1(resource error)
65+# score failer 2(pingd rsclocation)
66+# node failer 3(split brain)
67+FAIL_RSC = "1"
68+FAIL_SCORE = "2"
69+FAIL_NODE = "3"
70+
71+# A flag of resource status(for failer)
72+# resource start 1
73+# resource move 2
74+# resource stop 3
75+# resource stopped 4
76+FAIL_STR = "1"
77+FAIL_MOVE = "2"
78+FAIL_STP = "3"
79+FAIL_STPD = "4"
80+
81+#
82+# A list of [attribute_name, operation, attribute_value],
83+# The setting is described in CONFIGFILE.
84+# These are to decide whether some failure occur or not
85+# when cluster status changes to S_POLICY_ENGINE.
86+#
87+attrRuleList = list()
88+
89+# A list of resource-id.
90+# If the all of specified resources are active,
91+# it means "F/O succeeded."
92+# If not, "F/O failed."
93+# The setting is described in CONFIGFILE.
94+actRscList = list()
95+
96+#
97+# A list of patterns.
98+# The setting is described in CONFIGFILE.
99+#
100+lconvRuleList = list()
101+
102+#
103+# shutdown flag, when SIGINT or SIGTERM signal is received, set it True.
104+#
105+do_shutdown = False
106+
107+#
108+# command name for getting current status of the cluster.
109+#
110+CMD_CRM_ATTR = "crm_attribute"
111+
112+#
113+# command name for getting current node status of the cluster.
114+#
115+CMD_CRM_NODE = "crm_node"
116+
117+#
118+# command name for getting DC node status.
119+#
120+CMD_CRMADMIN = "crmadmin"
121+
122+#
123+# output version number of pm_logconv and exit.
124+#
125+def print_version(option, opt, value, parser):
126+ sys.stdout.write("%s\n" % VERSION)
127+ sys.exit(0)
128+
129+#
130+# signal handler method. only set True to the shutdown flag.
131+#
132+def shutdown_logconv(signum, frame):
133+ global do_shutdown
134+ pm_log.info("shutdown_logconv: received signal [%d], " \
135+ "scheduling shutdown.." % signum)
136+ do_shutdown = True
137+
138+#
139+# set the signal handler.
140+#
141+signal.signal(signal.SIGINT, shutdown_logconv)
142+signal.signal(signal.SIGTERM, shutdown_logconv)
143+
144+
145+class LogconvLog:
146+ LOG_EMERG = 0
147+ LOG_ALERT = 1
148+ LOG_CRIT = 2
149+ LOG_ERR = 3
150+ LOG_WARNING = 4
151+ LOG_NOTICE = 5
152+ LOG_INFO = 6
153+ LOG_DEBUG = 7
154+
155+ syspriority = [ syslog.LOG_EMERG, syslog.LOG_ALERT, syslog.LOG_CRIT,
156+ syslog.LOG_ERR, syslog.LOG_WARNING, syslog.LOG_NOTICE,
157+ syslog.LOG_INFO, syslog.LOG_DEBUG ]
158+
159+ prioritystr = [ "EMERG", "ALERT", "CRIT", "ERROR", "WARN",
160+ "notice", "info", "debug" ]
161+
162+ DEFAULT_LOGOPT = syslog.LOG_CONS
163+ DEFAULT_FACILITY = syslog.LOG_DAEMON
164+
165+ facility_map = {
166+ "kern": syslog.LOG_KERN,
167+ "user": syslog.LOG_USER,
168+ "mail": syslog.LOG_MAIL,
169+ "daemon": syslog.LOG_DAEMON,
170+ "auth": syslog.LOG_AUTH,
171+ "syslog": syslog.LOG_SYSLOG,
172+ "lpr": syslog.LOG_LPR,
173+ "news": syslog.LOG_NEWS,
174+ "uucp": syslog.LOG_UUCP,
175+ "cron": syslog.LOG_CRON,
176+ "authpriv": 10<<3,
177+ "ftp": 11<<3,
178+ "local0": syslog.LOG_LOCAL0,
179+ "local1": syslog.LOG_LOCAL1,
180+ "local2": syslog.LOG_LOCAL2,
181+ "local3": syslog.LOG_LOCAL3,
182+ "local4": syslog.LOG_LOCAL4,
183+ "local5": syslog.LOG_LOCAL5,
184+ "local6": syslog.LOG_LOCAL6,
185+ "local7": syslog.LOG_LOCAL7,
186+ }
187+
188+ facilitystr_map = {
189+ syslog.LOG_KERN: "kern",
190+ syslog.LOG_USER: "user",
191+ syslog.LOG_MAIL: "mail",
192+ syslog.LOG_DAEMON: "daemon",
193+ syslog.LOG_AUTH: "auth",
194+ syslog.LOG_SYSLOG: "syslog",
195+ syslog.LOG_LPR: "lpr",
196+ syslog.LOG_NEWS: "news",
197+ syslog.LOG_UUCP: "uucp",
198+ syslog.LOG_CRON: "cron",
199+ 10<<3: "authpriv",
200+ 11<<3: "ftp",
201+ syslog.LOG_LOCAL0: "local0",
202+ syslog.LOG_LOCAL1: "local1",
203+ syslog.LOG_LOCAL2: "local2",
204+ syslog.LOG_LOCAL3: "local3",
205+ syslog.LOG_LOCAL4: "local4",
206+ syslog.LOG_LOCAL5: "local5",
207+ syslog.LOG_LOCAL6: "local6",
208+ syslog.LOG_LOCAL7: "local7",
209+ }
210+
211+ facilitystr = facilitystr_map[DEFAULT_FACILITY]
212+
213+ def __init__(self, priority, path):
214+ self.pid = os.getpid()
215+
216+ if not isinstance(priority, int) and not isinstance(priority, long):
217+ self.priority = self.LOG_INFO
218+ else:
219+ self.priority = priority
220+
221+ if not isinstance(path, types.StringTypes):
222+ self.output = None
223+ else:
224+ self.output = path
225+
226+ self.facility = self.DEFAULT_FACILITY
227+ syslog.openlog("pm_logconv", self.DEFAULT_LOGOPT, self.facility)
228+
229+ def __setattr__(self, name, val):
230+ if name != "LOG_EMERG" and name != "LOG_ALERT" and \
231+ name != "LOG_CRIT" and name != "LOG_ERR" and \
232+ name != "LOG_WARNING" and name != "LOG_NOTICE" and \
233+ name != "LOG_INFO" and name != "LOG_DEBUG" and \
234+ name != "DEFAULT_LOGOPT" and name != "DEFAULT_FACILITY":
235+ self.__dict__[name] = val
236+
237+ def set_priority(self, priority):
238+ if not isinstance(priority, int) and not isinstance(priority, long):
239+ return False
240+ if self.LOG_EMERG < priority and self.DEBUG > priority:
241+ return False
242+ self.priority = priority
243+ return True
244+
245+ def set_output(self, path):
246+ if not isinstance(path, types.StringTypes):
247+ return False
248+ self.output = path
249+ return True
250+
251+ def set_facility(self, facility):
252+ # FYI: LOG_AUTHPRIV : 10<<3
253+ # LOG_FTP : 11<<3
254+ if self.facility == facility:
255+ return True
256+ if self.facilitystr_map.has_key(facility):
257+ pm_log.notice("syslog facility changed [%s] to [%s]"
258+ % (self.facilitystr, self.facilitystr_map[facility]))
259+ syslog.closelog()
260+ self.facility = facility
261+ syslog.openlog("pm_logconv", self.DEFAULT_LOGOPT, self.facility)
262+ self.facilitystr = self.facilitystr_map[facility]
263+ return True
264+ return False
265+
266+ def emerg(self, message):
267+ if self.output == None or self.priority >= self.LOG_EMERG:
268+ return self.logging(self.LOG_EMERG, message)
269+ return True
270+
271+ def alert(self, message):
272+ if self.output == None or self.priority >= self.LOG_ALERT:
273+ return self.logging(self.LOG_ALERT, message)
274+ return True
275+
276+ def crit(self, message):
277+ if self.output == None or self.priority >= self.LOG_CRIT:
278+ return self.logging(self.LOG_CRIT, message)
279+ return True
280+
281+ def error(self, message):
282+ if self.output == None or self.priority >= self.LOG_ERR:
283+ return self.logging(self.LOG_ERR, message)
284+ return True
285+
286+ def warn(self, message):
287+ if self.output == None or self.priority >= self.LOG_WARNING:
288+ return self.logging(self.LOG_WARNING, message)
289+ return True
290+
291+ def notice(self, message):
292+ if self.output == None or self.priority >= self.LOG_NOTICE:
293+ return self.logging(self.LOG_NOTICE, message)
294+ return True
295+
296+ def info(self, message):
297+ if self.output == None or self.priority >= self.LOG_INFO:
298+ return self.logging(self.LOG_INFO, message)
299+ return True
300+
301+ def debug(self, message):
302+ if self.output == None or self.priority >= self.LOG_DEBUG:
303+ return self.logging(self.LOG_DEBUG, message)
304+ return True
305+
306+ def logging(self, priority, message):
307+ try:
308+ if not isinstance(priority, int) and not isinstance(priority, long):
309+ return False
310+ if not isinstance(message, types.StringTypes):
311+ return False
312+
313+ if self.output == None:
314+ syslog.syslog(self.syspriority[priority], "[%d]: %-7s %s" %
315+ (self.pid, self.prioritystr[priority] + ':', message.rstrip()))
316+ else:
317+ t = datetime.datetime.today()
318+ tfmt = "%s %2d %s" % \
319+ (t.strftime('%b'), int(t.strftime('%d')), t.strftime('%X'))
320+ f = open(self.output, 'a')
321+ f.write("%s %s [%d]: %-7s %s\n" % (tfmt, HOSTNAME, self.pid,
322+ self.prioritystr[priority] + ':', message.rstrip()))
323+ f.close()
324+ return True
325+ except Exception, strerror:
326+ print >> sys.stderr, "Error: logging() error occurred.", strerror
327+ sys.exit(1)
328+
329+class PIDFile:
330+ '''
331+ status of the PID file operation.
332+ '''
333+ SYSTEM_ERROR = -1
334+ FILE_NOTEXIST = -2
335+ FILE_INVALID = -3
336+ NOTRUNNING = -4
337+
338+ def __init__(self, path):
339+ self.path = path
340+
341+ '''
342+ status is set as read-only.
343+ '''
344+ def __setattr__(self, name, val):
345+ if name != "SYSTEM_ERROR" and name != "FILE_NOTEXIST" and \
346+ name != "FILE_INVALID" and name != "NOTRUNNING":
347+ self.__dict__[name] = val
348+
349+ '''
350+ check whether the process of the PID file has running.
351+ return 0 > : process is running.
352+ SYSTEM_ERROR : system error occurred.
353+ NOTRUNNING : process is NOT running.
354+ '''
355+ def is_running(self, pid, cmdline):
356+ try:
357+ os.kill(pid, 0)
358+ except Exception, (errNo, strerror):
359+ if errNo == ESRCH:
360+ pm_log.debug("is_running: pm_logconv isn't running.")
361+ return self.NOTRUNNING
362+ else:
363+ pm_log.error("is_running: kill(%d, 0) error occurred." % pid)
364+ pm_log.debug("is_running: kill(%d, 0) error occurred. [%s]"
365+ % (pid, strerror))
366+ return self.SYSTEM_ERROR
367+
368+ # check to make sure pid hasn't been reused by another process.
369+ try:
370+ proc_path = "/proc/%d/cmdline" % pid
371+ f = open(proc_path, 'r')
372+ cmdline_now = f.readline().replace('\0', ' ').strip()
373+ f.close()
374+
375+ pm_log.debug("is_running: tracked[%s], /proc/%d/cmdline[%s]"
376+ % (cmdline, pid, cmdline_now))
377+ if cmdline != cmdline_now:
378+ return self.NOTRUNNING
379+ except Exception, strerror:
380+ pm_log.error("is_running: couldn't read from '%s'." % proc_path)
381+ pm_log.debug("is_running: couldn't read from '%s'. %s"
382+ % (proc_path, strerror))
383+ return self.SYSTEM_ERROR
384+ return pid
385+
386+ '''
387+ read PID file.
388+ return 0 > : process is running. return running process's PID.
389+ SYSTEM_ERROR : system error occurred.
390+ FILE_NOTEXIST : PID file doesn't exist.
391+ FILE_INVALID : PID file is broken...
392+ NOTRUNNING : succeeded. process is NOT running.
393+ '''
394+ def read(self):
395+ try:
396+ if os.path.exists(self.path):
397+ f = open(self.path, 'r')
398+ pid = f.readline().strip()
399+ cmdline = f.readline().strip('\n')
400+ f.close()
401+
402+ if pid.isdigit() and int(pid) != os.getpid():
403+ return self.is_running(int(pid), cmdline)
404+ else:
405+ pm_log.warn("PIDFile.read: PID file is screwed up.")
406+ return self.FILE_INVALID
407+ else:
408+ pm_log.info("PIDFile.read: PID file doesn't exist.")
409+ return self.FILE_NOTEXIST
410+ except Exception, strerror:
411+ pm_log.error("PIDFile.read: I/O error occurred.")
412+ pm_log.debug("PIDFile.read: I/O error occurred. [%s]" % strerror)
413+ return self.SYSTEM_ERROR
414+
415+ '''
416+ lock PID file.
417+ return 0 : succeeded.
418+ 0 > : return already running process's PID.
419+ SYSTEM_ERROR : system error occurred.
420+ '''
421+ def lock(self):
422+ try:
423+ ret = self.read()
424+ if ret > 0 or ret == self.SYSTEM_ERROR:
425+ return ret
426+ elif ret == self.FILE_NOTEXIST:
427+ pass
428+ elif ret == self.FILE_INVALID or ret == self.NOTRUNNING:
429+ os.remove(self.path)
430+ else:
431+ return self.SYSTEM_ERROR
432+ except Exception, strerror:
433+ pm_log.error("PIDFile.lock: I/O error occurred.")
434+ pm_log.debug("PIDFile.lock: I/O error occurred. [%s]" % strerror)
435+ return self.SYSTEM_ERROR
436+
437+ try:
438+ pid = os.getpid()
439+ f = open("/proc/%d/cmdline" % pid, 'r')
440+ cmdline = f.readline().replace('\0', ' ').strip()
441+ f.close()
442+
443+ tfile = ("%s.%d" % (self.path, pid))
444+ f = open(tfile, 'w')
445+ f.write("%d\n%s\n" % (pid, cmdline))
446+ f.close()
447+
448+ os.link(tfile, self.path)
449+ nlink = os.stat(tfile)[ST_NLINK]
450+ os.remove(tfile)
451+ except Exception, strerror:
452+ pm_log.error("PIDFile.lock: I/O error occurred.")
453+ pm_log.debug("PIDFile.lock: I/O error occurred. [%s]" % strerror)
454+
455+ try:
456+ f.close()
457+ os.remove(tfile)
458+ except:
459+ pass
460+ return self.SYSTEM_ERROR
461+
462+ if nlink < 2:
463+ # somehow, it didn't get through - NFS trouble?
464+ return self.SYSTEM_ERROR
465+ return 0
466+
467+class ConvertStatus:
468+ def __init__(self):
469+ self.ino = 0
470+ self.offset = 0
471+ self.FAILURE_OCCURRED = False
472+ self.IN_CALC = False
473+ self.ACTRSC_MOVE = False
474+ self.IN_FO_PROCESS = False
475+ self.timedoutRscopSet = set()
476+ self.shutNodeSet = set()
477+
478+cstat = ConvertStatus()
479+
class StatusFile:
    # Persists the global conversion status (cstat) to disk with pickle
    # so that pm_logconv can resume from the same read position of
    # ha-log after a restart (continuous mode).
    def __init__(self, path):
        # path: file path of the status file.
        self.path = path
        # Values most recently written to the file; the caller compares
        # them with cstat to skip needless writes.
        self.w_ino = 0
        self.w_offset = 0
        self.in_calc = False

    '''
    read from status(read position of ha-log and status of convert) file.
    Returns True on success (or when the file does not exist yet),
    False on I/O or unpickling error. On the missing-file and the error
    paths, cstat is reset to its initial state.
    '''
    def read(self):
        try:
            if os.path.exists(self.path):
                f = os.open(self.path, os.O_RDONLY)
                # The whole file is one pickled ConvertStatus snapshot.
                c = pickle.loads(os.read(f, os.stat(self.path)[ST_SIZE]))
                os.close(f)
                # Copy the unpickled snapshot into the module-global cstat.
                cstat.ino = self.w_ino = c.ino
                cstat.offset = self.w_offset = c.offset
                cstat.FAILURE_OCCURRED = c.FAILURE_OCCURRED
                cstat.IN_CALC = self.in_calc = c.IN_CALC
                cstat.ACTRSC_MOVE = c.ACTRSC_MOVE
                cstat.IN_FO_PROCESS = c.IN_FO_PROCESS
                cstat.timedoutRscopSet = c.timedoutRscopSet
                cstat.shutNodeSet = c.shutNodeSet
            else:
                pm_log.info("StatusFile.read: status file doesn't exist.")
                self.clear_cstat()
            pm_log.debug("StatusFile.read: [%d:%d], FAIL[%s], IN_CALC[%s], "\
                "RSC_MOVE[%s], IN_FO[%s], Rscop%s, Node%s" %
                (cstat.ino, cstat.offset, cstat.FAILURE_OCCURRED,
                cstat.IN_CALC, cstat.ACTRSC_MOVE, cstat.IN_FO_PROCESS,
                list(cstat.timedoutRscopSet), list(cstat.shutNodeSet)))
            return True
        except Exception, strerror:
            pm_log.error("StatusFile.read: I/O error occurred.")
            pm_log.debug("StatusFile.read: I/O error occurred. [%s]" % strerror)
            self.clear_cstat()
            return False

    '''
    write to status(reading ha-log's position and status of convert) file.
    Returns True on success, False on I/O error.
    '''
    def write(self):
        # While a transition calculation is in progress (IN_CALC), the
        # status is written only once at its beginning; later calls are
        # no-ops until the calculation finishes.
        if cstat.IN_CALC:
            if self.in_calc:
                return True
            self.in_calc = True
        else:
            self.in_calc = False
            self.w_ino = cstat.ino
            self.w_offset = cstat.offset

        try:
            # current implementation writes to the statfile with os.write().
            # since between built-in function write() and close(), file becomes empty.
            f = os.open(self.path, os.O_WRONLY | os.O_CREAT, S_IRUSR | S_IWUSR)
            l = os.write(f, pickle.dumps(cstat, pickle.HIGHEST_PROTOCOL))
            # Truncate to the pickled length in case the previous
            # contents were longer.
            os.ftruncate(f, l)
            os.close(f)
            pm_log.debug("StatusFile.write: [%d:%d], FAIL[%s], IN_CALC[%s], "\
                "RSC_MOVE[%s], IN_FO[%s], Rscop%s, Node%s" %
                (cstat.ino, cstat.offset, cstat.FAILURE_OCCURRED,
                cstat.IN_CALC, cstat.ACTRSC_MOVE, cstat.IN_FO_PROCESS,
                list(cstat.timedoutRscopSet), list(cstat.shutNodeSet)))
            return True
        except Exception, strerror:
            pm_log.error("StatusFile.write: I/O error occurred.")
            pm_log.debug("StatusFile.write: I/O error occurred. [%s]" % strerror)
            return False

    def clear_cstat(self):
        # Re-create the global cstat and mirror its initial values here.
        global cstat
        pm_log.debug("clear_cstat: called.")
        cstat = ConvertStatus()
        self.w_ino = cstat.ino
        self.w_offset = cstat.offset
        self.in_calc = cstat.IN_CALC
        return

# Module-global StatusFile instance (created in LogConvert.convert()).
statfile = None
560+
class ParseConfigFile:
    '''
    Initialization to parse config file.
    Open the config file. Its fd should be closed in __del__().
    '''
    def __init__(self, config_file):
        # Section/option name constants used in the config file.
        self.SEC_SETTINGS = "Settings"
        self.OPT_HA_LOG_PATH = "ha_log_path"
        self.OPT_HACF_PATH = "hacf_path"
        self.OPT_OUTPUT_PATH = "output_path"
        self.OPT_DATEFORMAT = "syslogformat"
        self.OPT_HOSTCACHE = "hostcache_path"
        self.OPT_MANAGE_ATTR = "attribute"
        self.OPT_PATTERN = "pattern"
        self.OPT_RESET_INTERVAL = "reset_interval"
        self.OPT_FUNCNAME = "func"
        self.OPT_LOGLEVEL = "loglevel"
        self.OPT_FOTRIGGER = "fotrigger"
        self.OPT_IGNOREMSG = "ignoremsg"

        self.OPT_LOGFACILITY = "logconv_logfacility"
        # Syslog facility parsed from the config; None = not specified.
        self.logfacility = None

        self.OPT_ACTRSC = "act_rsc"

        self.fp = None
        self.cf = ConfigParser.RawConfigParser()
        # open the config file to read.
        if not os.path.exists(config_file):
            pm_log.error("ParseConfigFile.__init__(): " +
                "config file [%s] does not exist." % (config_file))
            #__init__ should return None...
            sys.exit(1)
        try:
            self.fp = open(config_file)
            self.cf.readfp(self.fp)
        except Exception, strerror:
            pm_log.error("ParseConfigFile.__init__(): " +
                "failed to read config file [%s]." % (config_file))
            pm_log.debug("ParseConfigFile.__init__(): %s" % (strerror))
            #__init__ should return None...
            sys.exit(1)

    def __del__(self):
        # Close the fd opened in __init__().
        if self.fp is not None:
            self.fp.close()

    '''
    Get one option value from the given section.
    Returns the value, or None (with a warning) when the option is
    missing or empty.
    '''
    def get_optval(self, secname, optname):
        optval = None
        try:
            optval = self.cf.get(secname, optname)
        except Exception, strerror:
            pm_log.warn("get_optval(): " +
                "failed to get value of \"%s\" in [%s] section. " %
                (optname, secname))
            pm_log.debug("get_optval(): %s" % (strerror))
            return None

        if optval == "":
            pm_log.warn("get_optval(): " +
                "the value of \"%s\" in [%s] section is empty. " %
                (optname, secname))
            return None
        return optval

    '''
    Parse [Settings] section.
    return 0 : succeeded.
        < 0 : error occurs.
    Invalid individual options are warned about and skipped.
    '''
    def parse_basic_settings(self):
        # The parsed values are published through these module globals.
        global HA_LOGFILE
        global HACFFILE
        global OUTPUTFILE
        global SYSLOGFORMAT
        global HOSTCACHE
        global RESET_INTERVAL
        global attrRuleList
        global actRscList

        # Get all options in the section.
        try:
            setting_opts = self.cf.options(self.SEC_SETTINGS)
        except:
            pm_log.warn("parse_basic_settings(): " +
                "[%s] section does not exist. " % (self.SEC_SETTINGS))
            return (-1)

        for optname in setting_opts:
            optval = self.get_optval(self.SEC_SETTINGS, optname)
            if not optval:
                pm_log.warn("parse_basic_settings(): " +
                    "Ignore the setting of \"%s\"." % (optname))
                continue # To the next option in [Settings].

            if optname == self.OPT_HA_LOG_PATH:
                HA_LOGFILE = optval
            elif optname == self.OPT_HACF_PATH:
                HACFFILE = optval
            elif optname == self.OPT_OUTPUT_PATH:
                OUTPUTFILE = optval
            elif optname == self.OPT_DATEFORMAT:
                if optval.lower() == "true":
                    SYSLOGFORMAT = True
                elif optval.lower() == "false":
                    SYSLOGFORMAT = False
                else:
                    pm_log.warn("parse_basic_settings(): " +
                        "the value of \"%s\" is invalid. " % (optname) +
                        "Ignore the setting.")
            elif optname == self.OPT_HOSTCACHE:
                HOSTCACHE = optval
            elif optname == self.OPT_RESET_INTERVAL:
                try:
                    tmpval = int(optval)
                    # 1 to 32bit integer max value
                    if tmpval > 0 and tmpval <= 2147483647:
                        RESET_INTERVAL = tmpval
                    else:
                        # Out of range: jump to the except clause below.
                        raise
                except:
                    pm_log.warn("parse_basic_settings(): " +
                        "the value of \"%s\" is invalid. " % (optname) +
                        "set an default value(60).")
            elif optname.startswith(self.OPT_MANAGE_ATTR):
                # "attribute*" options: "<name>, <op>, <value>" triples.
                attrRule = optval.split(',')
                if len(attrRule) != 3:
                    pm_log.warn("parse_basic_settings(): " +
                        "the format of \"%s\" is invalid. " % (optname) +
                        "Ignore the setting.")
                    continue # To the next option in [Settings].
                (attrname, op, attrval) = tuple(attrRule)
                attrname = attrname.strip()
                op = op.strip()
                attrval = attrval.strip()
                if attrname == "" or op == "" or attrval == "":
                    pm_log.warn("parse_basic_settings(): " +
                        "the value of \"%s\" is invalid. " % (optname) +
                        "Ignore the setting.")
                    continue # To the next option in [Settings].

                '''
                op string should be [lt|gt|lte|gte|eq|ne] in cib.xml.
                However, with operator module of Python,
                "lte" is expressed "le", and "gte" is "ge".
                Here, replace op string to use it as function name.
                '''
                opList = ["lt", "gt", "le", "ge", "eq", "ne"]
                opmatch = False
                for opstr in opList:
                    if op == opstr:
                        opmatch = True
                if not opmatch:
                    if op == "lte":
                        op = "le"
                    elif op == "gte":
                        op = "ge"
                    else:
                        pm_log.warn("parse_basic_settings(): " +
                            "operation \"%s\" (in \"%s\") is invalid. " %
                            (op, optname) +
                            "Ignore the setting.")
                        continue # To the next option in [Settings].

                attrRule = [attrname, op, attrval]
                attrRuleList.append(attrRule)
            elif optname == self.OPT_LOGFACILITY:
                if LogconvLog.facility_map.has_key(optval.lower()):
                    self.logfacility = LogconvLog.facility_map[optval.lower()]
                else:
                    pm_log.warn("parse_basic_settings(): " +
                        "the value of \"%s\" is invalid. " % (optname) +
                        "Ignore the setting.")
            elif optname == self.OPT_ACTRSC:
                # Comma-separated resource ids; duplicates are dropped.
                for rstr in optval.split(','):
                    rstr = rstr.strip()
                    if rstr != "":
                        if rstr in actRscList:
                            pm_log.warn("parse_basic_settings(): " +
                                "resource id \"%s\" is written redundantly. " %
                                (rstr) +
                                "Ignore the redundancy.")
                        else:
                            actRscList.append(rstr)
            # __if optname == xxx:
        # __for optname in setting_opts:

        return 0

    '''
    Parse sections for log-conversion.
    Every section other than [Settings] defines one conversion rule.
    return 0 : succeeded.
        < 0 : error occurs.
    '''
    def parse_logconv_settings(self):
        logconv_sections = self.cf.sections()
        try:
            logconv_sections.remove(self.SEC_SETTINGS)
        except:
            pm_log.warn("parse_logconv_settings(): " +
                "[%s] section does not exist. " % (self.SEC_SETTINGS))

        #
        # Parse each section.
        #
        for secname in logconv_sections:
            # Get all options in the section.
            try:
                logconv_opts = self.cf.options(secname)
            except:
                pm_log.warn("parse_logconv_settings(): " +
                    "[%s] section does not exist. " % (secname) +
                    "Ignore this section.")
                continue #To the next section.

            lconvfrm = LogconvFrame()
            lconvfrm.rulename = secname
            for optname in logconv_opts:
                optval = self.get_optval(secname, optname)
                if not optval:
                    pm_log.warn("parse_logconv_settings(): " +
                        "Ignore the setting of \"%s\"." % (optname))
                    continue # To the next option.

                if optname == self.OPT_FUNCNAME:
                    # The convert function must be a LogConvertFuncs method.
                    defined = hasattr(LogConvertFuncs, optval)
                    if defined == False:
                        pm_log.error("parse_logconv_settings(): " +
                            "function %s() specified in " % (optval) +
                            "[%s] section is not defined." % (secname))
                        break # Break off parsing this section.
                    lconvfrm.func = optval
                elif optname == self.OPT_LOGLEVEL:
                    lconvfrm.loglevel = optval
                elif optname == self.OPT_FOTRIGGER:
                    lconvfrm.fotrigger = optval
                elif optname == self.OPT_IGNOREMSG:
                    if optval.lower() == "true":
                        lconvfrm.ignoremsg = True
                    elif optval.lower() == "false":
                        lconvfrm.ignoremsg = False
                    else:
                        pm_log.warn("parse_logconv_settings(): " +
                            "the value of \"%s\" is invalid. " % (optname) +
                            "Ignore the setting.")
                elif optname.startswith(self.OPT_PATTERN):
                    # Each "pattern*" option becomes one AND-list of
                    # compiled regexes appended to the rule's OR-list.
                    pstrList = list()
                    tmpList = list()
                    pstrList = self.parse_ptn_strings(optval)
                    if len(pstrList) <= 0:
                        pm_log.error("parse_logconv_settings(): " +
                            "match pattern string of \"%s\" is empty." %
                            (optname))
                        break # Break off parsing this section.
                    tmpList = self.compile_ptn_strings(pstrList)
                    if tmpList is None:
                        pm_log.error("parse_logconv_settings(): " +
                            "failed to compile the pattern string in \"%s\"." %
                            (optname))
                        break # Break off parsing this section.
                    lconvfrm.ptnList.append(tmpList)
                else:
                    pm_log.debug("parse_logconv_settings(): " +
                        "\"%s\" is not valid option string." % (optname) +
                        "Ignore the setting.")
            # __for optname in logconv_opts:

            # A rule is registered only when it has both a convert
            # function and at least one pattern.
            if len(lconvfrm.ptnList) == 0 or lconvfrm.func == None:
                pm_log.warn("parse_logconv_settings(): " +
                    "\"%s\" and \"%s*\" setting is required in section [%s]. " %
                    (self.OPT_FUNCNAME, self.OPT_PATTERN, secname) +
                    "Ignore the section.")
                del lconvfrm
            else:
                lconvRuleList.append(lconvfrm)
            #To the next section.
        #__for secname in logconv_sections:
        return 0

    '''
    Parse match pattern strings (written in a line) and
    make a list of them.
    Strings are set apart by ','.
    arg1 : match pattern strings.
    return: a list of pattern strings.
    '''
    def parse_ptn_strings(self, pstrings):
        pstrList = list()
        for pstr in pstrings.split(','):
            pstr = pstr.strip()
            if pstr != "":
                pstrList.append(pstr)
        return pstrList

    '''
    Compile each pattern string.
    arg1 : a list of pattern strings (made with parse_ptn_strings()).
    return: a list of compiled objects.
    '''
    def compile_ptn_strings(self, pstrList):
        compiledList = list()
        for pstr in pstrList:
            #If it is a negative pattern ('!' prefix), compile it as a
            #negative-lookahead regex that matches lines NOT containing it.
            if pstr.startswith('!'):
                pstr = ur"^(?!.*" + pstr.lstrip('!') + ur").*$"
            compiledList.append(re.compile(pstr))
        return compiledList
868+
869+'''
870+ Class to hold rules to convert log message.
871+'''
class LogconvFrame:
    '''
    rulename : convert rule name. set section name.
    ptnList  : list of compiled object list of match patterns
               (list of lists; inner list = AND, outer list = OR).
    func     : function name to convert log message which matches the rule.
    loglevel : log level of converted log.
    fotrigger: the log message is trigger of F/O or not. [True|False]
    ignoremsg: whether to skip setting the time of output log message
               for the auto reset function. [True|False]
    '''
    def __init__(self, rulename=None, ptnList=None, func=None, loglevel=None,
        fotrigger=False, ignoremsg=False):
        self.rulename = rulename
        # Bug fix: the ptnList argument used to be discarded by an
        # unconditional re-assignment to an empty list. Honor it now;
        # default to a fresh list per instance (avoids the shared
        # mutable-default pitfall).
        self.ptnList = ptnList if ptnList is not None else list()
        self.func = func
        self.loglevel = loglevel
        self.fotrigger = fotrigger
        self.ignoremsg = ignoremsg

    '''
    Only for debug.
    '''
    def print_frmval(self):
        # Single-argument print with parentheses behaves identically in
        # Python 2 and 3.
        print(self.rulename)
        print(self.ptnList)
        print(self.func)
        print(self.loglevel)
        print(self.fotrigger)
        print(self.ignoremsg)
903+
904+class LogConvert:
905+ PIDFILE = "/var/run/pm_logconv.pid"
906+ STATFILE = "/var/run/pm_logconv.stat"
907+
    def __init__(self):
        # Runtime mode flags; overwritten from the command line in
        # parse_args().
        self.daemonize = False
        self.stop_logconv = False
        self.ask_status = False
        self.is_continue = False
        self.is_present = False
        self.configfile = CONFIGFILE
        now = datetime.datetime.now()
        # Timestamps driving the auto-reset logic (check_dc_and_reset).
        self.last_logoutput_t = now
        self.last_reset_t = now

        # Get obj of functions to convert log.
        self.funcs = LogConvertFuncs()
        # SIGUSR1 forces an immediate DC-idle check / status reset attempt.
        signal.signal(signal.SIGUSR1, self.check_dc_and_reset)

        if not self.parse_args():
            sys.exit(1)

        pm_log.debug("option: daemon[%d], stop[%d], status[%d], continue[%d], " \
            "present[%d], config[%s], facility[%s]" % (self.daemonize, self.stop_logconv,
            self.ask_status, self.is_continue, self.is_present, self.configfile, pm_log.facilitystr))
        if not self.stop_logconv and not self.ask_status:
            pm_log.debug("option: target[%s], output[%s], syslogfmt[%s], ha.cf[%s], hcache[%s], reset_interval[%d], actrsc%s" % (HA_LOGFILE, OUTPUTFILE, SYSLOGFORMAT, HACFFILE, HOSTCACHE, RESET_INTERVAL, actRscList))
931+
932+ '''
933+ PID and status(read position of ha-log and status of convert) file path
934+ is set as read-only.
935+ '''
    def __setattr__(self, name, val):
        # Silently ignore assignments to the two class constants so the
        # PID/status file paths cannot be overridden on instances.
        if name != "PIDFILE" and name != "STATFILE":
            self.__dict__[name] = val
939+
940+ '''
941+ parse options - command line option and configure file.
942+ '''
    def parse_args(self):
        # Build the option parser, read the command line and the config
        # file, then validate the option combination.
        # Returns True when consistent, False otherwise (caller exits).
        myusage = "\n%prog [options]"
        psr = OptionParser(usage=myusage)

        psr.add_option("-d", action="store_true", dest="daemonize",
            default=False, help="make the program a daemon")
        psr.add_option("-k", action="store_true", dest="stop_logconv",
            default=False, help="stop the pm_logconv if it is already running")
        psr.add_option("-s", action="store_true", dest="ask_status",
            default=False, help="return pm_logconv status")
        psr.add_option("-c", action="store_true", dest="is_continue",
            default=False, help="start with a continuous mode (\"-p\" option is mutually exclusive)")
        psr.add_option("-p", action="store_true", dest="is_present",
            default=False, help="start with a present mode (\"-c\" option is mutually exclusive)")
        psr.add_option("-f", dest="config_file", default=CONFIGFILE,
            help="the specified configuration file is used")
        psr.add_option("-v", "--version", action="callback", callback=print_version,
            help="print out this program's version and exit")

        opts = psr.parse_args(sys.argv)[0]

        # Re-join argv only to log the exact startup command line
        # (the trailing space is trimmed by the slice below).
        args = ''
        for arg in sys.argv:
            args = args + arg + ' '
        pm_log.info("starting... [%s]" % args[:len(args)-1])

        self.daemonize = opts.daemonize
        self.stop_logconv = opts.stop_logconv
        self.ask_status = opts.ask_status
        self.is_continue = opts.is_continue
        self.is_present = opts.is_present
        self.configfile = opts.config_file

        '''
        Parse config file.
        '''
        pcfobj = ParseConfigFile(self.configfile)
        # Parse pm_logconv's basic settings.
        pcfobj.parse_basic_settings()

        if pcfobj.logfacility != None:
            # Log the startup line again under the configured facility.
            pm_log.set_facility(pcfobj.logfacility)
            pm_log.info("starting... [%s]" % args[:len(args)-1])

        # check command line option.
        # -d, -k and -s are mutually exclusive.
        true_opts = 0
        for opt in (self.daemonize, self.stop_logconv, self.ask_status):
            if opt:
                true_opts = true_opts + 1
        if true_opts > 1:
            pm_log.error("parse_args: option -d, -k, " \
                "and -s cannot be specified at the same time.")
            return False

        if (self.stop_logconv or self.ask_status) and self.is_continue:
            pm_log.error("parse_args: option -k and -s cannot be specified with -c.")
            return False

        if (self.stop_logconv or self.ask_status) and self.is_present:
            pm_log.error("parse_args: option -k and -s cannot be specified with -p.")
            return False

        if self.is_continue and self.is_present:
            pm_log.error("parse_args: options -c and -p are mutually exclusive.")
            return False

        if not self.is_continue and not self.is_present:
            # check Heartbeat active or dead.
            # Heartbeat running -> continuous mode; stopped -> present mode.
            ret = self.funcs.is_heartbeat()
            if ret == None:
                return False
            elif ret:
                self.is_continue = True
            else:
                self.is_present = True

        # check file path. isn't the same path specified?
        try:
            fileList = list()
            if not self.stop_logconv and not self.ask_status:
                fileList.append((OUTPUTFILE, "output file for converted message"))
                fileList.append((HA_LOGFILE, "Pacemaker and Heartbeat log file"))
                fileList.append((HACFFILE, "Heartbeat's configuration file"))
                fileList.append((HOSTCACHE, "Heartbeat's hostcache file"))
                fileList.append((self.STATFILE,
                    "pm_logconv's status file (can't specify by user)"))
            fileList.append((self.configfile, "pm_logconv's configuration file"))
            fileList.append((self.PIDFILE,
                "pm_logconv's PID file (can't specify by user)"))

            # Compare every pair of paths after resolving symlinks.
            for i in range(0, len(fileList) - 1):
                for j in range(i + 1, len(fileList)):
                    pathi, desci = tuple(fileList[i])
                    pathj, descj = tuple(fileList[j])
                    pm_log.debug("path check: [%s] [%s]"
                        % (os.path.realpath(pathi), os.path.realpath(pathj)))
                    if os.path.realpath(pathi) == os.path.realpath(pathj):
                        pm_log.error("parse_args: specified same path [%s] " \
                            "as \"%s\" and \"%s\"." % (pathi, desci, descj))
                        return False
        except Exception, strerror:
            pm_log.error("checking path: error occurred.")
            pm_log.debug("checking path: error occurred. [%s]" % strerror)
            return False

        if not self.stop_logconv and not self.ask_status:
            # Parse settings for log convertion.
            pcfobj.parse_logconv_settings()
        return True
1052+
1053+ '''
1054+ run in the background as a daemon, if option -d is specified.
1055+ and create PID file.
1056+ '''
    def make_daemon(self, pidfile):
        # Standard double-fork daemonization (when -d was given), then
        # take the PID-file lock. Exits the process on fork/setsid error
        # or when another pm_logconv already holds the lock.
        if self.daemonize:
            try:
                pid = os.fork()
                if pid > 0:
                    # Parent of fork #1: exit immediately.
                    sys.exit(0)
                pm_log.debug("make_daemon: fork() #1 succeeded. pid[%d]" % os.getpid())
                pm_log.pid = os.getpid()
            except OSError, strerror:
                pm_log.error("make_daemon: fork() #1 error occurred.")
                pm_log.debug("make_daemon: fork() #1 error occurred. [%s]" % strerror)
                sys.exit(1)

            try:
                # Detach from the controlling terminal.
                os.setsid()
            except OSError, strerror:
                pm_log.error("make_daemon: setsid() error occurred.")
                pm_log.debug("make_daemon: setsid() error occurred. [%s]" % strerror)
                sys.exit(1)

            try:
                pid = os.fork()
                if pid > 0:
                    # Parent of fork #2: exit.
                    sys.exit(0)
                pm_log.debug("make_daemon: fork() #2 succeeded. pid[%d]" % os.getpid())
                pm_log.pid = os.getpid()
            except OSError, strerror:
                pm_log.error("make_daemon: fork() #2 error occurred.")
                pm_log.debug("make_daemon: fork() #2 error occurred. [%s]" % strerror)
                sys.exit(1)

        # Take the exclusive PID-file lock; a positive return value is
        # the PID of an already-running instance.
        ret = pidfile.lock()
        if ret > 0:
            print >> sys.stderr, "pm_logconv: already running [pid %d]" % ret
            pm_log.info("make_daemon: pm_logconv is already running [pid %d]" % ret)
            sys.exit(0)
        elif ret == pidfile.SYSTEM_ERROR:
            pm_log.info("make_daemon: couldn't start pm_logconv.")
            sys.exit(1)

        if self.daemonize:
            # Release the cwd, reset umask, and close the standard fds;
            # best-effort, so failures are deliberately ignored.
            try:
                os.chdir("/")
                os.umask(0)
                sys.stdin.close(); sys.stdin = None
                sys.stdout.close(); sys.stdout = None
                sys.stderr.close(); sys.stderr = None
                os.close(0)
                os.close(1)
                os.close(2)
            except:
                pass
        return True
1110+
1111+ '''
1112+ stop running pm_logconv.
1113+ return 0 : succeeded. or already stopped.
1114+ 1 : error occurred. it may not have stopped...
1115+ '''
    def logconv_stop(self, pidfile):
        # Read the PID file and terminate the running pm_logconv with
        # SIGTERM, then poll until the process actually disappears.
        logconv_pid = pidfile.read()
        if logconv_pid <= 0:
            # Negative values are PIDFile status codes.
            if logconv_pid == pidfile.SYSTEM_ERROR:
                pm_log.info("logconv_stop: couldn't try to stop pm_logconv.")
                return 1
            elif logconv_pid == pidfile.FILE_NOTEXIST:
                # No PID file: nothing to stop.
                pm_log.info("logconv_stop: couldn't try to stop pm_logconv.")
                return 0
            elif logconv_pid == pidfile.FILE_INVALID:
                pm_log.info("logconv_stop: couldn't try to stop pm_logconv.")
                return 1
            elif logconv_pid == pidfile.NOTRUNNING:
                pm_log.info("logconv_stop: pm_logconv already stopped.")
                return 0
            # Unknown non-positive value.
            return 1

        pm_log.info("logconv_stop: stopping pm_logconv with pid [%d]." % logconv_pid)
        try:
            os.kill(logconv_pid, signal.SIGTERM)

            # wait for the running pm_logconv to die.
            pm_log.info("logconv_stop: waiting for pid [%d] to exit." % logconv_pid)

            # kill(pid, 0) raises OSError(ESRCH) once the process has
            # exited; that is the loop's exit condition.
            while 1:
                os.kill(logconv_pid, 0)
                time.sleep(1)
        except Exception, (errNo, strerror):
            if errNo != ESRCH:
                pm_log.warn("logconv_stop: pid %d not killed." % logconv_pid)
                pm_log.debug("logconv_stop: pid %d not killed. [%s]"
                    % (logconv_pid, strerror))
                return 1
            else:
                pm_log.info("logconv_stop: pid %ld exited." % logconv_pid)
                return 0
1152+
1153+ '''
1154+ get file descriptor which matched the contents of the status file
1155+ (read position of ha-log).
1156+ '''
    def get_fd(self, statfile):
        # Open the log file to convert and seek to the resume position.
        # In continuous mode the position (inode + offset) is restored
        # from the status file; otherwise conversion starts at the end
        # of the current log. Returns an open file object, or None on
        # error / shutdown request.
        try:
            if self.is_continue:
                if statfile.read() and cstat.ino == 0:
                    pm_log.error("get_fd: status file doesn't exist.")

                if cstat.ino > 0:
                    if os.path.exists(HA_LOGFILE) and \
                        cstat.ino == os.stat(HA_LOGFILE)[ST_INO]:
                        log = HA_LOGFILE
                    else:
                        # ha-log's inode didn't match, logrotate?
                        # look for the file which inode matches.
                        for log in glob.glob(HA_LOGFILE + "?*"):
                            if cstat.ino == os.stat(log)[ST_INO]:
                                break
                        else:
                            # for-else: no rotated file has the inode.
                            pm_log.warn("get_fd: Pacemaker and Heartbeat log" \
                                "(inode:%d) doesn't exist." % cstat.ino)
                            log = None
                            statfile.clear_cstat()

                    if log != None:
                        f = open(log, 'r')
                        if os.fstat(f.fileno()).st_size >= cstat.offset:
                            f.seek(cstat.offset)
                        else:
                            # File shrank below the saved offset —
                            # probably truncated; restart from the top.
                            pm_log.warn("get_fd: there is possibility that " \
                                "Pacemaker and Heartbeat log was clear.")
                            pm_log.debug("get_fd: reset offset, since " \
                                "offset[%d] > file size[%d]"
                                % (cstat.offset, os.fstat(f.fileno()).st_size))
                            cstat.offset = 0
                            self.funcs.clear_status()
                        pm_log.info("get_fd: target to convert [%s(inode:%d)]"
                            % (log, cstat.ino))
                        return f

            # Fall-through: no resumable position; open the live log.
            if os.path.exists(HA_LOGFILE):
                f = open(HA_LOGFILE, 'r')
                if not self.is_continue:
                    # Present mode: skip everything already in the log.
                    f.seek(os.fstat(f.fileno()).st_size)
            else:
                # Wait for the log file to appear (or for shutdown).
                while not os.path.exists(HA_LOGFILE):
                    if do_shutdown:
                        return None
                    time.sleep(1)
                f = open(HA_LOGFILE, 'r')
            pm_log.info("get_fd: target to convert [%s(inode:%d)]"
                % (HA_LOGFILE, os.fstat(f.fileno()).st_ino))
            return f
        except Exception, strerror:
            pm_log.error("get_fd: I/O error occurred.")
            pm_log.debug("get_fd: I/O error occurred. [%s]" % strerror)
            statfile.clear_cstat()
            return None
1213+
1214+ '''
1215+ get the Pacemaker and Heartbeat log path, when `logrotate` occurs.
1216+ '''
1217+ def get_nextlog(self, ino, statfile):
1218+ try:
1219+ for log in glob.glob(HA_LOGFILE + "?*"):
1220+ pm_log.debug("get_nextlog: searching previous target[%s(inode:%d)]"
1221+ % (log, os.stat(log)[ST_INO]))
1222+ if ino == os.stat(log)[ST_INO]:
1223+ pm_log.debug("get_nextlog: searching.. found it[%s].size[%d]"
1224+ % (log, os.stat(log)[ST_SIZE]))
1225+ break
1226+ else:
1227+ pm_log.warn("get_nextlog: target(inode:%d) was lost. " \
1228+ "there is possibility that file was remove." % ino)
1229+ statfile.clear_cstat()
1230+ return None
1231+
1232+ except Exception, strerror:
1233+ pm_log.warn("get_nextlog: error occurred.")
1234+ pm_log.debug("get_nextlog: error occurred. [%s]" % strerror)
1235+ statfile.clear_cstat()
1236+ return None
1237+
1238+ '''
1239+ Check DC node is idle or not with crmadmin command.
1240+ When DC is idle, crmadmin returns "S_IDLE" status.
1241+ return: True -> DC is idle.
1242+ False -> DC is not idle.
1243+ None -> error occurs.
1244+ cannot execute command or maybe during DC election.
1245+ '''
    def is_idle(self):
        # Connection timeout (ms).
        # crmadmin command's default value is 30sec.
        TIMEOUT = 30 * 1000

        # Heartbeat status check
        if self.funcs.is_heartbeat() != True:
            return False

        # Get DC node name.
        options = ("-D -t %s" % (TIMEOUT))
        (status, output) = \
            self.funcs.exec_outside_cmd(CMD_CRMADMIN, options, False)
        if status == None:
            # Failed to exec command.
            pm_log.warn("is_idle(): failed to get DC node name.")
            return None
        if status != 0:
            # Maybe during DC election.
            return False
        try:
            # Assumes the DC node name is the last whitespace-separated
            # word of the crmadmin -D output.
            dcnode = output.split()[-1]
        except:
            # Failed to parse output strings.
            pm_log.warn("is_idle(): failed to parse output strings." +
                "(DC node name)")
            return None

        # Get DC status.
        options = ("-S %s -t %s" % (dcnode, TIMEOUT))
        (status, output) = \
            self.funcs.exec_outside_cmd(CMD_CRMADMIN, options, False)
        if status == None:
            # Failed to exec command.
            pm_log.warn("is_idle(): failed to get DC node status.")
            return None
        if status != 0:
            # Maybe during DC election.
            return False
        try:
            # Assumes the state token (e.g. "S_IDLE") is the
            # second-to-last word of the crmadmin -S output.
            dcstat = output.split()[-2]
        except:
            # Failed to parse output strings.
            pm_log.warn("is_idle(): failed to parse output strings." +
                "DC node status")
            return None
        if dcstat == "S_IDLE":
            return True
        return False
1295+
1296+ '''
1297+ Reset log convert status when Pacemaker doesn't output any log message
1298+ over RESET_INTERVAL sec.
1299+ Before reset process, check whether DC node is idle or not.
1300+ arg1 : signal number. for use this func as signal handler.
    arg2 : stack frame. for use this func as signal handler.
1302+ return nothing.
1303+ '''
    def check_dc_and_reset(self, signum, frame):
        # Called both periodically (signum == None) and as the SIGUSR1
        # handler. In the periodic case, do nothing until RESET_INTERVAL
        # seconds have passed since both the last converted log output
        # and the last reset attempt.
        if signum == None:
            now = datetime.datetime.now()
            if ((self.last_logoutput_t +
                datetime.timedelta(seconds=RESET_INTERVAL)) > now) or \
                ((self.last_reset_t +
                datetime.timedelta(seconds=RESET_INTERVAL)) > now):
                return
        if signum == None:
            self.last_reset_t = datetime.datetime.now()
        pm_log.debug("check_dc_and_reset(): try to reset log convert status.")
        self.funcs.debug_status()
        # Reset only when the cluster's DC node reports S_IDLE.
        ret = self.is_idle()
        if ret == True:
            self.funcs.clear_status()
            pm_log.debug("check_dc_and_reset(): " +
                "reset log convert status complete.")
            if statfile: statfile.write()
        elif ret == False:
            pm_log.debug("check_dc_and_reset(): DC node is not idle. " +
                "Avoid to reset log convert status.")
        elif ret == None:
            pm_log.error("check_dc_and_reset(): failed to check DC status. " +
                "Avoid to reset log convert status.")
        return
1329+
1330+ '''
1331+ Check a line of log message matched or not matched with each re-objects.
    NOTE: pattern strings which are written in a line (in an option which is
1333+ named "pattern*") are treated as "AND condition".
1334+ If one section has two or more options named "pattern*",
1335+ these are treated as "OR condition".
1336+ ex.)
1337+ pattern1 = aa, bb
1338+ pattern2 = cc, dd
1339+ means
1340+ "if (($0 ~ /aa/) && ($0 ~ /bb/) || ($0 ~ /cc/) && ($0 ~ /dd/))"
1341+ True : matched
1342+ False : not matched
1343+ None : error occurs.
1344+ '''
    def is_matched(self, logline, lconvfrm):
        # Inner lists of lconvfrm.ptnList are AND conditions, the outer
        # list is an OR condition (see the docstring above this method).
        matched = False
        for ptnobjList in lconvfrm.ptnList:
            # Matching with each re-object which came from strings
            # written in a option "pattern*"
            matchcnt = 0
            for ptnobj in ptnobjList:
                try:
                    if ptnobj.search(logline) != None:
                        matchcnt += 1
                except Exception, strerror:
                    # Error occurs.
                    pm_log.debug("is_matched(): %s" % (strerror))
                    return None
            if matchcnt == len(ptnobjList):
                # If the log message matched with all object in a pattern line,
                # it is a target log message to convert.
                matched = True
                break
            # If not matched with objects in a pattern line,
            # continue to check with the next line.
        return matched
1367+
1368+ '''
1369+ Check the log message is a target to convert or not
1370+ with all rules which are specified in config file.
1371+ and call specified function when a target log message appears.
1372+ return nothing
1373+ '''
    def do_ptn_matching(self, logline):
        # Try every conversion rule against the given log line and run
        # the rule's convert function for each match. Also drives the
        # failover-start detection and the auto-reset timestamp.
        setdate = True
        for lconvfrm in lconvRuleList:
            matched = self.is_matched(logline, lconvfrm)
            if matched == True:
                logelm = LogElements()
                if logelm.parse_logmsg(logline, self.funcs) != 0:
                    pm_log.error("do_ptn_matching(): " +
                        "failed to parse log message. [%s]" % (logline))
                    # Set the time of output log message for auto reset.
                    self.last_logoutput_t = datetime.datetime.now()
                    return # Break off converting this log message.
                # Set original date string and log level.
                outputobj = OutputConvertedLog()
                outputobj.set_datestr(logelm.datestr)
                outputobj.set_orgloglevel(logelm.haloglevel)
                outputobj.set_orglogmsg(logelm.halogmsg)

                # Call specified function.
                try:
                    pm_log.debug("do_ptn_matching(): execute %s()." %
                        (lconvfrm.func))
                    ret = getattr(self.funcs, lconvfrm.func)(\
                        outputobj, logelm, lconvfrm)
                except Exception, strerror:
                    pm_log.error("do_ptn_matching(): " +
                        "failed to execute %s()." % (lconvfrm.func))
                    pm_log.debug("do_ptn_matching(): %s" % (strerror))
                    continue # To check next rule.

                if ret == CONV_OK:
                    # conversion succeeded.
                    # If the log is a trigger of FailOver, tell to funcs.
                    if lconvfrm.fotrigger:
                        cstat.FAILURE_OCCURRED = lconvfrm.fotrigger
                    # FailOver patterns (failure kind + resource action):
                    #   resource failure + resource move
                    #   score failure    + resource move
                    #   node failure     + resource start
                    #   resource failure + resource stop
                    #   score failure    + resource stop
                    #   node failure     + resource stopped
                    if \
                    (cstat.FAILURE_OCCURRED == FAIL_RSC and cstat.ACTRSC_MOVE == FAIL_MOVE) or \
                    (cstat.FAILURE_OCCURRED == FAIL_SCORE and cstat.ACTRSC_MOVE == FAIL_MOVE) or \
                    (cstat.FAILURE_OCCURRED == FAIL_NODE and cstat.ACTRSC_MOVE == FAIL_STR) or \
                    (cstat.FAILURE_OCCURRED == FAIL_RSC and cstat.ACTRSC_MOVE == FAIL_STP) or \
                    (cstat.FAILURE_OCCURRED == FAIL_SCORE and cstat.ACTRSC_MOVE == FAIL_STP) or \
                    (cstat.FAILURE_OCCURRED == FAIL_NODE and cstat.ACTRSC_MOVE == FAIL_STPD):
                        self.funcs.detect_fo_start(outputobj)
                    if lconvfrm.ignoremsg:
                        setdate = False
                elif ret == CONV_SHUT_NODE:
                    # Message from a node known to be shutting down.
                    continue
                else:
                    # Conversion failed: build an error message per code.
                    if ret == CONV_PARSE_ERROR:
                        errmsg = ("%s(): " % (lconvfrm.func) +
                            "failed to parse log message. [%s]" %
                            (logelm.halogmsg))
                    elif ret == CONV_ITEM_EMPTY:
                        errmsg = ("%s(): " % (lconvfrm.func) +
                            "invalid log message format. [%s]" %
                            (logelm.halogmsg))
                    elif ret == CONV_GETINFO_ERROR:
                        errmsg = ("%s(): " % (lconvfrm.func) +
                            "failed to get some information to output log. " +
                            "[%s]" % (logelm.halogmsg))
                    else:
                        errmsg = ("%s(): " % (lconvfrm.func) +
                            "unknown error occurred. " +
                            "[%s]" % (logelm.halogmsg))
                    # When log convertion failed, output original message.
                    pm_log.error(errmsg)
                    outputobj.output_log(lconvfrm.loglevel, None)
            elif matched == None:
                pm_log.error("do_ptn_matching(): " +
                    "pattern matching about [%s] failed." %
                    (lconvfrm.rulename))
            else:
                # Not matched.
                pass
        #__for lconvfrm in lconvRuleList: (check next rule)

        # Set the time of output log message for auto reset.
        if setdate:
            self.last_logoutput_t = datetime.datetime.now()
        return
1461+
    '''
        Read the Pacemaker and Heartbeat log and convert it.
        Tails HA_LOGFILE forever, following rotation, until a shutdown
        request (do_shutdown) is observed.
        return: 0 -> shutdown was requested.
                1 -> error occurred.
    '''
    def convert(self):
        global statfile
        try:
            # The status file persists the inode and offset of the last
            # converted position so conversion can resume after a restart.
            statfile = StatusFile(self.STATFILE)
            logfile = self.get_fd(statfile)
            if logfile == None:
                if do_shutdown:
                    return 0
                return 1
            cstat.ino = os.fstat(logfile.fileno()).st_ino

            while 1:
                logline = logfile.readline()
                cstat.offset = logfile.tell()

                if not logline:
                    # EOF: no new line appeared since the last read.
                    self.check_dc_and_reset(None, None)

                    # Persist the position only when it moved since the
                    # last write, to avoid needless disk traffic.
                    if cstat.ino != statfile.w_ino or \
                        cstat.offset != statfile.w_offset:
                        statfile.write()

                    # File shrank below our offset: it was probably
                    # truncated/cleared, so restart from the beginning.
                    if os.fstat(logfile.fileno()).st_size < cstat.offset:
                        pm_log.warn("convert: there is possibility that " \
                            "Pacemaker and Heartbeat log was clear.")
                        pm_log.debug("convert: reset offset, since " \
                            "offset[%d] > file size[%d]" % (cstat.offset,
                            os.fstat(logfile.fileno()).st_size))
                        logfile.seek(0)
                        cstat.offset = 0
                        self.funcs.clear_status()
                        statfile.write()

                    # Still tailing the live log file (same inode):
                    # wait for new data.
                    if os.path.exists(HA_LOGFILE) and \
                        cstat.ino == os.stat(HA_LOGFILE)[ST_INO]:
                        if do_shutdown:
                            logfile.close()
                            return 0
                        time.sleep(1)
                        continue
                    # Inode changed: the log rotated while we were reading.
                    logfile.close()

                    # Find the rotated successor; if none, wait for the
                    # live log file to reappear.
                    path = self.get_nextlog(cstat.ino, statfile)
                    if path == None:
                        path = HA_LOGFILE
                        while not os.path.exists(path):
                            if do_shutdown:
                                return 0
                            time.sleep(1)
                    pm_log.info("convert: change target[%s(inode:%d)]"
                        % (path, os.stat(path)[ST_INO]))
                    logfile = open(path, 'r')
                    cstat.ino = os.fstat(logfile.fileno()).st_ino
                else:
                    # Got a full line: run it through the conversion rules
                    # and persist the new position.
                    self.do_ptn_matching(logline)
                    statfile.write()
        except Exception, strerror:
            pm_log.error("convert: error occurred.")
            pm_log.debug("convert: error occurred. [%s]" % strerror)
            return 1
1525+
1526+ '''
1527+ main method.
1528+ '''
1529+ def main(self):
1530+ signal.alarm(0)
1531+ pidfile = PIDFile(self.PIDFILE)
1532+
1533+ if self.ask_status:
1534+ ret = pidfile.read()
1535+ if ret > 0:
1536+ pm_log.info("status: pm_logconv is running [pid = %d]" % ret)
1537+ return 0
1538+ elif ret == pidfile.FILE_NOTEXIST or ret == pidfile.NOTRUNNING:
1539+ pm_log.info("status: pm_logconv is stopped.")
1540+ return 1
1541+ else:
1542+ pm_log.info("status: couldn't check status of pm_logconv.")
1543+ return 2
1544+
1545+ if self.stop_logconv:
1546+ return self.logconv_stop(pidfile)
1547+
1548+ self.make_daemon(pidfile)
1549+ time.sleep(1)
1550+ pm_log.info("started: pid[%d], ppid[%d], pgid[%d]"
1551+ % (os.getpid(), os.getppid(), os.getpgrp()))
1552+ return self.convert()
1553+
1554+class LogElements:
1555+ def __init__(self, procname=None, datestr=None,
1556+ haloglevel=None, halogmsg=None):
1557+ self.procname = procname
1558+ self.datestr = datestr
1559+ self.haloglevel = haloglevel
1560+ self.halogmsg = halogmsg
1561+
1562+ '''
1563+ Divide ha-log message into process-name, date-string, log-level, and
1564+ log-message.
1565+ arg1 : a line of log message.
1566+ return: 0 -> succeeded.
1567+ 0 > -> error occurrs.
1568+ '''
1569+ def parse_logmsg(self, logline, funcs):
1570+ SYSFMT_PROC_POS = 4
1571+ SYSFMT_DATE_START_POS = 0
1572+ SYSFMT_DATE_END_POS = 2 + 1
1573+ SYSFMT_LOGLV_POS = 6
1574+
1575+ HBFMT_PROC_POS = 0
1576+ HBFMT_DATE_POS = 1
1577+ HBFMT_LOGLV_POS = 2
1578+
1579+ try:
1580+ elementList = logline.split()
1581+ if elementList[0].isalpha():
1582+ # Case of syslogmsgfmt = True (default)
1583+ pm_log.debug("parse log message as syslog format.")
1584+ self.datestr = ' '.join(elementList[SYSFMT_DATE_START_POS:SYSFMT_DATE_END_POS])
1585+ self.procname = funcs.trimmark(elementList[SYSFMT_PROC_POS])
1586+ self.haloglevel = funcs.trimmark(elementList[SYSFMT_LOGLV_POS])
1587+ msgpos = SYSFMT_LOGLV_POS + 1
1588+ self.halogmsg = ' '.join(elementList[msgpos:]).strip()
1589+ else:
1590+ # Case of syslogmsgfmt = False
1591+ pm_log.debug("parse log message as ha-log format.")
1592+ self.procname = elementList[HBFMT_PROC_POS].split('[')[0]
1593+ self.datestr = elementList[HBFMT_DATE_POS]
1594+ self.haloglevel = funcs.trimmark(elementList[HBFMT_LOGLV_POS])
1595+ msgpos = HBFMT_LOGLV_POS + 1
1596+ self.halogmsg = ' '.join(elementList[msgpos:])
1597+
1598+ return 0
1599+ except Exception, strerror:
1600+ pm_log.debug("parse_logmsg(): %s" % (strerror))
1601+ return -1
1602+
1603+ '''
1604+ Only for debug.
1605+ '''
1606+ def print_logelements(self):
1607+ print self.procname
1608+ print self.datestr
1609+ print self.haloglevel
1610+ print self.halogmsg
1611+
'''
    Class for output converted log message.
'''
class OutputConvertedLog:
    def __init__(self, datestr=None, loglevel=None, logmsg=None):
        self.datestr = datestr
        self.loglevel = loglevel
        self.logmsg = logmsg
        # Month-number <-> month-name tables used by the two
        # date-format converters below.
        self.monthnumDic = {
            '01':'Jan',
            '02':'Feb',
            '03':'Mar',
            '04':'Apr',
            '05':'May',
            '06':'Jun',
            '07':'Jul',
            '08':'Aug',
            '09':'Sep',
            '10':'Oct',
            '11':'Nov',
            '12':'Dec'
        }
        self.monthstrDic = {
            'Jan':'01',
            'Feb':'02',
            'Mar':'03',
            'Apr':'04',
            'May':'05',
            'Jun':'06',
            'Jul':'07',
            'Aug':'08',
            'Sep':'09',
            'Oct':'10',
            'Nov':'11',
            'Dec':'12'
        }

    '''
        Normalize and store the date string of the log to output.
        The target format depends on the module-level SYSLOGFORMAT flag.
        On conversion failure the original string is kept as-is.
    '''
    def set_datestr(self, datestr):
        if SYSLOGFORMAT:
            tmp_datestr = self.to_syslog_dateformat(datestr)
        else:
            tmp_datestr = self.to_halog_dateformat(datestr)

        if tmp_datestr != None:
            self.datestr = tmp_datestr
        else:
            pm_log.error("set_datestr(): " +
                "invalid date format. [%s] " % (datestr) +
                "output in original format.")
            self.datestr = datestr

    # Loglevel of the original (unconverted) log line.
    def set_orgloglevel(self, loglevel):
        self.orgloglevel = loglevel

    # Message body of the original (unconverted) log line.
    def set_orglogmsg(self, logmsg):
        self.orglogmsg = logmsg

    '''
        Output log message.
        loglevel and log message is variable, but date is not
        (output original log's date).
        arg1 : loglevel string (None -> use the original's level).
        arg2 : log message (None -> use the original message).
        return: 0 -> succeeded.
                0 > -> error occurrs.
    '''
    def output_log(self, convloglevel, convlogmsg):
        output_loglevel = self.orgloglevel
        if convloglevel != None:
            output_loglevel = convloglevel
        output_logmsg = self.orglogmsg
        if convlogmsg != None:
            output_logmsg = convlogmsg

        try:
            outputstr = ("%s %s %s: %s" %
                (self.datestr, HOSTNAME, output_loglevel, output_logmsg))
            # Appended line by line; OUTPUTFILE is a module-level path.
            f = open(OUTPUTFILE, 'a')
            f.write("%s\n" % (outputstr))
            f.close()
        except Exception as strerror:
            pm_log.error("output_log(): " +
                "failed to output converted log message. [%s]" %
                (outputstr))
            pm_log.debug("output_log(): %s" % (strerror))
            return -1
        return 0

    '''
        Convert dateformat form ha-log format to syslog format.
        "2009/01/01_00:00:00" -> "Jan 1 00:00:00"
        arg1 : date string of ha-log format.
        return : date string which is converted to syslog format.
            None -> error occurs.
    '''
    def to_syslog_dateformat(self, orgdatestr):
        DATE_POS = 0    #YYYY/MM/DD
        TIME_POS = 1    #hh:mm:ss
        MONTH_POS = 1   #MM
        DAY_POS = 2     #DD

        if orgdatestr.split()[0].isalpha():
            pm_log.debug("It seems already syslog date format.")
            return orgdatestr

        try:
            datestr = orgdatestr.split('_')[DATE_POS].strip()
            timestr = orgdatestr.split('_')[TIME_POS].strip()
            if datestr == "" or timestr == "":
                return None

            monthstr = datestr.split('/')[MONTH_POS].strip()
            daystr = datestr.split('/')[DAY_POS].strip().lstrip('0')
            if monthstr == "" or daystr == "":
                return None
            # BUGFIX: was "if monthstr in self.monthnumDic == False:",
            # a chained comparison that is always False, so invalid
            # months fell through to a KeyError below.
            if monthstr not in self.monthnumDic:
                return None
            monthstr = self.monthnumDic[monthstr]
            syslog_datestr = ("%s %s %s" % (monthstr, daystr, timestr))
            return syslog_datestr
        except Exception as strerror:
            pm_log.debug("to_syslog_dateformat(): %s" % (strerror))
            return None

    '''
        Convert dateformat form syslog format to ha-log format.
        "Jan 1 00:00:00" -> "2009/01/01_00:00:00"
        Syslog dates carry no year, so the current year is assumed;
        if that would place the date in the future, the previous year
        is used instead.
        arg1 : date string of syslog format.
        return : date string which is converted to ha-log original format.
            None -> error occurs.
    '''
    def to_halog_dateformat(self, orgdatestr):
        MONTH_POS = 0
        DAY_POS = 1
        TIME_POS = 2

        strList = orgdatestr.split()
        if strList[0].isalpha() == False:
            pm_log.debug("It seems already ha-log date format.")
            return orgdatestr
        try:
            monthstr = strList[MONTH_POS].strip()
            daystr = strList[DAY_POS].strip()
            timestr = strList[TIME_POS].strip()
            if monthstr == "" or daystr == "" or timestr == "":
                return None
            # BUGFIX: same always-False chained comparison as in
            # to_syslog_dateformat(); unknown month names now return None.
            if monthstr not in self.monthstrDic:
                return None
            monthstr = self.monthstrDic[monthstr]
            now = datetime.datetime.now()
            yearstr = str(now.timetuple().tm_year)
            hblog_datestr = ("%s/%s/%02d_%s" %
                (yearstr, monthstr, int(daystr), timestr))

            # If date string is future, minus year value.
            hblog_date = datetime.datetime(\
                *time.strptime(hblog_datestr, "%Y/%m/%d_%H:%M:%S")[0:6])
            if hblog_date > now:
                year = int(yearstr) - 1
                hblog_datestr = hblog_datestr.replace(yearstr, str(year), 1)

            return hblog_datestr
        except Exception as strerror:
            pm_log.debug("to_halog_dateformat(): %s" % (strerror))
            return None
1777+
'''
    Class to hold resource status in F/O process.
'''
class RscStat:
    '''
        rscid    : resource id.
        status   : [Started on node|Stopped]
        fofailed : True  -> F/O failed. ("cannot run anywhere" appeared.)
                   False -> "cannot run anywhere" didn't appear.
        unmanaged: True  -> resource is unmanaged.
                   False -> resource is managed.
    '''
    def __init__(self, rscid=None, status=None, fofailed=False,
        unmanaged=False):
        self.rscid = rscid
        self.status = status
        self.fofailed = fofailed
        self.unmanaged = unmanaged

    ''' operator eq: two entries are equal iff their resource ids match '''
    def __eq__(self, other):
        return self.rscid == other.rscid

    ''' replace status and flags: copy only fields carrying a truthy value '''
    def replace(self, new):
        for attr in ('status', 'fofailed', 'unmanaged'):
            val = getattr(new, attr)
            if val:
                setattr(self, attr, val)

    '''
        Only for debug.
    '''
    def print_rscstat(self):
        print ("rsc:%s\tstatus:%s\tfofailed:%s\tunmanaged:%s\t" %
            (self.rscid, self.status, self.fofailed, self.unmanaged))
1819+
'''
    Return codes for functions to convert log.
'''
CONV_SHUT_NODE = 1      #shutdown list existed (the line is skipped by the caller).
CONV_OK = 0             #log conversion succeeded.
CONV_PARSE_ERROR = -1   #failed to parse log message.
CONV_ITEM_EMPTY = -2    #parsing succeeded, but some gotten items are empty.
CONV_GETINFO_ERROR = -3 #failed to get info which is required to conversion.
1828+'''
1829+ Class for functions to convert log message.
1830+ convert-functions' arguments are:
1831+ arg1: outputobj -> object for output converted log.
1832+ arg2: logelm -> elements which constructs target log. date, msg etc.
1833+ arg3: lconvfrm -> info for conversion. loglevel, F/Otrigger etc.
1834+ return codes are:
1835+ [CONV_OK|CONV_PARSE_ERROR|CONV_ITEM_EMPTY|CONV_GETINFO_ERROR]
1836+ See the head of this file.
1837+'''
class LogConvertFuncs:
    # Log-level strings used when emitting converted messages.
    LOG_ERR_LV = "ERROR"
    LOG_WARN_LV = "WARN"
    LOG_INFO_LV = "info"
    LOG_DEBUG_LV = "debug"
1843+
1844+ def __init__(self, rscstatList=None):
1845+ # This list is used only in F/O process.
1846+ # If hg_logconv exits abnormally during parsing F/O process's log,
1847+ # read from start of F/O, so it doesn't need to output status file.
1848+ self.rscstatList = rscstatList
1849+ self.rscstatList = list()
1850+
1851+ '''
1852+ Check Heartbeat service is active or dead.
1853+ return: True -> active
1854+ False -> dead
1855+ None -> error occurs.
1856+ '''
1857+ def is_heartbeat(self):
1858+ # Get DC node name.
1859+ status = self.exec_outside_cmd("service", "heartbeat status", False)[0]
1860+ if status == None:
1861+ # Failed to exec command.
1862+ pm_log.warn("is_heartbeat(): failed to get status.")
1863+ return None
1864+ if status != 0:
1865+ # Maybe during DC election.
1866+ return False
1867+ return True
1868+
1869+ '''
1870+ triming mark from value.
1871+ '''
1872+ def trimmark(self, word, minus=None):
1873+ marklist = "(),.;:[]=<>'"
1874+ if minus:
1875+ markset = set(marklist) - set(minus)
1876+ marklist = "".join(markset)
1877+ trimword = word.translate(string.maketrans("",""),marklist)
1878+ return trimword
1879+
1880+ '''
1881+ Check specified strings are empty or not.
1882+ arg* : target strings.
1883+ return : True -> there is at least an empty string
1884+ in specified strings.
1885+ False -> there is no empty string in specified strings.
1886+ '''
1887+ def is_empty(self, *args):
1888+ for arg in args:
1889+ if arg == "":
1890+ return True
1891+ return False
1892+
1893+ '''
1894+ Get node dictionary from hostcache.
1895+ the dic's key is uuid, and its value is nodename.
1896+ return : node dictionary in the cluster.
1897+ None -> error occurs.
1898+ '''
1899+ def get_nodedic(self):
1900+ HOSTNAME_POS = 0
1901+ UUID_POS = 1
1902+
1903+ nodeDic = dict()
1904+ try:
1905+ f = open (HOSTCACHE, 'r')
1906+ while 1:
1907+ nodeinfo = f.readline()
1908+ if not nodeinfo:
1909+ break
1910+ else:
1911+ nodename = nodeinfo.split()[HOSTNAME_POS]
1912+ uuid = nodeinfo.split()[UUID_POS]
1913+ nodeDic[uuid] = nodename
1914+ f.close()
1915+ except:
1916+ pm_log.error("get_nodedic(): " +
1917+ "failed to get node list from hostcache [%s]." % (HOSTCACHE))
1918+ return None
1919+ return nodeDic
1920+
1921+ '''
1922+ Get nodename from uuid.
1923+ arg1 : target uuid.
1924+ return : name string of the node which has specified uuid.
1925+ None -> error occurs.
1926+ '''
1927+ def get_nodename(self, uuid):
1928+ nodeDic = self.get_nodedic()
1929+ if nodeDic == None:
1930+ return None
1931+ if uuid not in nodeDic.keys():
1932+ return None
1933+ return nodeDic[uuid]
1934+
1935+ '''
1936+ Parse operation id (resourceid_opname_interval)
1937+ arg1 : operationid
1938+ return : resourceid, opname, interval
1939+ '''
1940+ def parse_opid(self, opid):
1941+ # please detect parse error in caller.
1942+ tmp = opid.split('_')
1943+ rscid = '_'.join(tmp[:-2])
1944+ op = tmp[-2]
1945+ interval = tmp[-1]
1946+ return rscid, op, interval
1947+
    '''
        Execute commandline command.
        Resolves the command with "which", checks it is executable, runs
        it, and interprets the exit status.
        NOTE(review): relies on the Python-2-only "commands" module.
        arg1 : command name to execute.
        arg2 : command options.
        arg3 : check return code or not (True -> non-zero rc is an error).
        return : [status, output]
            status -> exit status (rc) of the command.
            output -> output strings of the command.
            None -> error occurs.
    '''
    def exec_outside_cmd(self, cmdname, options, checkrc):
        # Get full path of specified command.
        try:
            status, cmdpath = \
                commands.getstatusoutput("which " + cmdname)
        except Exception, strerror:
            pm_log.error("exec_outside_cmd(): " +
                "failed to execute which command to get command path. " +
                "[%s]" % (cmdname))
            pm_log.debug("exec_outside_cmd(): %s" % (strerror))
            return None, None
        # "which" failed or exited abnormally -> command unknown.
        if (os.WIFEXITED(status) == False or os.WEXITSTATUS(status) != 0):
            pm_log.error("exec_outside_cmd(): " +
                "failed to get command path. [%s]" % (cmdname))
            return None, None

        # Check whether it is able to execute the command.
        if os.access(cmdpath, os.F_OK | os.X_OK) == False:
            return None, None

        # Execute command.
        exec_cmd = ("%s %s" % (cmdpath, options))
        pm_log.debug("exec_outside_cmd(): " +
            "execute command. [%s]" % (exec_cmd))
        try:
            status, output = commands.getstatusoutput(exec_cmd)
        except Exception, strerror:
            pm_log.error("exec_outside_cmd(): " +
                "failed to exec command. [%s]" % (exec_cmd))
            pm_log.debug("exec_outside_cmd(): %s" % (strerror))
            return None, None

        # Check return status: status is a raw wait()-style status word,
        # so decode it with os.WIFEXITED/os.WEXITSTATUS.
        if os.WIFEXITED(status) == False:
            pm_log.error("exec_outside_cmd(): " +
                "command [%s] exited abnormally. (status=%s)" %
                (exec_cmd, status))
            return None, None
        rc = os.WEXITSTATUS(status)
        if checkrc == True and rc != 0:
            pm_log.warn("exec_outside_cmd(): " +
                "command [%s] returns error. (rc=%s, msg=\"%s\")" %
                (exec_cmd, rc, output))
            return None, None
        return rc, output
2003+
2004+ '''
2005+ Compare specified attribute's value with specified value.
2006+ Operations to compare is [lt|gt|le|ge|eq|ne].
2007+ arg1 : target attribute name.
2008+ arg2 : operation to compare.
2009+ arg3 : the value to compare with current attribute value.
2010+ arg4 : node name which has the attribute.
2011+ return : (result_of_comparision, current_attr_val)
2012+ result_of_comparision:
2013+ True -> matched.
2014+ False -> not matched.
2015+ None -> error occurs or attribute doesn't exist.
2016+ '''
2017+ def check_attribute(self, attrname, op, attrval, node):
2018+
2019+ # Execute command.
2020+ options = ("-G -U %s -t status -n %s" % (node, attrname))
2021+ (status, output) = \
2022+ self.exec_outside_cmd(CMD_CRM_ATTR, options, False)
2023+ if status == None:
2024+ # Failed to exec command, or
2025+ # The node is dead, or
2026+ # Specified attribute doesn't exist.
2027+ pm_log.warn("check_attribute(): " +
2028+ "failed to get %s's value." % (attrname))
2029+ return None, None
2030+
2031+ pm_log.debug("check_attribute(): " +
2032+ "%s's status[%s] output[%s] node[%s] attr[%s]" %
2033+ (CMD_CRM_ATTR, status, output, node, attrname))
2034+
2035+ if status != 0:
2036+ # crm_attribute returns error value.
2037+ # Maybe local node is shutting down.
2038+ return None, None
2039+ # In normal case, crm_attribute command shows like the following.
2040+ # " name=default_ping_set value=100"
2041+ # So parse it to get current attribute value.
2042+ try:
2043+ valuepos = output.index('value=')
2044+ currentval = output[valuepos + len('value='):].strip()
2045+ result = getattr(operator, op)(currentval, attrval)
2046+ except:
2047+ pm_log.error("check_attribute(): " +
2048+ "failed to comparison %s's value. " % (attrname) +
2049+ "(currentval=%s, op=%s, specifiedval=%s)" %
2050+ (currentval, op, attrval))
2051+ return None, None
2052+ return result, currentval
2053+
2054+ '''
2055+ Check the specified node is ping node or not.
2056+ To get ping node information, parse ha.cf.
2057+ arg1 : target node name.
2058+ return : True -> the node is ping node.
2059+ False -> the node is not ping node.
2060+ None -> error occurs.
2061+ '''
2062+ def is_pingnode(self, nodename):
2063+ pingnodeList = list()
2064+ # parse ha.cf to get ping nodes.
2065+ try:
2066+ if os.access(HACFFILE, os.F_OK | os.R_OK) == False:
2067+ pm_log.error("is_pingnode(): " +
2068+ "failed to read ha.cf file. [%s]" % (HACFFILE))
2069+ return None
2070+
2071+ cf = open(HACFFILE, 'r')
2072+ for line in cf:
2073+ wordList = line.split()
2074+ if len(wordList) < 1:
2075+ # Ignore empty line.
2076+ continue
2077+ if wordList[0] == "ping":
2078+ pingnodeList.extend(wordList[1:])
2079+ elif wordList[0] == "ping_group":
2080+ pingnodeList.extend(wordList[2:])
2081+ else:
2082+ pass
2083+ cf.close()
2084+ except:
2085+ pm_log.error("is_pingnode(): " +
2086+ "failed to parse ha.cf file. [%s]" % (HACFFILE))
2087+ return None
2088+
2089+ if nodename in pingnodeList:
2090+ return True
2091+
2092+ return False
2093+
    '''
        Get online node from command.
        Lists cluster members via crm_node -p, then keeps only nodes
        whose "standby" attribute is off.
        return : set of active (non-standby) node names in the cluster.
                 Empty set when Heartbeat is not running.
            None -> error occurs.
    '''
    def get_onlinenode(self):
        onlineset = set()
        ret = self.is_heartbeat()
        if ret == None:
            return ret
        elif ret == False:
            # Heartbeat is down: no node can be online.
            return onlineset
        # "-p" lists the partition's member node names.
        options = ("-p")
        (status, nodelist) = self.exec_outside_cmd(CMD_CRM_NODE, options, False)
        if status == None:
            # Failed to exec command.
            pm_log.warn("get_onlinenode(): failed to get active nodelist.")
            return None

        for nodename in nodelist.split():
            # Query the node's "standby" attribute (default "off").
            options = ("-N %s -n standby -G -l forever -d off" % (nodename))
            (status, output) = self.exec_outside_cmd(CMD_CRM_ATTR, options, False)
            if status == None:
                # Failed to exec command.
                pm_log.warn("get_onlinenode(): failed to get online nodelist.")
                return None
            # Output contains "... value=<on|off>"; parse out the value.
            standby = output[output.index("value"):]
            if standby.split("=")[1] == "off":
                onlineset.add(nodename)
        pm_log.debug("get_onlinenode(): node %s is online node." % (list(onlineset)))
        return onlineset
2125+
2126+ '''
2127+ Set specified values to RscStat object list.
2128+ If the same rscid is already in the list, update the elements' value.
2129+ If not, append the new RscStat object to the list.
2130+ When the arg's value is None, don't update the element's value.
2131+
2132+ arg1 : resource id.
2133+ arg2 : the rsc's status. [Started on node|Stopped]
2134+ arg3 : the rsc's F/O failed or not. (depends on "cannot run anywhere")
2135+ arg4 : the rsc is managed or not.
2136+ return Nothing.
2137+ '''
2138+ def set_rscstat(self, rscid, statstr, fofailed, unmanaged):
2139+ newrsc = RscStat(rscid,statstr,fofailed,unmanaged)
2140+ if newrsc in self.rscstatList:
2141+ idx = self.rscstatList.index(newrsc)
2142+ self.rscstatList[idx].replace(newrsc)
2143+ else:
2144+ self.rscstatList.append(newrsc)
2145+
2146+ '''
2147+ Debug print for ConvertStatus (exclude ino and offset).
2148+ '''
2149+ def debug_status(self):
2150+ pm_log.debug("debug_status(): FAIL[%s], IN_CALC[%s], "\
2151+ "RSC_MOVE[%s], IN_FO[%s], Rscop%s, Node%s" %
2152+ (cstat.FAILURE_OCCURRED, cstat.IN_CALC,
2153+ cstat.ACTRSC_MOVE, cstat.IN_FO_PROCESS,
2154+ list(cstat.timedoutRscopSet), list(cstat.shutNodeSet)))
2155+
2156+ '''
2157+ Clear ConvertStatus (exclude ino and offset).
2158+ '''
2159+ def clear_status(self):
2160+ pm_log.debug("clear_status():" +
2161+ "clear convert status (exclude ino and offset).")
2162+ self.debug_status()
2163+ cstat.FAILURE_OCCURRED = False
2164+ cstat.IN_CALC = False
2165+ cstat.ACTRSC_MOVE = False
2166+ cstat.IN_FO_PROCESS = False
2167+ cstat.timedoutRscopSet = set()
2168+ cstat.shutNodeSet = set()
2169+ self.debug_status()
2170+
    ##########
    # General-purpose functions.
    ##########
    '''
        Output original ha-log message.
        Passes the message through unchanged; only the loglevel may be
        overridden by the matching rule.
    '''
    def output_original_log(self, outputobj, logelm, lconvfrm):
        # Output original log message
        outputobj.output_log(lconvfrm.loglevel, None)
        return CONV_OK
2181+
    '''
        Output static message.
        This function just outputs section name (the rule's name) as the
        converted message.
    '''
    def output_static_msg(self, outputobj, logelm, lconvfrm):
        # Output rulename (= section name).
        outputobj.output_log(lconvfrm.loglevel, lconvfrm.rulename)
        return CONV_OK
2190+
2191+ ##########
2192+ # For Resource event.
2193+ ##########
2194+ '''
2195+ Convert log message which means HB tries to operate.
2196+ This function is common for OCF resource's start, stop, promote, demote
2197+ and STONITH resource's start, stop.
2198+ NOTE: monitor operation is not a target.
2199+
2200+ MsgNo.1-1)
2201+ Jan 6 14:16:27 x3650a crmd: [9874]: info: do_lrm_rsc_op: Performing key=17:2:0:dae9d86d-9c4b-44f2-822c-b559db044ba2 op=prmApPostgreSQLDB_start_0 )
2202+ MsgNo.2-1)
2203+ Jan 6 15:05:00 x3650a crmd: [9874]: info: do_lrm_rsc_op: Performing key=20:7:0:dae9d86d-9c4b-44f2-822c-b559db044ba2 op=prmApPostgreSQLDB_stop_0 )
2204+ MsgNo.4-1)
2205+ Jan 12 18:34:51 x3650a crmd: [15901]: info: do_lrm_rsc_op: Performing key=32:13:0:9d68ec4b-527f-4dda-88b3-9203fef16f56 op=prmStateful:1_promote_0 )
2206+ MsgNo.5-1)
2207+ Jan 12 18:34:49 x3650a crmd: [3464]: info: do_lrm_rsc_op: Performing key=35:11:0:9d68ec4b-527f-4dda-88b3-9203fef16f56 op=prmStateful:0_demote_0 )
2208+ MsgNo.17-1)
2209+ Jan 7 10:21:41 x3650a crmd: [25493]: info: do_lrm_rsc_op: Performing key=35:1:0:683d57a3-6623-46ae-bbc9-6b7930aec9c2 op=prmStonith2-3_start_0 )
2210+ MsgNo.18-1)
2211+ Jan 7 10:22:11 x3650a crmd: [25493]: info: do_lrm_rsc_op: Performing key=30:5:0:683d57a3-6623-46ae-bbc9-6b7930aec9c2 op=prmStonith2-3_stop_0 )
2212+ '''
2213+ def try_to_operate(self, outputobj, logelm, lconvfrm):
2214+ try:
2215+ # In the case of example above, tmp's value is
2216+ # "op=master_slave_Stateful0:1_promote_0".
2217+ tmp = logelm.halogmsg.split()[3]
2218+ # remove "op=" at the head.
2219+ opid = tmp[3:]
2220+ rscid, op = self.parse_opid(opid)[:2]
2221+ except:
2222+ return CONV_PARSE_ERROR
2223+ if self.is_empty(rscid, op):
2224+ return CONV_ITEM_EMPTY
2225+
2226+ convertedlog = ("Resource %s tries to %s." % (rscid, op))
2227+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
2228+ return CONV_OK
2229+
2230+ '''
2231+ Convert log message which means HB succeeded in operation.
2232+ This function is common for OCF resource's start, stop, promote, demote
2233+ and STONITH resource's start, stop.
2234+ NOTE: monitor operation is not a target.
2235+
2236+ MsgNo.1-2)
2237+ Jan 6 14:16:28 x3650a crmd: [9874]: info: process_lrm_event: LRM operation prmApPostgreSQLDB_start_0 (call=25, rc=0, cib-update=69, confirmed=true) ok
2238+ MsgNo.2-2)
2239+ Jan 6 15:05:01 x3650a crmd: [9874]: info: process_lrm_event: LRM operation prmApPostgreSQLDB_stop_0 (call=27, rc=0, cib-update=79, confirmed=true) ok
2240+ MsgNo.4-2)
2241+ Jan 12 18:34:51 x3650a crmd: [15901]: info: process_lrm_event: LRM operation prmStateful:1_promote_0 (call=18, rc=0, cib-update=27, confirmed=true) ok
2242+ MsgNo.5-2)
2243+ Jan 12 18:34:49 x3650a crmd: [3464]: info: process_lrm_event: LRM operation prmStateful:0_demote_0 (call=37, rc=0, cib-update=79, confirmed=true) ok
2244+ MsgNo.17-2)
2245+ Jan 7 10:21:41 x3650a crmd: [25493]: info: process_lrm_event: LRM operation prmStonith2-3_start_0 (call=11, rc=0, cib-update=42, confirmed=true) ok
2246+ MsgNo.18-2)
2247+ Jan 7 10:22:11 x3650a crmd: [25493]: info: process_lrm_event: LRM operation prmStonith2-3_stop_0 (call=34, rc=0, cib-update=71, confirmed=true) ok
2248+ '''
2249+ def operation_succeeded(self, outputobj, logelm, lconvfrm):
2250+ completeopDic = {
2251+ 'start' : 'started',
2252+ 'stop' : 'stopped',
2253+ 'promote': 'promoted',
2254+ 'demote' : 'demoted'
2255+ }
2256+ try:
2257+ wordlist = logelm.halogmsg.split()
2258+ rscid, op = self.parse_opid(wordlist[3])[:2]
2259+ rcstr = self.trimmark(wordlist[5],"=")
2260+ except:
2261+ return CONV_PARSE_ERROR
2262+ if self.is_empty(rscid, op, rcstr):
2263+ return CONV_ITEM_EMPTY
2264+
2265+ if op in completeopDic.keys():
2266+ opstr = completeopDic[op]
2267+ else:
2268+ #Just in case. It shuoldn't occur unless cf file is modified.
2269+ opstr = ("%s ok" % (op))
2270+ convertedlog = ("Resource %s %s. (%s)" % (rscid, opstr, rcstr))
2271+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
2272+ return CONV_OK
2273+
2274+ '''
2275+ Convert log message which means HB failed to do the operation.
2276+ This function is common for OCF resource's start, stop,
2277+ monitor (exclude rc=OCF_NOT_RUNNING), promote, demote,
2278+ and STONITH resource's start, stop.
2279+ MsgNo.1-3)
2280+ Jan 6 15:22:45 x3650a crmd: [26989]: info: process_lrm_event: LRM operation prmApPostgreSQLDB_start_0 (call=25, rc=1, cib-update=58, confirmed=true) unknown error
2281+ MsgNo.2-3)
2282+ Jan 6 18:11:34 x3650a crmd: [4144]: info: process_lrm_event: LRM operation prmApPostgreSQLDB_stop_0 (call=27, rc=1, cib-update=76, confirmed=true) unknown error
2283+ MsgNo.3-1)
2284+ Jan 6 19:23:01 x3650a crmd: [19038]: info: process_lrm_event: LRM operation prmExPostgreSQLDB_monitor_10000 (call=16, rc=1, cib-update=72, confirmed=false) unknown error
2285+ MsgNo.4-3)
2286+ Jan 6 15:22:45 x3650a crmd: [26989]: info: process_lrm_event: LRM operation prmStateful:1_promote_0 (call=25, rc=1, cib-update=58, confirmed=true) unknown error
2287+ MsgNo.5-3)
2288+ Jan 6 15:22:45 x3650a crmd: [26989]: info: process_lrm_event: LRM operation prmStateful:1_demote_0 (call=25, rc=1, cib-update=58, confirmed=true) unknown error
2289+ MsgNo.17-3)
2290+ Jan 7 10:54:45 x3650a crmd: [32714]: info: process_lrm_event: LRM operation prmStonith2-3_start_0 (call=11, rc=1, cib-update=56, confirmed=true) unknown error
2291+ MsgNo.19-1)
2292+ Jan 7 13:47:57 x3650a crmd: [19263]: info: process_lrm_event: LRM operation prmStonith2-3_monitor_30000 (call=30, rc=14, cib-update=89, confirmed=false) status: unknown
2293+ '''
2294+ def operation_failed(self, outputobj, logelm, lconvfrm):
2295+ try:
2296+ wordlist = logelm.halogmsg.split()
2297+ rscid, op = self.parse_opid(wordlist[3])[:2]
2298+ rcstr = self.trimmark(wordlist[5],"=")
2299+ except:
2300+ return CONV_PARSE_ERROR
2301+ if self.is_empty(rscid, op, rcstr):
2302+ return CONV_ITEM_EMPTY
2303+
2304+ # If lrmd detected this operation's timeout, treated this log as
2305+ # resource operation timed out.
2306+ # It's for STONITH [start|stop|monitor] operation.
2307+ convertedlog = ("Resource %s failed to %s." % (rscid, op))
2308+ rscid_and_op = (rscid + ":" + op)
2309+ if rscid_and_op in cstat.timedoutRscopSet:
2310+ convertedlog = ("%s (Timed Out)" % (convertedlog))
2311+ cstat.timedoutRscopSet.discard(rscid_and_op)
2312+ else:
2313+ convertedlog = ("%s (%s)" % (convertedlog, rcstr))
2314+
2315+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
2316+ return CONV_OK
2317+
2318+ '''
2319+ Convert log message which means operation for OCF resource timed out.
2320+ This function is common for start, stop, monitor, promote, demote.
2321+ MsgNo.1-4)
2322+ Jan 6 17:41:35 x3650a crmd: [1404]: ERROR: process_lrm_event: LRM operation prmApPostgreSQLDB_start_0 (25) Timed Out (timeout=30000ms)
2323+ MsgNo.2-4)
2324+ Jan 6 18:19:47 x3650a crmd: [7948]: ERROR: process_lrm_event: LRM operation prmApPostgreSQLDB_stop_0 (27) Timed Out (timeout=30000ms)
2325+ MsgNo.3-3)
2326+ Jan 6 19:55:31 x3650a crmd: [28183]: ERROR: process_lrm_event: LRM operation prmExPostgreSQLDB_monitor_10000 (27) Timed Out (timeout=30000ms)
2327+ MsgNo.4-4)
2328+ Jan 6 17:41:35 x3650a crmd: [1404]: ERROR: process_lrm_event: LRM operation prmStateful:1_promote_0 (25) Timed Out (timeout=30000ms)
2329+ MsgNo.5-4)
2330+ Jan 6 17:41:35 x3650a crmd: [1404]: ERROR: process_lrm_event: LRM operation prmStateful:1_demote_0 (25) Timed Out (timeout=30000ms)
2331+ '''
2332+ def operation_timedout_ocf(self, outputobj, logelm, lconvfrm):
2333+ try:
2334+ opid = logelm.halogmsg.split()[3]
2335+ rscid, op = self.parse_opid(opid)[:2]
2336+ except:
2337+ return CONV_PARSE_ERROR
2338+ if self.is_empty(rscid, op):
2339+ return CONV_ITEM_EMPTY
2340+
2341+ # remove from timed out rscop list.
2342+ # Because it became clear that the operation timed out.
2343+ rscid_and_op = ("%s:%s" % (rscid, op))
2344+ cstat.timedoutRscopSet.discard(rscid_and_op)
2345+
2346+ convertedlog = ("Resource %s failed to %s. (Timed Out)" % (rscid, op))
2347+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
2348+ return CONV_OK
2349+
2350+ '''
2351+ Convert log message which means resource is not running.
2352+ This function is only for OCF and STONITH resource's monitor
2353+ (rc=OCF_NOT_RUNNING).
2354+
2355+ MsgNo.3-2)
2356+ Jan 6 19:45:58 x3650a crmd: [23987]: info: process_lrm_event: LRM operation prmExPostgreSQLDB_monitor_10000 (call=16, rc=7, cib-update=60, confirmed=false) not running
2357+ MsgNo.19-2)
2358+ Jan 7 13:47:57 x3650a crmd: [19263]: info: process_lrm_event: LRM operation prmStonith2-3_monitor_30000 (call=30, rc=14, cib-update=89, confirmed=false) status: unknown
2359+ '''
2360+ def detect_rsc_failure(self, outputobj, logelm, lconvfrm):
2361+ try:
2362+ wordlist = logelm.halogmsg.split()
2363+ rscid = self.parse_opid(wordlist[3])[0]
2364+ rcstr = self.trimmark(wordlist[5],"=")
2365+ except:
2366+ return CONV_PARSE_ERROR
2367+ if self.is_empty(rscid, rcstr):
2368+ return CONV_ITEM_EMPTY
2369+
2370+ convertedlog = ("Resource %s does not work. (%s)" % (rscid, rcstr))
2371+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
2372+ return CONV_OK
2373+
2374+ #########
2375+ # For Node status event.
2376+ #########
2377+ '''
2378+ Convert log message which means Node status updated.
2379+
2380+ MsgNo.6-1)
2381+ Jul 16 14:07:57 x3650a crmd: [7361]: notice: crmd_ha_status_callback: Status update: Node x3650b now has status [dead] (DC=true)
2382+ MsgNo.6-2)
2383+ Jul 16 13:41:04 x3650a crmd: [2114]: notice: crmd_ha_status_callback: Status update: Node x3650b now has status [active] (DC=true)
2384+ '''
2385+ def node_status_updated(self, outputobj, logelm, lconvfrm):
2386+ try:
2387+ wordList = logelm.halogmsg.split()
2388+ nodename = wordList[4]
2389+ status = wordList[8].lstrip('[').rstrip(']')
2390+ except:
2391+ return CONV_PARSE_ERROR
2392+ if self.is_empty(nodename, status):
2393+ return CONV_ITEM_EMPTY
2394+
2395+ ret = self.is_pingnode(nodename)
2396+ if ret == True:
2397+ #Ignore the network status's change.
2398+ return CONV_OK
2399+ elif ret == None:
2400+ return CONV_GETINFO_ERROR
2401+
2402+ # It's node status's change.
2403+ output_loglevel = self.LOG_INFO_LV
2404+ if status == "dead":
2405+ output_loglevel = self.LOG_WARN_LV
2406+ status = "lost"
2407+ elif status == "active":
2408+ if nodename in cstat.shutNodeSet:
2409+ cstat.shutNodeSet.discard(nodename)
2410+ status = "member"
2411+
2412+ convertedlog = ("Node %s is %s." % (nodename, status))
2413+ outputobj.output_log(output_loglevel, convertedlog)
2414+ return CONV_OK
2415+
2416+ ##########
2417+ # For Interconnect-LAN status event and
2418+ # Network status event (detected by pingd).
2419+ ##########
2420+ '''
2421+ Convert log message which means Interconnect-LAN status changed to "dead"
2422+
2423+ MsgNo.7-1)
2424+ Jul 15 11:27:46 x3650a heartbeat: [17442]: info: Link x3650b:eth2 dead.
2425+ '''
2426+ def detect_iconnlan_dead(self, outputobj, logelm, lconvfrm):
2427+ try:
2428+ wordlist = logelm.halogmsg.split()
2429+ nodename, linkname = wordlist[1].split(':')
2430+ except:
2431+ return CONV_PARSE_ERROR
2432+ if self.is_empty(nodename):
2433+ return CONV_ITEM_EMPTY
2434+
2435+ ret = self.is_pingnode(nodename)
2436+ if ret == True:
2437+ #Ignore the network failure.
2438+ return CONV_OK
2439+ elif ret == False:
2440+ convertedlog = ("Link %s:%s is FAULTY." % (nodename, linkname))
2441+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
2442+ return CONV_OK
2443+ else:
2444+ return CONV_GETINFO_ERROR
2445+
2446+ '''
2447+ Convert log message which means network status changed to "up".
2448+ The same log appears when Interconnect-LAN's event occurs and
2449+ Ping node's one.
2450+
2451+ MsgNo.7-2)
2452+ Jul 15 11:12:14 x3650a heartbeat: [17442]: info: Link x3650b:eth2 up.
2453+ '''
2454+ def detect_network_up(self, outputobj, logelm, lconvfrm):
2455+ try:
2456+ wordlist = logelm.halogmsg.split()
2457+ nodename, linkname = wordlist[1].split(':')
2458+ except:
2459+ return CONV_PARSE_ERROR
2460+ if self.is_empty(nodename, linkname):
2461+ return CONV_ITEM_EMPTY
2462+
2463+ ret = self.is_pingnode(nodename)
2464+ if ret == True:
2465+ return CONV_OK
2466+ elif ret == False:
2467+ convertedlog = ("Link %s:%s is up." % (nodename, linkname))
2468+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
2469+ return CONV_OK
2470+ else:
2471+ return CONV_GETINFO_ERROR
2472+
2473+ '''
2474+ Convert log message which means Network to ping node status changed
2475+ to "dead"
2476+ See also the comment on detect_iconnlan_dead().
2477+
2478+ MsgNo.8-1)
2479+ Jan 13 16:24:13 x3650a pingd: [8849]: info: stand_alone_ping: Node 192.168.201.254 is unreachable (write)
2480+ Jan 28 12:51:51 x3650a pingd: [16908]: info: stand_alone_ping: Node 192.168.201.254 is unreachable (read)
2481+ '''
2482+ def detect_node_dead(self, outputobj, logelm, lconvfrm):
2483+ try:
2484+ nodename = logelm.halogmsg.split()[2]
2485+ except:
2486+ return CONV_PARSE_ERROR
2487+ if self.is_empty(nodename):
2488+ return CONV_ITEM_EMPTY
2489+
2490+ convertedlog = ("Network to %s is unreachable." % (nodename))
2491+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
2492+ return CONV_OK
2493+
2494+ ##########
2495+ # For Disk status event (detected by diskd).
2496+ ##########
2497+ '''
2498+ Convert log message which means disk error.
2499+
2500+ MsgNo.9-1)
2501+ Jun 24 20:19:53 x3650a diskd: [22126]: WARN: check_status: disk status is changed, attr_name=diskcheck_status_internal, target=/tmp, new_status=ERROR
2502+ '''
2503+ def detect_disk_error(self, outputobj, logelm, lconvfrm):
2504+ try:
2505+ wordlist = logelm.halogmsg.split(',')
2506+ attrname = wordlist[1].split('=')[1]
2507+ target = wordlist[2].split('=')[1]
2508+ status = wordlist[3].split('=')[1]
2509+ except:
2510+ return CONV_PARSE_ERROR
2511+ if self.is_empty(attrname, target, status):
2512+ return CONV_ITEM_EMPTY
2513+
2514+ convertedlog = ("Disk connection to %s is %s. (attr_name=%s)" % (target, status, attrname))
2515+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
2516+ return CONV_OK
2517+
2518+ #########
2519+ # For respawn process event.
2520+ #########
2521+ '''
2522+ Convert log message which means respawn process start.
2523+
2524+ MsgNo.10-1)
2525+ Jul 27 17:29:52 x3650a heartbeat: [25800]: info: Starting "/usr/lib64/heartbeat/attrd" as uid 500 gid 501 (pid 25800)
2526+ '''
2527+ def respawn_start(self, outputobj, logelm, lconvfrm):
2528+ try:
2529+ keyword="Starting "
2530+ start_pos = logelm.halogmsg.index(keyword) + len(keyword)
2531+ end_pos = logelm.halogmsg.rindex("as uid")
2532+ procname = logelm.halogmsg[start_pos:end_pos].strip().split('/')[-1].split()[0].strip("\"")
2533+ leftwordList = logelm.halogmsg[end_pos:].split()
2534+ pid = leftwordList[-1].split(')')[0]
2535+ except:
2536+ return CONV_PARSE_ERROR
2537+ if self.is_empty(procname, pid):
2538+ return CONV_ITEM_EMPTY
2539+
2540+ convertedlog = ("Start \"%s\" process. (pid=%s)" % (procname, pid))
2541+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
2542+ return CONV_OK
2543+
2544+ '''
2545+ Convert log message which means respawn process exited with error.
2546+
2547+ MsgNo.10-2)
2548+ Jul 20 15:47:47 x3650a heartbeat: [21753]: info: Managed /usr/lib64/heartbeat/attrd process 30930 exited with return code 0.
2549+ '''
2550+ def respawn_exited_abnormally(self, outputobj, logelm, lconvfrm):
2551+ try:
2552+ keyword="Managed "
2553+ start_pos = logelm.halogmsg.index(keyword) + len(keyword)
2554+ end_pos = logelm.halogmsg.rindex("process")
2555+ procname = logelm.halogmsg[start_pos:end_pos].strip().split('/')[-1].split()[0]
2556+ leftwordList = logelm.halogmsg[end_pos:].split()
2557+ pid = leftwordList[1]
2558+ exitcode = leftwordList[6].rstrip(".")
2559+ except:
2560+ return CONV_PARSE_ERROR
2561+ if self.is_empty(procname, pid, exitcode):
2562+ return CONV_ITEM_EMPTY
2563+
2564+ convertedlog = ("Managed \"%s\" process exited. (pid=%s, rc=%s)" % (procname, pid, exitcode))
2565+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
2566+ return CONV_OK
2567+
2568+ '''
2569+ Convert log message which means respawn process killed by signal.
2570+
2571+ MsgNo.10-3)
2572+ Jul 20 15:46:43 x3650a heartbeat: [21753]: WARN: Managed /usr/lib64/heartbeat/attrd process 21772 killed by signal 9 [SIGKILL - Kill, unblockable].
2573+ '''
2574+ def respawn_killed(self, outputobj, logelm, lconvfrm):
2575+ try:
2576+ keyword="Managed "
2577+ start_pos = logelm.halogmsg.index(keyword) + len(keyword)
2578+ end_pos = logelm.halogmsg.rindex("process")
2579+ procname = logelm.halogmsg[start_pos:end_pos].strip().split('/')[-1].split()[0]
2580+ leftwordList = logelm.halogmsg[end_pos:].split()
2581+ pid = leftwordList[1]
2582+ signum = leftwordList[5].rstrip('.')
2583+ except:
2584+ return CONV_PARSE_ERROR
2585+ if self.is_empty(procname, pid, signum):
2586+ return CONV_ITEM_EMPTY
2587+
2588+ convertedlog = ("Managed \"%s\" process terminated with signal %s. (pid=%s)" % (procname, signum, pid))
2589+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
2590+ return CONV_OK
2591+
2592+ '''
2593+ Convert log message which means respawn process dumped core.
2594+
2595+ MsgNo.10-4)
2596+ Jul 20 17:08:38 x3650a heartbeat: [6154]: ERROR: Managed /usr/lib64/heartbeat/attrd process 6173 dumped core
2597+ '''
2598+ def respawn_dumped_core(self, outputobj, logelm, lconvfrm):
2599+ try:
2600+ keyword="Managed "
2601+ start_pos = logelm.halogmsg.index(keyword) + len(keyword)
2602+ end_pos = logelm.halogmsg.rindex("process")
2603+ procname = logelm.halogmsg[start_pos:end_pos].strip().split('/')[-1].split()[0]
2604+ pid = logelm.halogmsg[end_pos:].split()[1]
2605+ except:
2606+ return CONV_PARSE_ERROR
2607+ if self.is_empty(procname, pid):
2608+ return CONV_ITEM_EMPTY
2609+
2610+ convertedlog = ("Managed \"%s\" process dumped core. (pid=%s)" % (procname, pid))
2611+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
2612+ return CONV_OK
2613+
2614+ '''
2615+ Convert log message which means respawn process went away strangely.
2616+
2617+ MsgNo.10-5)
2618+ Jul 27 17:30:34 x3650a heartbeat: [25793]: ERROR: Managed /usr/lib64/heartbeat/attrd process 6173 went away strangely (!)
2619+ '''
2620+ def respawn_went_away(self, outputobj, logelm, lconvfrm):
2621+ try:
2622+ keyword="Managed "
2623+ start_pos = logelm.halogmsg.index(keyword) + len(keyword)
2624+ end_pos = logelm.halogmsg.rindex("process")
2625+ procname = logelm.halogmsg[start_pos:end_pos].strip().split('/')[-1].split()[0]
2626+ pid = logelm.halogmsg[end_pos:].split()[1]
2627+ except:
2628+ return CONV_PARSE_ERROR
2629+ if self.is_empty(procname, pid):
2630+ return CONV_ITEM_EMPTY
2631+
2632+ convertedlog = ("Managed \"%s\" process went away strangely. (pid=%s)" % (procname, pid))
2633+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
2634+ return CONV_OK
2635+
2636+ '''
2637+ Convert log message which means respawn process exited normally in shutdown process.
2638+
2639+ MsgNo.10-6)
2640+ Jul 27 17:30:34 x3650a heartbeat: [25793]: info: killing /usr/lib64/heartbeat/attrd process group 25803 with signal 15
2641+ '''
2642+ def respawn_exited_normally(self, outputobj, logelm, lconvfrm):
2643+ try:
2644+ keyword="killing "
2645+ start_pos = logelm.halogmsg.index(keyword) + len(keyword)
2646+ end_pos = logelm.halogmsg.rindex("process")
2647+ procname = logelm.halogmsg[start_pos:end_pos].strip().split('/')[-1].split()[0]
2648+ leftwordList = logelm.halogmsg[end_pos:].split()
2649+ pgid = leftwordList[2]
2650+ except:
2651+ return CONV_PARSE_ERROR
2652+ if self.is_empty(procname, pgid):
2653+ return CONV_ITEM_EMPTY
2654+
2655+ convertedlog = ("Stop \"%s\" process normally. (pid=%s)" % (procname, pgid))
2656+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
2657+ return CONV_OK
2658+
2659+ '''
2660+ Convert log message which means do respawning too frequently in a short term.
2661+
2662+ MsgNo.10-7)
2663+ Jul 27 17:23:40 x3650a heartbeat: [23265]: ERROR: Client /usr/lib64/heartbeat/attrd "respawning too fast"
2664+ '''
2665+ def respawn_too_fast(self, outputobj, logelm, lconvfrm):
2666+ try:
2667+ keyword="Client "
2668+ start_pos = logelm.halogmsg.index(keyword) + len(keyword)
2669+ end_pos = logelm.halogmsg.rindex("respawning") - 2
2670+ procname = logelm.halogmsg[start_pos:end_pos].strip().split('/')[-1].split()[0]
2671+ except:
2672+ return CONV_PARSE_ERROR
2673+ if self.is_empty(procname):
2674+ return CONV_ITEM_EMPTY
2675+
2676+ convertedlog = ("Respawn count exceeded by \"%s\"." % (procname))
2677+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
2678+ return CONV_OK
2679+
2680+ ##########
2681+ # For Fail Over.
2682+ ##########
2683+ '''
2684+ Output the log which tells F/O starts.
2685+ '''
2686+ def detect_fo_start(self, outputobj):
2687+ self.debug_status()
2688+ if cstat.IN_FO_PROCESS == True:
2689+ return
2690+ cstat.IN_FO_PROCESS = True
2691+ convertedlog = ("Start to fail-over.")
2692+ outputobj.output_log(self.LOG_ERR_LV, convertedlog)
2693+ self.debug_status()
2694+ return
2695+
2696+ '''
2697+ Detect pengine starts the calculation for transition.
2698+ This function is called when cluster status became "S_POLICY_ENGINE"
2699+ and input data is not I_SHUTDOWN (do shutdown process).
2700+ It considers a failure occurred when specified attributes are
2701+ updated to abnormal value.
2702+ When the failure occurred, this function outputs the log to tell it.
2703+ If not or it is already in F/O process, it outputs nothing.
2704+
2705+ MsgNo.F0-1, F9-1, F10-1)
2706+ Jan 5 15:19:20 x3650a crmd: [17659]: info: do_state_transition: State transition S_IDLE -> S_POLICY_ENGINE [ input=I_PE_CALC cause=C_FSA_INTERNAL origin=abort_transition_graph ]
2707+ '''
    def detect_pe_calc(self, outputobj, logelm, lconvfrm):
        '''
        Handle the transition to S_POLICY_ENGINE (pengine starts a
        calculation). Resets the per-transition resource status list and,
        unless a failure was already recorded or localhost is shutting
        down, scans the configured attribute rules on every online node to
        decide whether a fail-over is starting.
        Returns CONV_OK, or CONV_GETINFO_ERROR when the online node set
        cannot be obtained.
        '''
        # Mark that PE calculation is in progress.
        cstat.IN_CALC = True

        # Initialize resource status list.
        # See the comment on detect_rsc_unmanaged().
        self.rscstatList = None
        self.rscstatList = list()

        # If any failure didn't occur and Heartbeat is not in shutdown process,
        # and the node on localhost is not in shutting down,
        # check each attribute's value to decide whether it is F/O or not.
        if cstat.FAILURE_OCCURRED == False and HOSTNAME not in cstat.shutNodeSet:
            nodeset = self.get_onlinenode()
            if nodeset == None:
                return CONV_GETINFO_ERROR
            # Nodes that are shutting down are excluded from the scan.
            for node in (nodeset - cstat.shutNodeSet):
                # Check each attribute's value.
                # attrRuleList entries are (attribute name, operator, value).
                for attrRule in attrRuleList:
                    attrname, op, attrval = tuple(attrRule)
                    # Check attribute's value for each node.
                    # Now, the node seems to be active.
                    result = self.check_attribute(attrname, op, attrval, node)[0]
                    if result == True:
                        # attribute's value means "failure(s) occurred"!
                        cstat.FAILURE_OCCURRED = FAIL_SCORE
                        # Announce F/O only when an act-resource is already
                        # scheduled to move or stop.
                        if cstat.ACTRSC_MOVE == FAIL_MOVE or \
                        cstat.ACTRSC_MOVE == FAIL_STP:
                            self.detect_fo_start(outputobj)
                    # [COMMENT]
                    # result == False:
                    #   attribute did not change or
                    #   it was updated to normal value.
                    # result == None:
                    #   some errors occurred in check_attribute() or
                    #   the node is not running or
                    #   specified attribute does not exist.
        return CONV_OK
2745+
2746+ '''
2747+ Output the log which tells F/O finished.
2748+ In addition, output all resources' status.
2749+ It considers that F/O succeeded when all of specified resources
2750+ (with the parameter OPT_ACTRSC in config file) are running,
2751+ and if any resource at all stops, it considers F/O failed.
2752+ This function is called when cluster status became "S_IDLE".
2753+
2754+ MsgNo.F0-2, F12-1, F12-2)
2755+ Jan 5 14:50:07 x3650a crmd: [13198]: info: do_state_transition: State transition S_TRANSITION_ENGINE -> S_IDLE [ input=I_TE_SUCCESS cause=C_FSA_INTERNAL origin=notify_crmd ]
2756+ '''
    def detect_fo_complete(self, outputobj, logelm, lconvfrm):
        '''
        Handle the transition back to S_IDLE: report that fail-over
        finished and print the status of every act-resource.
        F/O is judged failed when any act-resource is Stopped / failed to
        run anywhere, or when any unmanaged resource exists at all.
        Always returns CONV_OK.
        '''

        # Check specified resources exist in this cluster.
        if len(self.rscstatList) > 0:
            for actrsc in actRscList:
                newrsc = RscStat(actrsc)
                if newrsc not in self.rscstatList:
                    pm_log.error("detect_fo_complete(): " +
                        "resource [%s] is not in this cluster." % (actrsc))
                    break

        # If no F/O was in progress, just reset the tracked state.
        # Note: clear_status() is called in both paths, but IN_FO_PROCESS
        # must be read *before* the reset.
        if cstat.IN_FO_PROCESS == False:
            self.clear_status()
            return CONV_OK
        self.clear_status()

        # When one or more Unmanaged resource exists in the cluster,
        # (even if the resource is not set in act_rsc)
        # it is unusual state, so consider it "F/O failed".
        detect_fo_failed = False
        unmanaged_rsc_exists = False
        for rscstat in self.rscstatList:
            if rscstat.unmanaged:
                convertedlog = ("Unmanaged resource exists.")
                outputobj.output_log(self.LOG_ERR_LV, convertedlog)
                detect_fo_failed = True
                unmanaged_rsc_exists = True
                break

        if unmanaged_rsc_exists == False:
            # Confirm each resource's status.
            detect_fo_failed = False
            for rscstat in self.rscstatList:
                if rscstat.rscid in actRscList:
                    # fofailed is set by detect_cannot_run_anywhere().
                    if rscstat.fofailed or rscstat.status == "Stopped" :
                        output_loglevel = self.LOG_ERR_LV
                        output_status = ("Stopped")
                        detect_fo_failed = True
                    else:
                        output_loglevel = self.LOG_INFO_LV
                        output_status = rscstat.status
                    convertedlog = ("Resource %s : %s" % (rscstat.rscid, output_status))
                    outputobj.output_log(output_loglevel, convertedlog)

        if detect_fo_failed:
            outputobj.output_log(self.LOG_ERR_LV, "fail-over failed.")
        else:
            outputobj.output_log(self.LOG_INFO_LV, "fail-over succeeded.")

        return CONV_OK
2807+
2808+ '''
2809+ Node detects some failures in the cluster.
2810+ Output nothing.
2811+
2812+ MsgNo.F1-1, F1-2, F2-1, F2-2, F3-1, F3-2, F4-1, F4-2, F6-1, F6-2)
2813+ Feb 25 13:31:37 x3650a crmd: [11105]: WARN: update_failcount: Updating failcount for prmApPostgreSQLDB on x3650a after failed monitor: rc=1 (update=value++, time=1267072297)
2814+ '''
    def dc_detect_failure(self, outputobj, logelm, lconvfrm):
        # The DC noticed a resource/node failure (e.g. update_failcount).
        # Intentionally converts to nothing: the user-visible F/O messages
        # are produced elsewhere. Returning CONV_OK marks the log handled.
        return CONV_OK
2817+
2818+ '''
2819+ Node detects some failures in the cluster.
2820+ Output nothing.
2821+
2822+ MsgNo.F7-1, F7-2, F7-3, F7-4, F8-1)
2823+ Jul 15 13:14:59 x3650a crmd: [31869]: WARN: match_down_event: No match for shutdown action on f8d52aae-518b-4b06-b1a1-b23486f8b410
2824+ '''
2825+ def dc_detect_node_failure(self, outputobj, logelm, lconvfrm):
2826+ try:
2827+ wordlist = logelm.halogmsg.split()
2828+ nodename = self.get_nodename(wordlist[-1])
2829+ except:
2830+ return CONV_PARSE_ERROR
2831+ if self.is_empty(nodename):
2832+ return CONV_ITEM_EMPTY
2833+
2834+ if nodename in cstat.shutNodeSet:
2835+ pm_log.debug("The [%s] exists in the shutdown list." % (nodename))
2836+ pm_log.debug("Ignore the fotrigger flag setting.")
2837+ return CONV_SHUT_NODE
2838+
2839+ return CONV_OK
2840+
2841+ '''
2842+ Detect resource start action added.
2843+ This is to get resource status when F/O finished.
2844+ So it outputs nothing.
2845+
2846+ MsgNo. F11-1)
2847+ Jan 5 15:12:25 x3650a pengine: [16657]: notice: LogActions: Start prmExPostgreSQLDB (x3650a)
2848+ '''
2849+ def add_rsc_start(self, outputobj, logelm, lconvfrm):
2850+ try:
2851+ wordlist = logelm.halogmsg.split()
2852+ nodename = self.trimmark(wordlist[-1])
2853+ rscid = wordlist[2]
2854+ except:
2855+ return CONV_PARSE_ERROR
2856+ if self.is_empty(nodename, rscid):
2857+ return CONV_ITEM_EMPTY
2858+
2859+ # Set the resource's status to the list.
2860+ statstr = ("Started on %s" % (nodename))
2861+ self.set_rscstat(rscid, statstr, None, None)
2862+
2863+ if rscid in actRscList:
2864+ cstat.ACTRSC_MOVE = FAIL_STR
2865+ if cstat.FAILURE_OCCURRED == FAIL_NODE:
2866+ self.detect_fo_start(outputobj)
2867+ return CONV_OK
2868+
2869+ '''
2870+ Detect resource stop action added.
2871+ This is to get resource status when F/O finished.
2872+
2873+ MsgNo. F11-2)
2874+ Jan 5 15:19:23 x3650a pengine: [17658]: notice: LogActions: Stop resource prmExPostgreSQLDB (x3650a)
2875+ '''
2876+ def add_rsc_stop(self, outputobj, logelm, lconvfrm):
2877+ try:
2878+ wordlist = logelm.halogmsg.split()
2879+ rscid = wordlist[-2]
2880+ except:
2881+ return CONV_PARSE_ERROR
2882+ if self.is_empty(rscid):
2883+ return CONV_ITEM_EMPTY
2884+
2885+ # Set the resource's status to the list.
2886+ statstr = ("Stopped")
2887+ self.set_rscstat(rscid, statstr, None, None)
2888+
2889+ if rscid in actRscList:
2890+ cstat.ACTRSC_MOVE = FAIL_STP
2891+ if cstat.FAILURE_OCCURRED == FAIL_RSC or cstat.FAILURE_OCCURRED == FAIL_SCORE:
2892+ self.detect_fo_start(outputobj)
2893+ return CONV_OK
2894+
2895+ '''
2896+ Detect no action added for the resource.
2897+ This is to get resource status when F/O finished.
2898+ So it outputs nothing.
2899+
2900+ MsgNo.F11-3)
2901+ Jan 5 15:36:42 x3650a pengine: [27135]: notice: LogActions: Leave resource prmFsPostgreSQLDB1 (Started x3650a)
2902+ MsgNo.F11-8)
2903+ Jan 5 14:50:05 x3650a pengine: [13197]: notice: LogActions: Restart resource prmIpPostgreSQLDB (Started x3650b)
2904+ MsgNo.F11-9)
2905+ Jan 5 14:50:41 x3650a pengine: [13197]: notice: LogActions: Leave resource prmPingd:0 (Stopped)
2906+ '''
2907+ def add_no_action(self, outputobj, logelm, lconvfrm):
2908+ try:
2909+ wordlist = logelm.halogmsg.split()
2910+ rscid = wordlist[3]
2911+ status = self.trimmark(wordlist[4])
2912+ node = ""
2913+ if len(wordlist) >= 6:
2914+ node = self.trimmark(wordlist[5])
2915+ except:
2916+ return CONV_PARSE_ERROR
2917+ if self.is_empty(rscid, status):
2918+ return CONV_ITEM_EMPTY
2919+
2920+ # Set the resource's status to the list.
2921+ if node != "":
2922+ statstr = ("%s on %s" % (status, node))
2923+ else:
2924+ statstr = ("%s" % (status))
2925+ self.set_rscstat(rscid, statstr, None, None)
2926+
2927+ if statstr == "Stopped":
2928+ if rscid in actRscList:
2929+ cstat.ACTRSC_MOVE = FAIL_STPD
2930+ if cstat.FAILURE_OCCURRED == FAIL_NODE:
2931+ self.detect_fo_start(outputobj)
2932+ return CONV_OK
2933+
2934+ '''
2935+ Detect resource cannot run anywhere.
2936+ This is to get resource status when F/O finished.
2937+ So it outputs nothing.
2938+
2939+ MsgNo. F11-4)
2940+ Jan 5 15:19:20 x3650a pengine: [17658]: WARN: native_color: Resource prmApPostgreSQLDB cannot run anywhere
2941+ '''
2942+ def detect_cannot_run_anywhere(self, outputobj, logelm, lconvfrm):
2943+ try:
2944+ wordlist = logelm.halogmsg.split()
2945+ rscid = wordlist[2]
2946+ except:
2947+ return CONV_PARSE_ERROR
2948+ if self.is_empty(rscid):
2949+ return CONV_ITEM_EMPTY
2950+
2951+ # Set the resource's status to the list.
2952+ self.set_rscstat(rscid, None, True, None)
2953+ return CONV_OK
2954+
2955+ '''
2956+ Detect resource became unmanaged.
2957+ This is to get resource status when F/O finished.
2958+ So it outputs nothing.
2959+ When resource become *managed*, no particular log appears like
2960+ "resource A is managed", the cluster just becomes S_POLICY_ENGINE and
2961+ starts PE calculation.
2962+ So, to clear the "unmanaged" flag in RscStat,
2963+ initialize the rscstatusList object in detect_pe_calc().
2964+
2965+ MsgNo. F11-5)
2966+ Jan 5 10:04:09 x3650a pengine: [9727]: info: native_color: Unmanaged resource prmApPostgreSQLDB allocated to 'nowhere': inactive
2967+ '''
2968+ def detect_rsc_unmanaged(self, outputobj, logelm, lconvfrm):
2969+ try:
2970+ wordlist = logelm.halogmsg.split()
2971+ rscid = wordlist[3]
2972+ except:
2973+ return CONV_PARSE_ERROR
2974+ if self.is_empty(rscid):
2975+ return CONV_ITEM_EMPTY
2976+
2977+ # Set the resource's status to the list.
2978+ self.set_rscstat(rscid, None, None, True)
2979+ return CONV_OK
2980+
2981+ '''
2982+ Detect resource move action added.
2983+ This is to get resource status when F/O started.
2984+
2985+ MsgNo. F11-6)
2986+ Jan 5 15:12:27 x3650a pengine: [16657]: notice: LogActions: Move resource prmExPostgreSQLDB (Started x3650a -> x3650b)
2987+ '''
2988+ def add_rsc_move(self, outputobj, logelm, lconvfrm):
2989+ try:
2990+ wordlist = logelm.halogmsg.split()
2991+ a_nodename = self.trimmark(wordlist[-1])
2992+ f_nodename = self.trimmark(wordlist[-3])
2993+ rscid = wordlist[3]
2994+ except:
2995+ return CONV_PARSE_ERROR
2996+
2997+ if self.is_empty(a_nodename, rscid):
2998+ return CONV_ITEM_EMPTY
2999+
3000+ # Set the resource's status to the list.
3001+ statstr = ("Move %s -> %s" % (f_nodename,a_nodename))
3002+ self.set_rscstat(rscid, statstr, None, None)
3003+
3004+ if rscid in actRscList:
3005+ cstat.ACTRSC_MOVE = FAIL_MOVE
3006+ if cstat.FAILURE_OCCURRED == FAIL_RSC or cstat.FAILURE_OCCURRED == FAIL_SCORE:
3007+ self.detect_fo_start(outputobj)
3008+
3009+ return CONV_OK
3010+
3011+ ##########
3012+ # For DC election.
3013+ ##########
3014+ '''
3015+ Convert log message which means DC election is complete.
3016+
3017+ MsgNo.13-2)
3018+ Jan 6 14:16:18 x3650a crmd: [9874]: info: update_dc: Set DC to x3650a (3.0.1)
3019+ '''
3020+ def dc_election_complete(self, outputobj, logelm, lconvfrm):
3021+ try:
3022+ nodename = logelm.halogmsg.split()[-2]
3023+ except:
3024+ return CONV_PARSE_ERROR
3025+ if self.is_empty(nodename):
3026+ return CONV_ITEM_EMPTY
3027+
3028+ convertedlog = ("Set DC node to %s." % (nodename))
3029+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
3030+ return CONV_OK
3031+
3032+ '''
3033+ Convert log message which means unset DC node.
3034+
3035+ MsgNo.13-5)
3036+ Jan 12 11:22:18 x3650a crmd: [5796]: info: update_dc: Unset DC x3650a
3037+ '''
3038+ def detect_unset_dc(self, outputobj, logelm, lconvfrm):
3039+ try:
3040+ nodename = logelm.halogmsg.split()[-1]
3041+ except:
3042+ return CONV_PARSE_ERROR
3043+ if self.is_empty(nodename):
3044+ return CONV_ITEM_EMPTY
3045+
3046+ convertedlog = ("Unset DC node %s." % (nodename))
3047+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
3048+ return CONV_OK
3049+
3050+ ##########
3051+ # For Pacemaker and Heartbeat service shutdown.
3052+ ##########
3053+ '''
3054+ Convert log message which means Pacemaker service on the node
3055+ in the cluster send shutdown request.
3056+
3057+ MsgNo.14-1)
3058+ Jan 18 10:35:08 x3650a crmd: [10975]: info: handle_shutdown_request: Creating shutdown request for x3650b (state=S_IDLE)
3059+ '''
3060+ def detect_shutdown_request(self, outputobj, logelm, lconvfrm):
3061+ try:
3062+ nodename = logelm.halogmsg.split()[-2]
3063+ except:
3064+ return CONV_PARSE_ERROR
3065+ if self.is_empty(nodename):
3066+ return CONV_ITEM_EMPTY
3067+
3068+ cstat.shutNodeSet.add(nodename)
3069+ convertedlog = ("Pacemaker on %s is shutting down." % (nodename))
3070+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
3071+ return CONV_OK
3072+
3073+ '''
3074+ Detect Heartbeat service on localhost shutdown complete.
3075+ Output message is static, but to remove the node name from
3076+ shutting down node list, detect the message with
3077+ peculiar function.
3078+
3079+ MsgNo.14-2)
3080+ Jul 15 15:35:37 x3650a heartbeat: [16986]: info: x3650a Heartbeat shutdown complete.
3081+ '''
    def detect_hb_shutdown(self, outputobj, logelm, lconvfrm):
        # Heartbeat on localhost finished shutting down. The output text
        # is the static rulename from the config; afterwards localhost is
        # removed from the shutting-down node set.
        outputobj.output_log(lconvfrm.loglevel, lconvfrm.rulename)
        cstat.shutNodeSet.discard(HOSTNAME)
        return CONV_OK
3086+
3087+ '''
3088+ Detect Pacemaker service on localhost starts to shutdown.
3089+ Output message is static, but to add localhost name to
3090+ shutting down node list, detect the message with
3091+ peculiar function.
3092+
3093+ MsgNo.14-3)
3094+ Jan 18 10:36:18 x3650a crmd: [12294]: info: crm_shutdown: Requesting shutdown
3095+ '''
    def detect_pcmk_shutting_down(self, outputobj, logelm, lconvfrm):
        # Pacemaker on localhost starts to shut down: record localhost in
        # the shutting-down node set first, then emit the static rulename
        # message from the config.
        cstat.shutNodeSet.add(HOSTNAME)
        outputobj.output_log(lconvfrm.loglevel, lconvfrm.rulename)
        return CONV_OK
3100+
3101+ '''
3102+ Convert log message which means Pacemaker service on node
3103+ send shutdown request.
3104+
3105+ MsgNo.14-4)
3106+ Jan 18 10:35:26 x3650a cib: [10971]: info: cib_process_shutdown_req: Shutdown REQ from x3650b
3107+ '''
3108+ def detect_dc_shutdown_request(self, outputobj, logelm, lconvfrm):
3109+ try:
3110+ nodename = logelm.halogmsg.split()[-1]
3111+ except:
3112+ return CONV_PARSE_ERROR
3113+ if self.is_empty(nodename):
3114+ return CONV_ITEM_EMPTY
3115+
3116+ cstat.shutNodeSet.add(nodename)
3117+ return CONV_OK
3118+
3119+ '''
3120+ Detect the send shutdown request to DC.
3121+ Add localhost name to shutting down node list.
3122+ Output nothing.
3123+
3124+ MsgNo.14-5)
3125+ Sep 16 13:11:51 x3650a crmd: [11369]: info: do_shutdown_req: Sending shutdown request to DC: x3650a
3126+ '''
    def detect_send_shutdown(self, outputobj, logelm, lconvfrm):
        # Localhost sent a shutdown request to the DC: record localhost in
        # the shutting-down node set. Outputs nothing.
        cstat.shutNodeSet.add(HOSTNAME)
        return CONV_OK
3130+
3131+ ##########
3132+ # For logging daemon event.
3133+ ##########
3134+ # use output_static_msg() only.
3135+
3136+ ##########
3137+ # For STONITH resource operation timed out.
3138+ ##########
3139+ '''
3140+ Get resource id and operation type which stonithd detected timed out.
3141+
3142+ MsgNo.17-4)
3143+ Jul 15 16:02:35 x3650a stonithd: [22087]: WARN: external_prmStonith2-2_start process (PID 22291) timed out (try 1). Killing with signal SIGTERM (15).
3144+ MsgNo.19-3)
3145+ Jan 7 14:20:16 x3650a stonithd: [14714]: WARN: external_prmStonith2-3_monitor process (PID 16383) timed out (try 1). Killing with signal SIGTERM (15).
3146+ '''
3147+ def detect_rscop_timedout_stonithd(self, outputobj, logelm, lconvfrm):
3148+ try:
3149+ tmp = logelm.halogmsg.split()[0]
3150+ wordlist = tmp.split('_')
3151+ if len(wordlist) > 2:
3152+ rscid = wordlist[1]
3153+ op = wordlist[-1]
3154+ else:
3155+ rscid = wordlist[0]
3156+ op = wordlist[-1]
3157+ except:
3158+ return CONV_PARSE_ERROR
3159+ if self.is_empty(rscid, op):
3160+ return CONV_ITEM_EMPTY
3161+
3162+ rscid_and_op = ("%s:%s" % (rscid, op))
3163+ # Append to the list.
3164+ cstat.timedoutRscopSet.add(rscid_and_op)
3165+ return CONV_OK
3166+
3167+ ##########
3168+ # For fence operation.
3169+ ##########
3170+ '''
3171+ Convert log message which means fence operation started.
3172+
3173+ MsgNo.20-1, No21-1)
3174+ Jan 13 15:23:28 x3650a stonithd: [23731]: info: stonith_operate_locally::2713: sending fencing op RESET for x3650b to prmStonith2-1 (external/ssh) (pid=23852)
3175+ '''
3176+ def fence_op_started(self, outputobj, logelm, lconvfrm):
3177+ try:
3178+ wordlist = logelm.halogmsg.split()
3179+ op = wordlist[4]
3180+ target = wordlist[6]
3181+ msg = ' '.join(wordlist[8:])
3182+ except:
3183+ return CONV_PARSE_ERROR
3184+ if self.is_empty(op, target, msg):
3185+ return CONV_ITEM_EMPTY
3186+
3187+ convertedlog = ("Try to STONITH (%s) the Node %s to %s" % (op, target, msg))
3188+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
3189+ return CONV_OK
3190+
3191+ '''
3192+ Convert log message which means fence operation succeeded.
3193+
3194+ MsgNo.20-2)
3195+ Jan 13 12:51:46 x3650a stonithd: [15595]: info: Succeeded to STONITH the node x3650b: optype=RESET. whodoit: x3650a
3196+ '''
3197+ def fence_op_succeeded(self, outputobj, logelm, lconvfrm):
3198+ try:
3199+ wordlist = logelm.halogmsg.split()
3200+ target = self.trimmark(wordlist[5])
3201+
3202+ oplist = wordlist[6].split('=')
3203+ op = self.trimmark(oplist[1])
3204+
3205+ sniper = wordlist[-1]
3206+ except:
3207+ return CONV_PARSE_ERROR
3208+ if self.is_empty(target, sniper, op):
3209+ return CONV_ITEM_EMPTY
3210+
3211+ convertedlog = ("Succeeded to STONITH (%s) " % (op) + "the Node %s by Node %s." % (target, sniper))
3212+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
3213+ return CONV_OK
3214+
3215+ '''
3216+ Convert log message which means fence operation failed.
3217+
3218+ MsgNo.20-3, 21-3)
3219+ Jan 13 15:48:06 x3650a stonithd: [25195]: info: failed to STONITH node x3650b with local device prmStonith2-1 (exitcode 5), gonna try the next local device
3220+ '''
3221+ def fence_op_failed(self, outputobj, logelm, lconvfrm):
3222+ try:
3223+ wordlist = logelm.halogmsg.split()
3224+ nodename = wordlist[4]
3225+ exitcode = self.trimmark(wordlist[10])
3226+ except:
3227+ return CONV_PARSE_ERROR
3228+ if self.is_empty(nodename, exitcode):
3229+ return CONV_ITEM_EMPTY
3230+
3231+ convertedlog = ("Failed to STONITH the Node %s " % (nodename) + "with one local device (exitcode=%s). " % (exitcode) + "Will try to use the next local device.")
3232+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
3233+ return CONV_OK
3234+
3235+ '''
3236+ Convert log message which means fence operation timed out.
3237+
3238+ MsgNo.20-4, 21-4)
3239+ Jan 13 14:08:01 x3650a stonithd: [20372]: ERROR: Failed to STONITH the node x3650b: optype=RESET, op_result=TIMEOUT
3240+ '''
3241+ def fence_op_timedout(self, outputobj, logelm, lconvfrm):
3242+ try:
3243+ wordlist = logelm.halogmsg.split()
3244+ nodename = self.trimmark(wordlist[5])
3245+
3246+ oplist = wordlist[6].split('=')
3247+ op = self.trimmark(oplist[1])
3248+ except:
3249+ return CONV_PARSE_ERROR
3250+ if self.is_empty(nodename, op):
3251+ return CONV_ITEM_EMPTY
3252+
3253+ convertedlog = ("Failed to STONITH (%s) " % (op) + "the Node %s (Timed Out)." % (nodename))
3254+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
3255+ return CONV_OK
3256+
3257+ ##########
3258+ # For attribute event.
3259+ ##########
3260+ '''
3261+ Convert log message which means attribute value on own node updated.
3262+
3263+ MsgNo.22-1)
3264+ Jun 24 09:49:58 x3650a attrd: [16121]: info: attrd_perform_update: Sent update 45: diskcheck_status_internal=ERROR
3265+ '''
3266+ def detect_attr_updated(self, outputobj, logelm, lconvfrm):
3267+ try:
3268+ # attribute name can has empty char.
3269+ funcname_endpos = logelm.halogmsg.index(':')
3270+ callid_endpos = logelm.halogmsg.index(':', (funcname_endpos + 1))
3271+ attr_and_val = \
3272+ logelm.halogmsg[(callid_endpos + 1):].strip().split('=')
3273+ attrname = attr_and_val[0]
3274+ attrval = attr_and_val[1]
3275+ except:
3276+ return CONV_PARSE_ERROR
3277+ if self.is_empty(attrname, attrval):
3278+ return CONV_ITEM_EMPTY
3279+
3280+ convertedlog = ("Attribute \"%s\" is updated to \"%s\"." %
3281+ (attrname, attrval))
3282+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
3283+ return CONV_OK
3284+
3285+ '''
3286+ Convert log message which means attribute value on own node deleted.
3287+
3288+ MsgNo.22-2)
3289+ Jul 15 13:09:34 x3650a attrd: [17459]: info: attrd_perform_update: Sent delete 68: node=410de9dc-4458-4c0f-9d06-e7c8c2f0593e, attr=diskcheck_status, id=<n/a>, set=(null), section=status
3290+ '''
3291+ def detect_attr_deleted(self, outputobj, logelm, lconvfrm):
3292+ try:
3293+ attrname = logelm.halogmsg.split(',')[1].strip().split("=")[1]
3294+ except:
3295+ return CONV_PARSE_ERROR
3296+ if self.is_empty(attrname):
3297+ return CONV_ITEM_EMPTY
3298+
3299+ convertedlog = ("Attribute \"%s\" is deleted." % attrname)
3300+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
3301+ return CONV_OK
3302+
3303+ ##########
3304+ # For Heartbeat service starts.
3305+ ##########
3306+ '''
3307+ Heartbeat log message which means Heartbeat service is starting.
3308+
3309+ MsgNo.23-1)
3310+ Jul 15 15:50:31 x3650a heartbeat: [22780]: info: Configuration validated. Starting heartbeat 3.0.3
3311+ '''
3312+ def detect_hb_start(self, outputobj, logelm, lconvfrm):
3313+ try:
3314+ wordlist = logelm.halogmsg.split()
3315+ version = wordlist[-1]
3316+ except:
3317+ return CONV_PARSE_ERROR
3318+ if self.is_empty(version):
3319+ return CONV_ITEM_EMPTY
3320+
3321+ convertedlog = ("Starting Heartbeat %s." % (version))
3322+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
3323+ return CONV_OK
3324+
3325+ '''
3326+ Detect localhost status is set to up.
3327+ Then clear all status (exclude ino, and offset).
3328+ The message which is detected by detect_hb_start() appears when
3329+ service Heartbeat start on the node which Heartbeat is already running,
3330+ too.
3331+ So, detect the following message to clear all status.
3332+
3333+ MsgNo.23-3)
3334+ Jul 15 11:12:13 x3650a heartbeat: [17442]: info: Local status now set to: 'up'
3335+ '''
3336+ def detect_localstat_up(self, outputobj, logelm, lconvfrm):
3337+ self.clear_status()
3338+ return CONV_OK
3339+
3340+ ##########
3341+ # For pengine and tengine event.
3342+ ##########
3343+ '''
3344+ Convert log message which means pengine start.
3345+
3346+ MsgNo.29-1)
3347+ Aug 09 14:48:25 x3650a crmd: [5766]: info: start_subsystem: Starting sub-system "pengine"
3348+
3349+
3350+ "crmd[2465]: 2009/06/08_17:36:36 info: start_subsystem:
3351+ Starting sub-system "tengine""
3352+ '''
3353+ def crmd_subsystem_start(self, outputobj, logelm, lconvfrm):
3354+ try:
3355+ sysname = logelm.halogmsg.split()[-1].strip('"')
3356+ except:
3357+ return CONV_PARSE_ERROR
3358+ if self.is_empty(sysname):
3359+ return CONV_ITEM_EMPTY
3360+
3361+ convertedlog = ("Start \"%s\" process." % (sysname))
3362+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
3363+ return CONV_OK
3364+
3365+ '''
3366+ Convert log message which means pengine exits.
3367+
3368+ MsgNo.29-2)
3369+ Jul 20 15:48:33 x3650a crmd: [28373]: info: crmdManagedChildDied: Process pengine:[28390] exited (signal=0, exitcode=0)
3370+ '''
3371+ def crmd_subsystem_exit(self, outputobj, logelm, lconvfrm):
3372+ try:
3373+ wordList = logelm.halogmsg.split()
3374+ sys_and_pid = wordList[2].split(':')
3375+ sysname = sys_and_pid[0]
3376+ pid = sys_and_pid[1].lstrip('[').rstrip(']')
3377+ except:
3378+ return CONV_PARSE_ERROR
3379+ if self.is_empty(sysname, pid):
3380+ return CONV_ITEM_EMPTY
3381+
3382+ convertedlog = ("Stop \"%s\" process normally. (pid=%s)" % (sysname, pid))
3383+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
3384+ return CONV_OK
3385+
3386+ '''
3387+ Convert log message which means pengine killed by signal.
3388+
3389+ MsgNo.29-3)
3390+ Jul 20 15:48:33 x3650a crmd: [28373]: info: crmdManagedChildDied: Process pengine:[28390] exited (signal=9, exitcode=0)
3391+ '''
3392+ def crmd_subsystem_kill(self, outputobj, logelm, lconvfrm):
3393+ try:
3394+ wordList = logelm.halogmsg.split()
3395+ sys_and_pid = wordList[2].split(':')
3396+ sysname = sys_and_pid[0]
3397+ pid = sys_and_pid[1].lstrip('[').rstrip(']')
3398+ signum = wordList[4].split('=')[1].rstrip(',')
3399+ except:
3400+ return CONV_PARSE_ERROR
3401+ if self.is_empty(sysname, pid, signum):
3402+ return CONV_ITEM_EMPTY
3403+
3404+ convertedlog = ("Managed \"%s\" process terminated with signal %s. (pid=%s)" % (sysname, signum, pid))
3405+ outputobj.output_log(lconvfrm.loglevel, convertedlog)
3406+ return CONV_OK
3407+
3408+ ##########
3409+ # Others.
3410+ ##########
3411+ '''
3412+ Detect a request for getting DC node name and DC status.
3413+ For auto reset function.
3414+
3415+ MsgNo.27-1)
3416+ Jan 6 19:55:28 x3650a crmd: [28183]: info: handle_request: Current ping state: S_IDLE
3417+ '''
3418+ def detect_dcstat_req(self, outputobj, logelm, lconvfrm):
3419+ return CONV_OK
3420+
if __name__ == "__main__":
    # Default logger until configuration is read: INFO level, no file.
    pm_log = LogconvLog(LogconvLog.LOG_INFO, None)
    # Exit with the converter's own return code.
    sys.exit(LogConvert().main())
diff -r 000000000000 -r 2d98f677a5a2 pm_logconv.spec
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pm_logconv.spec Wed Oct 06 10:47:39 2010 +0900
@@ -0,0 +1,93 @@
1+########################################
2+# Derived definitions
3+########################################
4+%define name pm_logconv
5+%define cluster hb
6+%define version 1.0
7+%define release 1.el5
8+%define prefix /usr
9+%define instdir pm_logconv
10+%define ORGARCH %{name}-%{version}
11+#
12+#
13+Summary: Pacemaker and Heartbeat log converter
14+Name: %{name}-%{cluster}
15+Version: %{version}
16+Release: %{release}
17+Group: Applications
18+Source: %{name}-%{version}.tar.gz
19+License: GPL
20+Vendor: NIPPON TELEGRAPH AND TELEPHONE CORPORATION
21+BuildRoot: %{_tmppath}/%{name}-%{version}
22+BuildRequires: make
23+BuildArch: noarch
24+Requires: python >= 2.4, python < 3.0
25+Requires: pacemaker >= 1.0.9
26+Requires: heartbeat >= 3.0.3
27+
28+########################################
29+%description
30+Log message converter for Pacemaker and Heartbeat.
31+Supported versions:
32+ Pacemaker : stable-1.0 (1.0.9 or later)
33+ Heartbeat : 3.0.3
34+
35+########################################
36+%prep
37+########################################
38+rm -rf $RPM_BUILD_ROOT
39+
40+########################################
41+%setup -q
42+########################################
43+
44+########################################
45+%build
46+########################################
47+
48+########################################
49+%configure
50+########################################
51+
52+########################################
53+%pre
54+########################################
55+
56+########################################
57+%install
58+########################################
59+make DESTDIR=$RPM_BUILD_ROOT install
60+
61+########################################
62+%clean
63+########################################
64+if
65+ [ -n "${RPM_BUILD_ROOT}" -a "${RPM_BUILD_ROOT}" != "/" ]
66+then
67+ rm -rf $RPM_BUILD_ROOT
68+fi
69+rm -rf $RPM_BUILD_DIR/%{ORGARCH}
70+
71+########################################
72+%post
73+########################################
74+true
75+########################################
76+%preun
77+########################################
78+true
79+########################################
80+%postun
81+########################################
82+true
83+
84+########################################
85+%files
86+########################################
87+%defattr(-,root,root)
88+%dir /etc
89+%config /etc/pm_logconv.conf
90+%dir %{prefix}/share/pacemaker/%{instdir}
91+%{prefix}/share/pacemaker/%{instdir}/pm_logconv.py
92+%ghost %{prefix}/share/pacemaker/%{instdir}/pm_logconv.pyc
93+%ghost %{prefix}/share/pacemaker/%{instdir}/pm_logconv.pyo
Show on old repository browser