#Script to monitor running cluster #will write thread dumps and other cluster diagnostic info to a file, at regular intervals #Vernetto 2011-11-11 from java.io import File from java.io import FileOutputStream from java.io import FileInputStream from java.util import HashMap from java.util import HashSet from java.util import ArrayList from time import sleep import datetime import zlib import zipfile from os import * configFileProperties = None isConnected = false previousStdout = None #======================================================================================= # Utility function to load properties from a config file #======================================================================================= def loadProps(configPropFile): global configFileProperties propInputStream = FileInputStream(configPropFile) configFileProperties = Properties() configFileProperties.load(propInputStream) def appendToAlarmLog(alarmmessage): alarmsfile = configFileProperties.get('alarmsfile') alarmFile = open(alarmsfile, 'a') alarmFile.write(getNowTimestamp() + ' ' + alarmmessage + '\n') alarmFile.close() def getNowTimestamp(): now = datetime.datetime.now() nowtimestamp = now.strftime("%Y%m%dT%H%M%S") return nowtimestamp def monitorServer(): global isConnected fos = None previousStdout = theInterpreter.getOut() try: #initialize variables serverURL=configFileProperties.get('serverURL') serverName=configFileProperties.get('serverName') username=configFileProperties.get('username') password=configFileProperties.get('password') logFile=configFileProperties.get('logFile') aliveServerCountExpected=configFileProperties.get('aliveServerCountExpected') interval=configFileProperties.get('interval') #initialize timestamp to append to log filename nowtimestamp = getNowTimestamp() #save stdout handle logFileNameWithTimestamp = logFile + '.' 
+ nowtimestamp #set new stdout print "start the script" #easeSyntax() if (not isConnected) : connect(username, password, serverURL) isConnected = true serverRuntime() #redirect output to log file f = File(logFileNameWithTimestamp) fos = FileOutputStream(f) theInterpreter.setOut(fos) cd('/') print 'Health', cmo.getHealthState() cd('/ClusterRuntime/myCluster/') print 'AliveServerCount=', cmo.getAliveServerCount(), ' ServerNames', cmo.getServerNames() if (aliveServerCountExpected != cmo.getAliveServerCount()) : alarmmessage = 'MONITORALARM, we were expecting AliveServerCount ' + aliveServerCountExpected + ' and we have instead %d , see file %s' % (cmo.getAliveServerCount() , logFileNameWithTimestamp, ) print alarmmessage appendToAlarmLog(alarmmessage) cd('/ClusterRuntime/myCluster/UnicastMessaging/UnicastMessagingRuntime') print 'DiscoveredGroupLeaders=', cmo.getDiscoveredGroupLeaders(), ' Groups=', cmo.getGroups(), ' LocalGroupLeaderName=', cmo.getLocalGroupLeaderName(), ' RemoteGroupsDiscoveredCount=', cmo.getRemoteGroupsDiscoveredCount(), ' TotalGroupsCount=', cmo.getTotalGroupsCount() cd('/ServerChannelRuntimes/unicastChannel') print 'AcceptCount=' , cmo.getAcceptCount() , ' MessagesReceivedCount=' , cmo.getMessagesReceivedCount() , ' MessagesSentCount=' , cmo.getMessagesSentCount() scr = cmo.getServerConnectionRuntimes() #scr is an array of weblogic.server.channels.ServerConnectionRuntimeImpl$SerializableConnectionRuntime for myscr in scr: print "BytesReceivedCount=", myscr.getBytesReceivedCount(), " BytesSentCount=", myscr.getBytesSentCount(), " ConnectTime=", myscr.getConnectTime()," MessagesReceivedCount=", myscr.getMessagesReceivedCount(), " MessagesSentCount=", myscr.getMessagesSentCount() print "" threadDump() cd('/JVMRuntime/' + serverName) #this is valid for JRockit #print "HeapFreeCurrent=", cmo.getHeapFreeCurrent(), " TotalGarbageCollectionTime", cmo.getTotalGarbageCollectionTime(), " TotalNumberOfThreads=", cmo.getTotalNumberOfThreads() #this is valid 
for JRockit print "HeapFreeCurrent=", cmo.getHeapFreeCurrent(), ' HeapSizeCurrent=', cmo.getHeapSizeCurrent() cd('/ThreadPoolRuntime/ThreadPoolRuntime') print 'HoggingThreadCount=', cmo.getHoggingThreadCount(), ' PendingUserRequestCount', cmo.getPendingUserRequestCount(), ' StandbyThreadCount' , cmo.getStandbyThreadCount() print 'CompletedRequestCount=', cmo.getCompletedRequestCount(), ' ExecuteThreadIdleCount=', cmo.getExecuteThreadIdleCount(), ' ExecuteThreadTotalCount=', cmo.getExecuteThreadTotalCount() #restore stdout theInterpreter.setOut(previousStdout) fos.close() #now zip the report zipfileLog = zipfile.ZipFile(logFileNameWithTimestamp + '.zip', 'w') zipfileLog.write(logFileNameWithTimestamp, compress_type=zipfile.ZIP_DEFLATED) zipfileLog.close() os.remove(logFileNameWithTimestamp) except: isConnected = false theInterpreter.setOut(previousStdout) if (fos != None) : fos.close() print "Unexpected error:", sys.exc_info()[0] raise # monitor script init try: # sys.argv[1] is the config properties file configFile = sys.argv[1] print 'Loading config from :', configFile loadProps(configFile) interval = configFileProperties.getProperty('interval') while True: try: sleep(float(interval)) monitorServer() except: errorMessage = "ERROR_QUERYING_SERVER %s - %s - %s"% (sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2], ) appendToAlarmLog(errorMessage) dumpStack() except: print "Unexpected error: ", sys.exc_info()[0] dumpStack() raise
The property file, whose path is passed as the first argument to the script, contains:
serverURL=t3://pierrepc:7031
serverName=ms3
username=weblogic
password=welcome1
logFile=C:/pierre/clustermonitor/clustermonitorinfo3.log
alarmsfile=C:/pierre/clustermonitor/clustermonitoralarms3.log
aliveServerCountExpected=3
interval=10
No comments:
Post a Comment