#!/usr/bin/env python2.3 # # (C) 2001 by Argonne National Laboratory. # See COPYRIGHT in top-level directory. # """ mpdcheck This script is a work in progress and may change frequently as we work with users and gain additional insights into how to improve it. This script prints useful information about the host on which it runs. It is here to help us help users detect problems with configurations of their computers. For example, some computers are configured to think of themselves simply as 'localhost' with 127.0.0.1 as the IP address. This might present problems if a process on that computer wishes to identify itself by host and port to a process on another computer. The process on the other computer would try to contact 'localhost'. If you are having problems running parallel jobs via mpd on one or more hosts, you might try running this script once on each of those hosts. Any output with *** at the beginning indicates a potential problem that you may have to resolve before being able to run parallel jobs via mpd. For help: mpdcheck -h (or --help) prints this message In the following modes, the -v (verbose) option provides info about what mpdcheck is doing; the -l (long messages) option causes long informational messages to print in situations where problems are spotted. The three major modes of operation for this program are: mpdcheck looks for config problems on 'this' host; prints as nec mpdcheck -pc print config info about 'this' host, e.g. contents of /etc/hosts, etc. mpdcheck -f some_file [-ssh] prints info about 'this' host and locatability info about the ones listed in some_file as well (note the file might be mpd.hosts); the -ssh option can be used in conjunction with the -f option to cause ssh tests to be run to each remote host mpdcheck -s runs this program as a server on one host mpdcheck -c server_host server_port runs a client on another (or same) host; connects to the specifed host/port where you previously started the server """ from time import ctime __author__ = "Ralph Butler and Rusty Lusk" __date__ = ctime() __version__ = "$Revision: 1.19 $" __credits__ = "" import re from sys import argv, exit, stdout from os import path, kill, system from signal import SIGKILL from socket import gethostname, getfqdn, gethostbyname_ex, gethostbyaddr, socket from popen2 import Popen3 from select import select, error from commands import getoutput if __name__ == '__main__': # so I can be imported by pydoc do_ssh = 0 fullDirName = path.abspath(path.split(argv[0])[0]) # normalize hostsFromFile = [] verbose = 0 long_messages = 0 argidx = 1 while argidx < len(argv): if argv[argidx] == '-h' or argv[argidx] == '--help': print __doc__ exit(0) elif argv[argidx] == '-s': lsock = socket() lsock.bind(('',0)) # anonymous port lsock.listen(5) print "server listening at INADDR_ANY on: %s %s" % (gethostname(),lsock.getsockname()[1]) stdout.flush() (tsock,taddr) = lsock.accept() print "server has conn on %s from %s" % (tsock,taddr) msg = tsock.recv(64) if not msg: print "*** server failed to recv msg from client" else: print "server successfully recvd msg from client: %s" % (msg) tsock.sendall('ack_from_server_to_client') tsock.close() lsock.close() exit(0) elif argv[argidx] == '-c': sock = socket() sock.connect((argv[argidx+1],int(argv[argidx+2]))) # note double parens sock.sendall('hello_from_client_to_server') msg = sock.recv(64) if not msg: print "*** client failed to recv ack from server" else: print "client successfully recvd ack from server: %s" % (msg) stdout.flush() sock.close() exit(0) elif argv[argidx] == '-pc': print "--- print results of: gethostbyname_ex(gethostname())" print gethostbyname_ex(gethostname()) print "--- try to run /bin/hostname" linesAsStr = getoutput("/bin/hostname") print linesAsStr print "--- try to run uname -a" linesAsStr = getoutput("/bin/uname -a") print linesAsStr print "--- try to print /etc/hosts" linesAsStr = getoutput("/bin/cat /etc/hosts") print linesAsStr print "--- try to print /etc/resolv.conf" linesAsStr = getoutput("/bin/cat /etc/resolv.conf") print linesAsStr print "--- try to run /sbin/ifconfig -a" linesAsStr = getoutput("/sbin/ifconfig -a") print linesAsStr print "--- try to print /etc/nsswitch.conf" linesAsStr = getoutput("/bin/cat /etc/nsswitch.conf") print linesAsStr exit(0) elif argv[argidx] == '-v': verbose = 1 argidx += 1 elif argv[argidx] == '-l': long_messages = 1 argidx += 1 elif argv[argidx] == '-f': try: hostsFile = open(argv[argidx+1]) except: print 'unable to open file ', argv[argidx+1] exit(-1) for line in hostsFile: line = line.rstrip() if not line or line[0] == '#': continue splitLine = re.split(r'\s+',line) host = splitLine[0] if ':' in host: (host,ncpus) = host.split(':') hostsFromFile.append(host) argidx += 2 elif argv[argidx] == '-ssh': do_ssh = 1 argidx += 1 else: print 'unrecognized arg:', argv[argidx] exit(0) # See if we can do gethostXXX, etc. for this host if verbose: print 'obtaining hostname via gethostname and getfqdn' uqhn1 = gethostname() fqhn1 = getfqdn() if verbose: print "gethostname gives ", uqhn1 print "getfqdn gives ", fqhn1 if verbose: print 'checking out unqualified hostname; make sure is not "localhost", etc.' if uqhn1.startswith('localhost'): if long_messages: msg = """ ********** The unqualified hostname seems to be localhost. This generally means that the machine's hostname is not set. You may change it by using the 'hostname' command, e.g.: hostname mybox1 However, this will not remain after a reboot. To do this, you will need to consult the operating system's documentation. On Debian Linux systems, this can be done by: echo "mybox1" > /etc/hostname ********** """ else: msg = "*** the uq hostname seems to be localhost" print msg.strip().replace(' ','') elif uqhn1 == '': if long_messages: msg = """ ********** The unqualified hostname seems to be blank. This generally means that the machine's hostname is not set. You may change it by using the 'hostname' command, e.g.: hostname mybox1 However, this will not remain after a reboot. To do this, you will need to consult the operating system's documentation. On Debian Linux systems, this can be done by: echo "mybox1" > /etc/hostname ********** """ else: msg = "*** the uq hostname seems to be localhost" print msg.replace(' ','') if verbose: print 'checking out qualified hostname; make sure is not "localhost", etc.' if fqhn1.startswith('localhost'): if long_messages: msg = """ ********** Your fully qualified hostname seems to be set to 'localhost'. This generally means that your machine's /etc/hosts file contains a line similar to this: 127.0.0.1 mybox1 localhost.localdomain localhost You probably want to remove your hostname from this line and place it on a line by itself with your ipaddress, like this: $ipaddr mybox1 ********** """ else: msg = "*** the fq hostname seems to be localhost" print msg.rstrip().replace(' ','') elif fqhn1 == '': if long_messages: msg = """ ********** Your fully qualified hostname seems to be blank. ********** """ else: msg = "*** the fq hostname is blank" print msg.replace(' ','') if verbose: print 'obtain IP addrs via qualified and unqualified hostnames;', print ' make sure other than 127.0.0.1' uipaddr1 = 0 try: ghbnu = gethostbyname_ex(uqhn1) if verbose: print "gethostbyname_ex: ", ghbnu uipaddr1 = ghbnu[2][0] if uipaddr1.startswith('127'): if long_messages: msg = """ ********** Your unqualified hostname resolves to 127.0.0.1, which is the IP address reserved for localhost. This likely means that you have a line similar to this one in your /etc/hosts file: 127.0.0.1 $uqhn This should perhaps be changed to the following: 127.0.0.1 localhost.localdomain localhost ********** """ else: msg = "*** first ipaddr for this host (via %s) is: %s" % (uqhn1,uipaddr1) print msg.replace(' ','') try: ghbau = gethostbyaddr(uipaddr1) except: print "*** gethostbyaddr failed for this hosts's IP %s" % (uipaddr1) except: if long_messages: msg = """ ********** The system call gethostbyname(3) failed to resolve your unqualified hostname, or $uqhn. This can be caused by missing info from your /etc/hosts file or your system not having correctly configured name resolvers, or by your IP address not existing in resolution services. If you run DNS, you may wish to make sure that your DNS server has the correct forward A set up for yout machine's hostname. If you are not using DNS and are only using hosts files, please check that a line similar to the one below exists in your /etc/hosts file: $ipaddr $uqdn If you plan to use DNS but you are not sure that it is correctly configured, please check that the file /etc/resolv.conf contains entries similar to the following: nameserver 1.2.3.4 where 1.2.3.4 is an actual IP of one of your nameservers. ********** """ else: msg = "*** gethostbyname_ex failed for this host %s" % (uqhn1) print msg.replace(' ','') fipaddr1 = 0 try: ghbnf = gethostbyname_ex(fqhn1) if verbose: print "gethostbyname_ex: ", ghbnf fipaddr1 = ghbnf[2][0] if fipaddr1.startswith('127'): msg = """ ********** Your fully qualified hostname resolves to 127.0.0.1, which is the IP address reserved for localhost. This likely means that you have a line similar to this one in your /etc/hosts file: 127.0.0.1 $fqhn This should be perhaps changed to the following: 127.0.0.1 localhost.localdomain localhost ********** """ try: ghbaf = gethostbyaddr(fipaddr1) except: print "*** gethostbyaddr failed for this hosts's IP %s" % (uipaddr1) except: if long_messages: msg = """ ********** The system call gethostbyname(3) failed to resolve your fully qualified hostname, or $fqhn. This can be caused by missing info from your /etc/hosts file or your system not having correctly configured name resolvers, or by your IP address not existing in resolution services. If you run DNS, please check and make sure that your DNS server has the correct forward A record set up for yout machine's hostname. If you are not using DNS and are only using hosts files, please check that a line similar to the one below exists in your /etc/hosts file: $ipaddr $fqhn If you intend to use DNS but you are not sure that it is correctly configured, please check that the file /etc/resolv.conf contains entries similar to the following: nameserver 1.2.3.4 where 1.2.3.4 is an actual IP of one of your nameservers. ********** """ else: msg = "*** gethostbyname_ex failed for host %s" % (fqhn1) print msg.replace(' ','') if verbose: print 'checking that IP addrs resolve to same host' if uipaddr1 and fipaddr1 and uipaddr1 != fipaddr1: msg = """ ********** Your fully qualified and unqualified names do not resolve to the same IP. This likely means that your DNS domain name is not set correctly. This might be fixed by adding a line similar to the following to your /etc/hosts: $ipaddr $fqhn $uqdn ********** """ print msg.replace(' ','') if verbose: print 'now do some gethostbyaddr and gethostbyname_ex for machines in hosts file' # See if we can do gethostXXX, etc. for hosts in hostsFromFile for host in hostsFromFile: uqhn2 = host fqhn2 = getfqdn(uqhn2) uipaddr2 = 0 if verbose: print 'checking gethostbyXXX for unqualified %s' % (uqhn2) try: ghbnu = gethostbyname_ex(uqhn2) if verbose: print "gethostbyname_ex: ", ghbnu uipaddr2 = ghbnu[2][0] try: ghbau = gethostbyaddr(uipaddr2) except: print "*** gethostbyaddr failed for remote hosts's IP %s" % (fipaddr2) except: print "*** gethostbyname_ex failed for host %s" % (fqhn2) if verbose: print 'checking gethostbyXXX for qualified %s' % (uqhn2) try: ghbnf = gethostbyname_ex(fqhn2) if verbose: print "gethostbyname_ex: ", ghbnf fipaddr2 = ghbnf[2][0] if uipaddr2 and fipaddr2 != uipaddr2: print "*** ipaddr via uqn (%s) does not match via fqn (%s)" % (uipaddr2,fipaddr2) try: ghbaf = gethostbyaddr(fipaddr2) except: print "*** gethostbyaddr failed for remote hosts's IP %s" % (fipaddr2) except: print "*** gethostbyname_ex failed for host %s" % (fqhn2) # see if we can run /bin/date on remote hosts if not do_ssh: exit(0) for host in hostsFromFile: cmd = "ssh %s -x -n /bin/echo hello" % (host) if verbose: print 'trying: %s' % (cmd) runner = Popen3(cmd,1,0) runout = runner.fromchild runerr = runner.childerr runin = runner.tochild runpid = runner.pid try: (readyFDs,unused1,unused2) = select([runout],[],[],9) except Exception, data: print 'select 1 error: %s ; %s' % ( data.__class__, data) exit(-1) if len(readyFDs) == 0: print '** ssh timed out to %s' % (host) line = '' failed = 0 if runout in readyFDs: line = runout.readline() if not line.startswith('hello'): failed = 1 else: failed = 1 if failed: print '** ssh failed to %s' % (host) print '** here is the output:' if line: print line, done = 0 fds = [runout,runerr] while not done: try: (readyFDs,unused1,unused2) = select(fds,[],[],1) except Exception, data: print 'select 2 error: %s ; %s' % ( data.__class__, data) exit(-1) if runout in readyFDs: line = runout.readline() if line: print line, else: fds.remove(runout) elif runerr in readyFDs: line = runerr.readline() if line: print line, else: fds.remove(runerr) else: done = 1 try: kill(runpid,SIGKILL) runout.close() runerr.close() runin.close() except: pass if failed: exit(-1) # see if we can run mpdcheck on remote hosts for host in hostsFromFile: cmd1 = path.join(fullDirName,'mpdcheck.py') + ' -s' if verbose: print 'starting server: %s' % (cmd1) runner1 = Popen3(cmd1,1,0) runout1 = runner1.fromchild runerr1 = runner1.childerr runin1 = runner1.tochild runpid1 = runner1.pid try: (readyFDs,unused1,unused2) = select([runout1],[],[],9) except Exception, data: print 'select 3 error: %s ; %s' % ( data.__class__, data) exit(-1) if len(readyFDs) == 0: print '** timed out waiting for local server to produce output' line = '' failed = 0 port = 0 if runout1 in readyFDs: line = runout1.readline() if line.startswith('server listening at '): port = line.rstrip().split(' ')[-1] else: failed = 1 else: failed = 1 if failed: print 'could not start mpdcheck server' print 'here is the output:' if line: print line, done = 0 fds = [runout1,runerr1] while not done: try: (readyFDs,unused1,unused2) = select(fds,[],[],1) except Exception, data: print 'select 4 error: %s ; %s' % ( data.__class__, data) exit(-1) if runout in readyFDs: line = runout.readline() if line: print line, else: fds.remove(runout) elif runerr in readyFDs: line = runerr.readline() if line: print line, else: fds.remove(runerr) else: done = 1 if failed: try: kill(runpid1,SIGKILL) except: pass exit(-1) cmd2 = "ssh %s -x -n %s%smpdcheck.py -c %s %s" % (host,fullDirName,path.sep,fqhn1,port) if verbose: print 'starting client: %s' % (cmd2) runner2 = Popen3(cmd2,1,0) runout2 = runner2.fromchild runerr2 = runner2.childerr runin2 = runner2.tochild runpid2 = runner2.pid try: (readyFDs,unused1,unused2) = select([runout2],[],[],9) except Exception, data: print 'select 3 error: %s ; %s' % ( data.__class__, data) exit(-1) if len(readyFDs) == 0: print '** timed out waiting for client on %s to produce output' % (host) line = '' failed = 0 port = 0 if runout2 in readyFDs: line = runout2.readline() if not line.startswith('client successfully recvd'): failed = 1 else: failed = 1 if failed: print 'client on %s failed to access the server' % (host) print 'here is the output:' if line: print line, done = 0 fds = [runout2,runerr2] while not done: try: (readyFDs,unused1,unused2) = select(fds,[],[],1) except Exception, data: print 'select 4 error: %s ; %s' % ( data.__class__, data) exit(-1) if runout2 in readyFDs: line = runout2.readline() if line: print line, else: fds.remove(runout2) elif runerr2 in readyFDs: line = runerr2.readline() if line: print line, else: fds.remove(runerr2) else: done = 1 try: kill(runpid2,SIGKILL) except: pass if failed: try: kill(runpid1,SIGKILL) except: pass exit(-1)