#!/usr/bin/env python
#
# pollmachine
#
# Monitors build machines and notifies qmgr of changes

#
# pollmachine [options] [arch] ...
#   - update every machine in the mlist file for [arch]
#
# pollmachine [options] [arch/mach] ...
#   - update individual machine(s) for specified architecture
#
# options are:
#   -daemon : poll repeatedly

#
# TODO:
# XXX qmgr notification of new/removed machines
# XXX counter before declaring a machine as dead
# Declares a machine as online if it reports 0 data from infoseek?

# * Deal with machines that change OS/kernel version
#     - ACL list might change!
#          - take machine offline, update ACL/arch/etc, reboot, bring online

import sys, threading, socket
from time import sleep
import os, subprocess, logging

# Command-line parsing.
#
#   -daemon     keep polling, re-reading the machine lists every 180 seconds
#   arch        poll every machine listed in the mlist file for that arch
#   arch/mach   poll only the named machine(s) of that arch
arches = set()    # every architecture named on the command line
mlist = {}        # arch -> set of machines named explicitly as arch/mach
polldelay = 0     # seconds between polls; 0 means poll once and exit
for arg in sys.argv[1:]:
    if arg == "-daemon":
        polldelay = 180
        continue

    if "/" in arg:
        (arch, _sep, mach) = arg.partition("/")
        arches.add(arch)
        mlist.setdefault(arch, set()).add(mach)
    else:
        arches.add(arg)

# sys.argv always holds at least the program name, so the argument count
# cannot tell us that nothing was requested; check what was actually parsed.
# This also rejects a lone "-daemon", which would otherwise loop forever
# with nothing to poll.
if not arches:
    print("Usage: %s <arch> [<arch> ...]" % sys.argv[0])
    sys.exit(1)

# Root of the portbuild tree
pb = "/var/portbuild"

# set of machines for each arch
machines = {}
for arch in arches:
    machines[arch] = set()

# Mapping from machine names to monitor threads
pollthreads = {}

class MachinePoll(threading.Thread):
    """ Poll a machine regularly

    Each instance is a daemon thread that connects to the machine's status
    port, reads the "key=value" lines it reports into self.vars, and then:

      - tracks online/offline transitions (an online machine stays online
        through its first two consecutive connection timeouts),
      - runs dosetupnode when the report is empty or contains bad input,
      - cleans up build environments the machine reports whose .active
        marker no longer exists under /var/portbuild,
      - writes the reported "jobs load" pair to the arch's loads directory.

    Setting the shutdown attribute to True makes the thread exit at its
    next wakeup.
    """

    mach = None      # Which machine name to poll
    arch = None      # Which arch is this assigned to

    # Which host/port to poll for this machine status (might be SSH
    # tunnel endpoint)
    host = None
    port = 414

    timeout = None   # Seconds to sleep between polls; false means poll once
    shutdown = False # Exit at next poll wakeup

    # State variables tracked
    online = False

    # Dictionary of variables reported by the client
    vars = None

    def __init__(self, mach, arch, timeout, host, port):
        super(MachinePoll, self).__init__()
        self.mach = mach
        self.arch = arch
        self.timeout = timeout
        self.host = host
        self.port = port

        # How many times the connection timed out since last success
        self.timeouts = 0

        self.vars = {}

        self.setDaemon(True)

    def run(self):
        """ Poll until shutdown is set, or only once if timeout is false """
        while not self.shutdown:
            try:
                self.poll()
            except Exception:
                # An unexpected error must not silently end monitoring of
                # this machine: record it and carry on with the next poll.
                logging.exception("[%s] Unhandled error while polling"
                                  % self.mach)

            if not self.timeout:
                break
            sleep(self.timeout)

    def poll(self):
        """ Poll the status of this machine """

        (nowonline, lines) = self._fetch_report()

        if nowonline != self.online:
            logging.info("[%s] Now %s" % (self.mach, "online" if nowonline else "OFFLINE"))
            self.online = nowonline
            # XXX inform qmgr of state change

        # A machine that accepts the connection but reports nothing at all
        # is missing its reportload script.  (self.timeouts is non-zero when
        # "online" was merely carried over from an earlier poll.)
        dosetup = bool(self.online and not lines and not self.timeouts)

        # Bad input also means the client needs setting up
        if self._update_vars(lines):
            dosetup = True

        self._clean_stale_builds()

        if dosetup:
            logging.info("[%s] Setting up machine" % (self.mach))
            (err, out) = self.setup("-", "-")
            if err:
                logging.info("[%s] Error from setup" % (self.mach))
                self._log_output(out)
            else:
                logging.info("[%s] Setup complete" % (self.mach))

        # Validate that arch has not changed (e.g. i386 -> amd64)
        reported = self.vars.get('arch')
        if reported is not None and self.arch != reported:
            logging.info("[%s] Unexpected arch: %s -> %s" % \
                (self.mach, self.arch, reported))

        self._record_load()

    def _fetch_report(self):
        """ Read the machine's status report.

        Returns (nowonline, lines): whether the machine counts as online
        after this attempt, and the non-blank report lines (empty when the
        connection failed or the machine sent nothing).  Updates
        self.timeouts as a side effect.
        """
        nowonline = False
        lines = []
        try:
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        except socket.error:
            return (nowonline, lines)

        try:
            s.settimeout(60)
            s.connect((self.host, self.port))

            # Read until EOF, but give up once 64k has been received
            chunks = []
            received = 0
            while received < 65536:
                chunk = s.recv(8192)
                if not chunk:
                    break
                chunks.append(chunk)
                received += len(chunk)

            nowonline = True
            self.timeouts = 0
            lines = self._split_report("".join(chunks))
        except socket.timeout:
            if self.online:
                logging.info("[%s] Connection timeout" % self.mach)
            self.timeouts += 1
            # Do not declare the machine dead until the third timeout in a row
            if self.timeouts < 3:
                nowonline = self.online
        except socket.error:
            # Refused, unreachable, unresolvable, ...: machine is offline
            pass
        finally:
            try:
                s.close()
            except socket.error:
                pass

        return (nowonline, lines)

    @staticmethod
    def _split_report(data):
        """ Split raw report text into lines, dropping trailing whitespace
        and blank lines, so that an empty report yields an empty list """
        stripped = (raw.rstrip() for raw in data.split("\n"))
        return [line for line in stripped if line]

    def _update_vars(self, lines):
        """ Store the "key=value" pairs from lines in self.vars.

        Lines without '=' or with an empty key are logged and skipped.
        Returns True if any such line was seen.
        """
        badinput = False
        for line in lines:
            (key, sep, value) = line.partition('=')
            if sep != '=' or not key:
                logging.info("[%s] Bad input: %s" % (self.mach, line))
                badinput = True
                # Do not record garbage (e.g. a shell error message) as a
                # variable
                continue
            if self.vars.get(key, "") != value:
                self.vars[key] = value
                # XXX update qmgr
        return badinput

    def _clean_stale_builds(self):
        """ Run a cleanup for every reported build environment whose
        .active marker file does not exist """
        for env in self.vars.get('buildenvs', "").split():
            try:
                (arch, branch, buildid) = env.split("/")
            except ValueError:
                # Not of the form arch/branch/buildid
                logging.info("[%s] Bad buildenv: %s" % (self.mach, env))
                continue
            active = "/var/portbuild/%s/%s/builds/%s/.active" % \
                (arch, branch, buildid)
            if os.path.exists(active):
                continue
            # Clean up a stale buildenv
            logging.info("[%s] Cleaning up stale build: %s" % (self.mach, env))
            (err, out) = self.setup(branch, buildid, "-nocopy -full")
            if err:
                logging.info("[%s] Error from cleanup" % (self.mach))
            self._log_output(out)

    def _log_output(self, out):
        """ Log each non-empty line of command output """
        for l in out.split("\n"):
            if l == "":
                continue
            logging.info("[%s] %s" % (self.mach, l))

    def _record_load(self):
        """ Record current system load in <pb>/<arch>/loads/<mach>, where pb
        is the module-level portbuild root """
        try:
            f = open("%s/%s/loads/%s" % (pb, self.arch, self.mach), "w")
        except EnvironmentError:
            return
        try:
            f.write("%s %s\n" % (self.vars['jobs'], self.vars['load']))
        except (KeyError, EnvironmentError):
            # Nothing reported yet, or the write failed: leave the file empty
            pass
        finally:
            f.close()

    @staticmethod
    def _shquote(word):
        """ Quote word so that a shell reads it back as one literal argument """
        return "'" + str(word).replace("'", "'\\''") + "'"

    def _setup_command(self, branch, buildid, args = ""):
        """ Build the argument vector that runs dosetupnode as the arch's
        ports user.

        branch and buildid may originate from the client's report, so every
        substituted value is quoted; args is a trusted option string that is
        deliberately left unquoted so it splits into separate options.
        """
        words = [self._shquote(w)
                 for w in (self.arch, branch, buildid, self.mach)]
        inner = "/var/portbuild/scripts/dosetupnode " + " ".join(words)
        if args:
            inner += " " + args
        return ["su", "ports-%s" % self.arch, "-c", inner]

    def setup(self, branch, buildid, args = ""):
        """ Run dosetupnode for this machine.

        Returns (err, out): the command's exit status and its combined
        stdout/stderr output.
        """
        try:
            child = subprocess.Popen(self._setup_command(branch, buildid, args),
                                     stderr = subprocess.STDOUT,
                                     stdout = subprocess.PIPE)
        except OSError:
            return (127, "cannot execute su: %s" % sys.exc_info()[1])
        # communicate() drains the pipe while waiting; calling wait() first
        # would deadlock once the child fills the pipe buffer.
        out = child.communicate()[0]
        return (child.returncode, out)

# Log to the pollmachine log file (truncated at startup) ...
logging.basicConfig(level=logging.INFO,
                    format='[%(asctime)s] %(message)s',
                    datefmt='%d %b %Y %H:%M:%S',
                    filename='/var/log/pollmachine.log', filemode='w')

# ... and mirror every message on the console in the same format.
log_console = logging.StreamHandler()
log_console.setLevel(logging.INFO)
formatter = logging.Formatter('[%(asctime)s] %(message)s',
                              datefmt = '%d %b %Y %H:%M:%S')
log_console.setFormatter(formatter)
logging.getLogger('').addHandler(log_console)

# Main loop: work out the current machine list of every arch, stop the
# poll threads of machines that disappeared and start one for each new
# machine.  Runs once, or every polldelay seconds in daemon mode.
while True:
    for arch in arches:
        try:
            # Machines named explicitly on the command line
            now = mlist[arch]
        except KeyError:
            # Otherwise use the arch's mlist file, one machine per line
            mlistfile = "%s/%s/mlist" % (pb, arch)
            try:
                f = open(mlistfile, "r")
            except EnvironmentError:
                logging.error("Cannot read machine list %s" % mlistfile)
                raise
            try:
                names = [line.rstrip() for line in f.readlines()]
            finally:
                f.close()
            # A blank line is not a machine
            now = set(name for name in names if name)

        gone = machines[arch].difference(now)
        new = now.difference(machines[arch])

        machines[arch] = now

        for mach in gone:
            logging.info("Removing machine %s/%s" % (arch, mach))
            # XXX disable from qmgr
            poller = pollthreads.pop(mach, None)
            if poller is not None:
                poller.shutdown = True

        for mach in new:
            logging.info("Adding machine %s/%s" % (arch, mach))
            # XXX set up qmgr

            # Let the shell source the arch-wide and per-machine config
            # files and print the status endpoint they define.
            pc = "%s/%s/portbuild.conf" % (pb, arch)
            pch = "%s/%s/portbuild.%s" % (pb, arch, mach)
            cmd = "test -f %s && . %s; test -f %s && . %s; echo $infoseek_host; echo $infoseek_port" % (pc, pc, pch, pch)
            config = subprocess.Popen(cmd, shell = True,
                                      stdout = subprocess.PIPE)
            # communicate() also waits for the child, so it does not linger
            # as a zombie for the lifetime of the daemon.
            settings = config.communicate()[0].split("\n")

            host = settings[0].rstrip()
            if not host:
                host = mach
            try:
                port = int(settings[1])
            except (IndexError, ValueError):
                port = 414

            pollthreads[mach] = MachinePoll(mach, arch, polldelay, host, port)
            pollthreads[mach].start()

    if not polldelay:
        # The poll threads are daemon threads: without waiting for them the
        # interpreter would exit before a one-shot poll has finished.
        for poller in pollthreads.values():
            poller.join()
        break

    sleep(polldelay)
